Add an "auto" character set to HTMLDOC: "--charset auto" (or a "CHARSET=auto"
settings line) takes the character set from the document's META data after
the document has been read.

Indentation of the unchanged context lines was reconstructed; if it does not
match your tree, apply from the htmldoc-1.8.23 directory with "patch -p1 -l".

diff -ur htmldoc-1.8.23.orig/htmldoc/htmldoc.cxx htmldoc-1.8.23/htmldoc/htmldoc.cxx
--- htmldoc-1.8.23.orig/htmldoc/htmldoc.cxx	2002-10-25 16:37:54.000000000 +0300
+++ htmldoc-1.8.23/htmldoc/htmldoc.cxx	2003-08-20 21:14:41.000000000 +0300
@@ -83,6 +83,30 @@
 static void term_handler(int signum);
 static void usage(void);
 
+/*
+ * Local variables...
+ */
+
+static int autocharset = 0;            /* Non-zero = take charset from META data */
+
+
+/*
+ * 'set_charset()' - Apply a character set option value.
+ *
+ * "auto" defers the choice to the document's META data once it has been
+ * read; any other name is set immediately.  The most recent call wins, so
+ * an explicit name given after "auto" is not overridden later.
+ */
+
+static void
+set_charset(const char *cs)            /* I - Character set name or "auto" */
+{
+  autocharset = strcmp(cs, "auto") == 0;
+
+  if (!autocharset)
+    htmlSetCharSet(cs);
+}
+
 
 /*
  * 'main()' - Main entry for HTMLDOC.
@@ -247,7 +271,7 @@
     {
       i ++;
       if (i < argc)
-        htmlSetCharSet(argv[i]);
+        set_charset(argv[i]);
       else
         usage();
     }
@@ -1034,6 +1058,9 @@
 
   while (document->prev != NULL)
     document = document->prev;
+
+  if (autocharset)
+    htmlSetMetaCharSet(document);
 
   htmlDebugStats("Document Tree", document);
 
@@ -1269,7 +1296,7 @@
         else if (strncasecmp(line, "XRXCOMMENTS=", 12) == 0)
           XRXComments = atoi(line + 12);
         else if (strncasecmp(line, "CHARSET=", 8) == 0)
-          htmlSetCharSet(line + 8);
+          set_charset(line + 8);
         else if (strncasecmp(line, "PAGEMODE=", 9) == 0)
           PDFPageMode = atoi(line + 9);
         else if (strncasecmp(line, "PAGELAYOUT=", 11) == 0)
@@ -1994,7 +2021,7 @@
       }
     }
     else if (strcmp(temp, "--charset") == 0)
-      htmlSetCharSet(temp2);
+      set_charset(temp2);
     else if (strcmp(temp, "--pagemode") == 0)
     {
       for (i = 0; i < (int)(sizeof(PDFModes) / sizeof(PDFModes[0])); i ++)
diff -ur htmldoc-1.8.23.orig/htmldoc/html.h htmldoc-1.8.23/htmldoc/html.h
--- htmldoc-1.8.23.orig/htmldoc/html.h	2002-07-27 06:41:31.000000000 +0300
+++ htmldoc-1.8.23/htmldoc/html.h	2003-08-20 20:55:23.000000000 +0300
@@ -309,6 +309,7 @@
 
 extern void htmlSetBaseSize(float p, float s);
 extern void htmlSetCharSet(const char *cs);
+extern int htmlSetMetaCharSet(tree_t *tree);
 extern void htmlSetTextColor(uchar *color);
 
 extern void htmlDebugStats(const char *title, tree_t *t);
diff -ur htmldoc-1.8.23.orig/htmldoc/htmllib.cxx htmldoc-1.8.23/htmldoc/htmllib.cxx
--- htmldoc-1.8.23.orig/htmldoc/htmllib.cxx	2002-10-11 17:23:28.000000000 +0300
+++ htmldoc-1.8.23/htmldoc/htmllib.cxx	2003-08-20 21:56:21.000000000 +0300
@@ -1854,6 +1854,111 @@
   return (NULL);
 }
 
+
+/*
+ * 'set_meta_charset()' - Lowercase a charset name and make it current.
+ *
+ * Names too long for the local buffer are truncated instead of overrunning
+ * it, and an empty name is ignored.
+ */
+
+static int                              /* O - 1 if set, 0 if name was empty */
+set_meta_charset(const uchar *name,     /* I - Start of charset name */
+                 int         token)     /* I - 1 = stop at first char not in [A-Za-z0-9_-] */
+{
+  uchar charset[256],                   /* Lowercased copy of the name */
+        *bufptr;                        /* Pointer into charset */
+
+
+  for (bufptr = charset;
+       *name != '\0' && bufptr < (charset + sizeof(charset) - 1);
+       name ++)
+  {
+    if (token &&
+        !((*name >= 'a' && *name <= 'z') || (*name >= 'A' && *name <= 'Z') ||
+          (*name >= '0' && *name <= '9') || *name == '_' || *name == '-'))
+      break;
+
+   /*
+    * tolower() needs an unsigned char value; casting to (char) first could
+    * turn bytes >= 0x80 negative, so pass the byte as-is (assumes uchar is
+    * an unsigned char type)...
+    */
+
+    *bufptr++ = (uchar)tolower(*name);
+  }
+
+  *bufptr = '\0';
+
+  if (charset[0] == '\0')
+    return (0);
+
+  htmlSetCharSet((char *)charset);
+
+  return (1);
+}
+
+
+/*
+ * 'htmlSetMetaCharSet()' - Set the character set from a document's META data.
+ *
+ * Walks the tree depth-first and uses the first META element that has either
+ * HTTP-EQUIV="Content-Type" with a "charset=" parameter in CONTENT, or a
+ * CHARSET attribute.
+ */
+
+int                                     /* O - 1 if a charset was set, 0 otherwise */
+htmlSetMetaCharSet(tree_t *tree)        /* I - Document tree */
+{
+  uchar *value,                         /* HTTP-EQUIV or CHARSET attribute value */
+        *content;                       /* CONTENT attribute value */
+
+
+  for (; tree != NULL; tree = tree->next)
+  {
+    if (tree->markup == MARKUP_META)
+    {
+     /*
+      * <META HTTP-EQUIV="Content-Type" CONTENT="...; charset=NAME">
+      */
+
+      if ((value = htmlGetVariable(tree, (uchar *)"HTTP-EQUIV")) != NULL &&
+          (content = htmlGetVariable(tree, (uchar *)"CONTENT")) != NULL &&
+          strcasecmp((char *)value, "Content-Type") == 0)
+      {
+       /*
+        * Parameter names are case-insensitive, so look for "charset=" in
+        * any letter case...
+        */
+
+        while (*content != '\0' &&
+               strncasecmp((char *)content, "charset=", 8) != 0)
+          content ++;
+
+        if (*content != '\0' && set_meta_charset(content + 8, 1))
+          return (1);
+      }
+
+     /*
+      * <META CHARSET="NAME">
+      */
+
+      if ((value = htmlGetVariable(tree, (uchar *)"CHARSET")) != NULL &&
+          set_meta_charset(value, 0))
+        return (1);
+    }
+
+   /*
+    * Check child entries...
+    */
+
+    if (tree->child != NULL && htmlSetMetaCharSet(tree->child))
+      return (1);
+  }
+
+  return (0);
+}
+
 
 /*
  * 'htmlGetStyle()' - Get a style value from a node's STYLE attribute.