/*
 * Review notes (reader aid only; everything after this comment is the patch text as received).
 *
 * This is a unified diff against ext/standard/html.c, base revision 1.111.2.2, whose line
 * breaks have been collapsed.  What the visible hunks do:
 *   - hunk -379: html_entity_map becomes a typedef holding one charset plus an entity_list[16]
 *     array of (basechar, endchar, table) ranges; every charset gets a single row in entity_map[]
 *     and its ranges end with HTML_ENTITY_END (table == NULL).
 *   - hunks -883, -1077 and -1322: the row for the active charset is looked up once into
 *     charset_entity_map, and the range loops walk HTML_ENTITY_LIST(charset_entity_map).
 *   - hunks -1077 to -1142: php_escape_html_entities allocates its output only once an entity is
 *     emitted, copies the unescaped runs with memcpy, and returns old itself (with
 *     *newlen = oldlen) when nothing was emitted.
 *   - hunk -1179: the caller takes a z parameter, converts it to a string when it is not
 *     IS_STRING, and passes 1 instead of 0 as the last RETVAL_STRINGL argument when
 *     replaced == str (presumably the duplicate flag - confirm).
 *
 * NOTE(review): the zend_parse_parameters calls show a stray double quote followed by e_style
 * where the address of quote_style is presumably meant, and the ampersand branch assigns a bare
 * ampersand as the entity text.  Both look like HTML-entity decoding applied to the patch text
 * itself - confirm against the original patch before applying.
 * NOTE(review): the hunk line counts depend on blank and whitespace-only lines that are no longer
 * visible here, so this text cannot be applied as-is.
 */
Index: ext/standard/html.c =================================================================== RCS file: /repository/php-src/ext/standard/html.c,v retrieving revision 1.111.2.2 diff -u -r1.111.2.2 html.c --- ext/standard/html.c 25 Feb 2006 21:32:11 -0000 1.111.2.2 +++ ext/standard/html.c 22 Sep 2006 22:23:02 -0000 @@ -379,38 +379,87 @@ "#733", "#731", "#711" }; -struct html_entity_map { +typedef struct html_entity_map { enum entity_charset charset; /* charset identifier */ - unsigned short basechar; /* char code at start of table */ - unsigned short endchar; /* last char code in the table */ - entity_table_t *table; /* the table of mappings */ -}; + struct _entity_list { + unsigned short basechar; /* char code at start of table */ + unsigned short endchar; /* last char code in the table */ + entity_table_t *table; /* the table of mappings */ + } entity_list[16]; /* fixed capacity: a charset's ranges plus its HTML_ENTITY_END terminator must fit in 16 slots; cs_utf_8 below uses 11 */ +} html_entity_map; + +#define HTML_ENTITY_MAP(a, b, c) { (a) , (b), (c) } +#define HTML_ENTITY_END {0, 0, NULL} /* terminator entry: the range loops in the later hunks stop at the first entry whose table is NULL */ +#define HTML_ENTITY_LIST(map) ((map)->entity_list) static const struct html_entity_map entity_map[] = { - { cs_cp1252, 0x80, 0x9f, ent_cp_1252 }, - { cs_cp1252, 0xa0, 0xff, ent_iso_8859_1 }, - { cs_8859_1, 0xa0, 0xff, ent_iso_8859_1 }, - { cs_8859_15, 0xa0, 0xff, ent_iso_8859_15 }, - { cs_utf_8, 0xa0, 0xff, ent_iso_8859_1 }, - { cs_utf_8, 338, 402, ent_uni_338_402 }, - { cs_utf_8, 710, 732, ent_uni_spacing }, - { cs_utf_8, 913, 982, ent_uni_greek }, - { cs_utf_8, 8194, 8260, ent_uni_punct }, - { cs_utf_8, 8364, 8364, ent_uni_euro }, - { cs_utf_8, 8465, 8501, ent_uni_8465_8501 }, - { cs_utf_8, 8592, 9002, ent_uni_8592_9002 }, - { cs_utf_8, 9674, 9674, ent_uni_9674 }, - { cs_utf_8, 9824, 9830, ent_uni_9824_9830 }, - { cs_big5, 0xa0, 0xff, ent_iso_8859_1 }, - { cs_gb2312, 0xa0, 0xff, ent_iso_8859_1 }, - { cs_big5hkscs, 0xa0, 0xff, ent_iso_8859_1 }, - { cs_sjis, 0xa0, 0xff, ent_iso_8859_1 }, - { cs_eucjp, 0xa0, 0xff, ent_iso_8859_1 }, - { cs_koi8r, 0xa3, 0xff, ent_koi8r }, - { cs_cp1251, 0x80, 0xff, 
ent_cp_1251 }, - { cs_8859_5, 0xc0, 0xff, ent_iso_8859_5 }, - { cs_cp866, 0xc0, 0xff, ent_cp_866 }, - { cs_macroman, 0x0b, 0xff, ent_macroman }, + { cs_cp1252, { + HTML_ENTITY_MAP(0x80, 0x9f, ent_cp_1252), + /* cs_cp1252, */ + HTML_ENTITY_MAP(0xa0, 0xff, ent_iso_8859_1), + HTML_ENTITY_END + }}, + { cs_8859_1, { + HTML_ENTITY_MAP(0xa0, 0xff, ent_iso_8859_1), + HTML_ENTITY_END + }}, + { cs_8859_15, { + HTML_ENTITY_MAP(0xa0, 0xff, ent_iso_8859_15), + HTML_ENTITY_END + }}, + { cs_utf_8, { + HTML_ENTITY_MAP(0xa0, 0xff, ent_iso_8859_1), + HTML_ENTITY_MAP(338, 402, ent_uni_338_402), + HTML_ENTITY_MAP(710, 732, ent_uni_spacing), + HTML_ENTITY_MAP(913, 982, ent_uni_greek), + HTML_ENTITY_MAP(8194, 8260, ent_uni_punct), + HTML_ENTITY_MAP(8364, 8364, ent_uni_euro), + HTML_ENTITY_MAP(8465, 8501, ent_uni_8465_8501), + HTML_ENTITY_MAP(8592, 9002, ent_uni_8592_9002), + HTML_ENTITY_MAP(9674, 9674, ent_uni_9674), + HTML_ENTITY_MAP(9824, 9830, ent_uni_9824_9830), + HTML_ENTITY_END + }}, + { cs_big5, { + HTML_ENTITY_MAP(0xa0, 0xff, ent_iso_8859_1 ), + HTML_ENTITY_END + }}, + { cs_gb2312, { + HTML_ENTITY_MAP(0xa0, 0xff, ent_iso_8859_1 ), + HTML_ENTITY_END + }}, + { cs_big5hkscs, { + HTML_ENTITY_MAP(0xa0, 0xff, ent_iso_8859_1 ), + HTML_ENTITY_END + }}, + { cs_sjis, { + HTML_ENTITY_MAP(0xa0, 0xff, ent_iso_8859_1 ), + HTML_ENTITY_END + }}, + { cs_eucjp, { + HTML_ENTITY_MAP(0xa0, 0xff, ent_iso_8859_1 ), + HTML_ENTITY_END + }}, + { cs_koi8r, { + HTML_ENTITY_MAP(0xa3, 0xff, ent_koi8r ), + HTML_ENTITY_END + }}, + { cs_cp1251, { + HTML_ENTITY_MAP(0x80, 0xff, ent_cp_1251 ), + HTML_ENTITY_END + }}, + { cs_8859_5, { + HTML_ENTITY_MAP(0xc0, 0xff, ent_iso_8859_5 ), + HTML_ENTITY_END + }}, + { cs_cp866, { + HTML_ENTITY_MAP(0xc0, 0xff, ent_cp_866 ), + HTML_ENTITY_END + }}, + { cs_macroman, { + HTML_ENTITY_MAP(0x0b, 0xff, ent_macroman ), + HTML_ENTITY_END + }}, { cs_terminator } }; @@ -883,29 +932,35 @@ enum entity_charset charset = determine_charset(hint_charset TSRMLS_CC); unsigned char 
replacement[15]; int replacement_len; + const html_entity_map * charset_entity_map = NULL; ret = estrndup(old, oldlen); retlen = oldlen; if (!retlen) { goto empty_source; } + + for(j = 0; entity_map[j].charset != cs_terminator; j++) { /* no break: the scan always runs to cs_terminator and keeps the last matching row */ + if(entity_map[j].charset == charset) { + charset_entity_map = &entity_map[j]; + } + } + if (all) { /* look for a match in the maps for this charset */ /* NOTE(review): charset_entity_map is dereferenced below without a NULL check; it stays NULL when no entity_map row matched charset. The hunk at -1322 guards the same walk with if(charset_entity_map). */ - for (j = 0; entity_map[j].charset != cs_terminator; j++) { - if (entity_map[j].charset != charset) - continue; + for (j = 0; HTML_ENTITY_LIST(charset_entity_map)[j].table; j++) { - for (k = entity_map[j].basechar; k <= entity_map[j].endchar; k++) { + for (k = HTML_ENTITY_LIST(charset_entity_map)[j].basechar; k <= HTML_ENTITY_LIST(charset_entity_map)[j].endchar; k++) { unsigned char entity[32]; int entity_length = 0; - if (entity_map[j].table[k - entity_map[j].basechar] == NULL) + if (HTML_ENTITY_LIST(charset_entity_map)[j].table[k - HTML_ENTITY_LIST(charset_entity_map)[j].basechar] == NULL) continue; entity[0] = '&'; - entity_length = strlen(entity_map[j].table[k - entity_map[j].basechar]); - strncpy(&entity[1], entity_map[j].table[k - entity_map[j].basechar], sizeof(entity) - 2); + entity_length = strlen(HTML_ENTITY_LIST(charset_entity_map)[j].table[k - HTML_ENTITY_LIST(charset_entity_map)[j].basechar]); + strncpy(&entity[1], HTML_ENTITY_LIST(charset_entity_map)[j].table[k - HTML_ENTITY_LIST(charset_entity_map)[j].basechar], sizeof(entity) - 2); /* NOTE(review): entity_length is the full strlen while the copy is capped at sizeof(entity) - 2, so the two stores below stay inside entity[32] only if every table string is at most 29 chars - confirm */ entity[entity_length+1] = ';'; entity[entity_length+2] = '\0'; entity_length += 2; @@ -1077,17 +1132,25 @@ */ PHPAPI char *php_escape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset TSRMLS_DC) { - int i, j, maxlen, len; - char *replaced; + int i, j, maxlen, len, prev; + char *replaced = NULL; enum entity_charset charset = determine_charset(hint_charset TSRMLS_CC); - int matches_map; + int matches_map, is_basic; + char *entity_value = NULL; + int entity_len = 0; + const html_entity_map 
* charset_entity_map = NULL; - maxlen = 2 * oldlen; - if (maxlen < 128) - maxlen = 128; - replaced = emalloc (maxlen); + maxlen = 0; + prev = 0; /* prev: offset in old up to which input has already been copied into replaced (see the memcpy from &old[prev] in the later hunk) */ len = 0; + for(j = 0; entity_map[j].charset != cs_terminator; j++) { /* no break: keeps the last entity_map row whose charset matches */ + if(entity_map[j].charset == charset) { + charset_entity_map = &entity_map[j]; + } + } + + i = 0; while (i < oldlen) { unsigned char mbsequence[16]; /* allow up to 15 characters in a multibyte sequence */ @@ -1095,20 +1158,19 @@ unsigned short this_char = get_next_char(charset, old, &i, mbsequence, &mbseqlen); matches_map = 0; - - if (len + 9 > maxlen) - replaced = erealloc (replaced, maxlen += 128); + is_basic = 0; + entity_value = NULL; + entity_len = 0; if (all) { /* look for a match in the maps for this charset */ unsigned char *rep = NULL; - for (j = 0; entity_map[j].charset != cs_terminator; j++) { - if (entity_map[j].charset == charset - && this_char >= entity_map[j].basechar - && this_char <= entity_map[j].endchar) { - rep = (unsigned char*)entity_map[j].table[this_char - entity_map[j].basechar]; + for (j = 0; HTML_ENTITY_LIST(charset_entity_map)[j].table; j++) { /* NOTE(review): no NULL check on charset_entity_map before this dereference */ + if (this_char >= HTML_ENTITY_LIST(charset_entity_map)[j].basechar + && this_char <= HTML_ENTITY_LIST(charset_entity_map)[j].endchar) { + rep = (unsigned char*)HTML_ENTITY_LIST(charset_entity_map)[j].table[this_char - HTML_ENTITY_LIST(charset_entity_map)[j].basechar]; if (rep == NULL) { /* there is no entity for this position; fall through and * just output the character itself */ @@ -1116,24 +1178,17 @@ } matches_map = 1; + entity_value = rep; + entity_len = strlen(rep); break; } } - - if (matches_map) { - replaced[len++] = '&'; - strcpy(replaced + len, rep); - len += strlen(rep); - replaced[len++] = ';'; - } } if (!matches_map) { - int is_basic = 0; - if (this_char == '&') { - memcpy(replaced + len, "&", sizeof("&") - 1); - len += sizeof("&") - 1; is_basic = 1; + entity_value = "&"; + entity_len = sizeof("&") - 1; /* NOTE(review): as shown, the replacement for an ampersand is a bare ampersand; presumably the entity text was lost when this patch was rendered - confirm */ } else { for (j = 0; basic_entities[j].charcode != 0; j++) { if 
((basic_entities[j].charcode != this_char) || @@ -1142,31 +1197,65 @@ continue; } - memcpy(replaced + len, basic_entities[j].entity, basic_entities[j].entitylen); - len += basic_entities[j].entitylen; - is_basic = 1; + + entity_value = basic_entities[j].entity; + entity_len = basic_entities[j].entitylen; /* NOTE(review): no visible added line sets is_basic = 1 here (the removed one above has no counterpart), so this match would not pass the matches_map || is_basic test below - confirm against the un-collapsed patch */ + break; } } + } - if (!is_basic) { - /* a wide char without a named entity; pass through the original sequence */ - if (mbseqlen > 1) { - memcpy(replaced + len, mbsequence, mbseqlen); - len += mbseqlen; - } else { - replaced[len++] = (unsigned char)this_char; - } + if(matches_map || is_basic) { + /* approximate new length */ + int newlen = len + (i - prev) + entity_len + 2; /* this local newlen shadows the int *newlen parameter for the rest of this block */ + if(replaced == NULL) { + maxlen = (oldlen | 0x7f) + 1; /* round to 128 */ + replaced = emalloc (maxlen); + } + if (newlen > maxlen) { + /* maxlen += 128; */ + maxlen = (newlen | 0x7f) + 1; + replaced = erealloc (replaced, maxlen); + } + /* memcpy & set i to value */ + memcpy(&replaced[len], &old[prev], (i - mbseqlen) - prev); + len += ((i - mbseqlen) - prev); + prev = i; + if(is_basic) { + strcpy(replaced + len, entity_value); + len += entity_len; + } else if(matches_map) { + replaced[len++] = '&'; + strcpy(replaced + len, entity_value); + len += entity_len; + replaced[len++] = ';'; } } } - replaced[len] = '\0'; - *newlen = len; - - return replaced; + if(replaced) { + if(prev != oldlen) { + /* approximate new length */ + int newlen = len + (i - prev) + 1; /* shadows the int *newlen parameter inside this block only; the store through the parameter happens after the block closes */ + if (newlen > maxlen) { + /* maxlen += 128; */ + maxlen = (newlen | 0x7f) + 1; + replaced = erealloc (replaced, maxlen); + } + /* memcpy & set i to value */ + memcpy(&replaced[len], &old[prev], i - prev); + len += i - prev; + } + replaced[len] = '\0'; + *newlen = len; + } else { + replaced = old; /* nothing was escaped: the input pointer itself is returned, not a copy; the caller in the next hunk tests replaced == str for this case */ + *newlen = oldlen; + } + return replaced; } /* }}} */ @@ -1179,13 +1268,26 @@ int len; long quote_style = ENT_COMPAT; char *replaced; + zval *zstr; - if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|ls", &str, &str_len, "e_style, 
&hint_charset, &hint_charset_len) == FAILURE) { + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "z|ls", &zstr, "e_style, &hint_charset, &hint_charset_len) == FAILURE) { return; } + if(Z_TYPE_P(zstr) != IS_STRING) { + convert_to_string_ex(&zstr); + } + + str = Z_STRVAL_P(zstr); + str_len = Z_STRLEN_P(zstr); + replaced = php_escape_html_entities(str, str_len, &len, all, quote_style, hint_charset TSRMLS_CC); - RETVAL_STRINGL(replaced, len, 0); + + if(replaced == str) { + RETVAL_STRINGL(replaced, len, 1); + } else { + RETVAL_STRINGL(replaced, len, 0); + } } /* }}} */ @@ -1314,6 +1416,9 @@ int i, j; char ind[2]; enum entity_charset charset = determine_charset(NULL TSRMLS_CC); + const html_entity_map * charset_entity_map = NULL; + + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "|ll", &which, "e_style) == FAILURE) { return; @@ -1322,22 +1427,28 @@ array_init(return_value); ind[1] = 0; + + for(j = 0; entity_map[j].charset != cs_terminator; j++) { + if(entity_map[j].charset == charset) { + charset_entity_map = &entity_map[j]; + } + } switch (which) { case HTML_ENTITIES: - for (j=0; entity_map[j].charset != cs_terminator; j++) { - if (entity_map[j].charset != charset) - continue; - for (i = 0; i <= entity_map[j].endchar - entity_map[j].basechar; i++) { - char buffer[16]; - - if (entity_map[j].table[i] == NULL) - continue; - /* what about wide chars here ?? */ - ind[0] = i + entity_map[j].basechar; - sprintf(buffer, "&%s;", entity_map[j].table[i]); - add_assoc_string(return_value, ind, buffer, 1); + if(charset_entity_map) { + for(j = 0; HTML_ENTITY_LIST(charset_entity_map)[j].table ; j++) { + for (i = 0; i <= HTML_ENTITY_LIST(charset_entity_map)[j].endchar - HTML_ENTITY_LIST(charset_entity_map)[j].basechar; i++) { + char buffer[16]; + + if (HTML_ENTITY_LIST(charset_entity_map)[j].table[i] == NULL) + continue; + /* what about wide chars here ?? 
*/ + ind[0] = i + HTML_ENTITY_LIST(charset_entity_map)[j].basechar; + sprintf(buffer, "&%s;", HTML_ENTITY_LIST(charset_entity_map)[j].table[i]); + add_assoc_string(return_value, ind, buffer, 1); + } } } /* break thru */