Index: ext/standard/html.c
===================================================================
RCS file: /repository/php-src/ext/standard/html.c,v
retrieving revision 1.111.2.2
diff -u -r1.111.2.2 html.c
--- ext/standard/html.c 25 Feb 2006 21:32:11 -0000 1.111.2.2
+++ ext/standard/html.c 22 Sep 2006 22:23:02 -0000
@@ -379,38 +379,87 @@
"#733", "#731", "#711"
};
-struct html_entity_map {
+typedef struct html_entity_map {
enum entity_charset charset; /* charset identifier */
- unsigned short basechar; /* char code at start of table */
- unsigned short endchar; /* last char code in the table */
- entity_table_t *table; /* the table of mappings */
-};
+ struct _entity_list {
+ unsigned short basechar; /* char code at start of table */
+ unsigned short endchar; /* last char code in the table */
+ entity_table_t *table; /* the table of mappings */
+ } entity_list[16];
+} html_entity_map;
+
+#define HTML_ENTITY_MAP(a, b, c) { (a) , (b), (c) }
+#define HTML_ENTITY_END {0, 0, NULL}
+#define HTML_ENTITY_LIST(map) ((map)->entity_list)
static const struct html_entity_map entity_map[] = {
- { cs_cp1252, 0x80, 0x9f, ent_cp_1252 },
- { cs_cp1252, 0xa0, 0xff, ent_iso_8859_1 },
- { cs_8859_1, 0xa0, 0xff, ent_iso_8859_1 },
- { cs_8859_15, 0xa0, 0xff, ent_iso_8859_15 },
- { cs_utf_8, 0xa0, 0xff, ent_iso_8859_1 },
- { cs_utf_8, 338, 402, ent_uni_338_402 },
- { cs_utf_8, 710, 732, ent_uni_spacing },
- { cs_utf_8, 913, 982, ent_uni_greek },
- { cs_utf_8, 8194, 8260, ent_uni_punct },
- { cs_utf_8, 8364, 8364, ent_uni_euro },
- { cs_utf_8, 8465, 8501, ent_uni_8465_8501 },
- { cs_utf_8, 8592, 9002, ent_uni_8592_9002 },
- { cs_utf_8, 9674, 9674, ent_uni_9674 },
- { cs_utf_8, 9824, 9830, ent_uni_9824_9830 },
- { cs_big5, 0xa0, 0xff, ent_iso_8859_1 },
- { cs_gb2312, 0xa0, 0xff, ent_iso_8859_1 },
- { cs_big5hkscs, 0xa0, 0xff, ent_iso_8859_1 },
- { cs_sjis, 0xa0, 0xff, ent_iso_8859_1 },
- { cs_eucjp, 0xa0, 0xff, ent_iso_8859_1 },
- { cs_koi8r, 0xa3, 0xff, ent_koi8r },
- { cs_cp1251, 0x80, 0xff, ent_cp_1251 },
- { cs_8859_5, 0xc0, 0xff, ent_iso_8859_5 },
- { cs_cp866, 0xc0, 0xff, ent_cp_866 },
- { cs_macroman, 0x0b, 0xff, ent_macroman },
+ { cs_cp1252, {
+ HTML_ENTITY_MAP(0x80, 0x9f, ent_cp_1252),
+ /* cs_cp1252, */
+ HTML_ENTITY_MAP(0xa0, 0xff, ent_iso_8859_1),
+ HTML_ENTITY_END
+ }},
+ { cs_8859_1, {
+ HTML_ENTITY_MAP(0xa0, 0xff, ent_iso_8859_1),
+ HTML_ENTITY_END
+ }},
+ { cs_8859_15, {
+ HTML_ENTITY_MAP(0xa0, 0xff, ent_iso_8859_15),
+ HTML_ENTITY_END
+ }},
+ { cs_utf_8, {
+ HTML_ENTITY_MAP(0xa0, 0xff, ent_iso_8859_1),
+ HTML_ENTITY_MAP(338, 402, ent_uni_338_402),
+ HTML_ENTITY_MAP(710, 732, ent_uni_spacing),
+ HTML_ENTITY_MAP(913, 982, ent_uni_greek),
+ HTML_ENTITY_MAP(8194, 8260, ent_uni_punct),
+ HTML_ENTITY_MAP(8364, 8364, ent_uni_euro),
+ HTML_ENTITY_MAP(8465, 8501, ent_uni_8465_8501),
+ HTML_ENTITY_MAP(8592, 9002, ent_uni_8592_9002),
+ HTML_ENTITY_MAP(9674, 9674, ent_uni_9674),
+ HTML_ENTITY_MAP(9824, 9830, ent_uni_9824_9830),
+ HTML_ENTITY_END
+ }},
+ { cs_big5, {
+ HTML_ENTITY_MAP(0xa0, 0xff, ent_iso_8859_1 ),
+ HTML_ENTITY_END
+ }},
+ { cs_gb2312, {
+ HTML_ENTITY_MAP(0xa0, 0xff, ent_iso_8859_1 ),
+ HTML_ENTITY_END
+ }},
+ { cs_big5hkscs, {
+ HTML_ENTITY_MAP(0xa0, 0xff, ent_iso_8859_1 ),
+ HTML_ENTITY_END
+ }},
+ { cs_sjis, {
+ HTML_ENTITY_MAP(0xa0, 0xff, ent_iso_8859_1 ),
+ HTML_ENTITY_END
+ }},
+ { cs_eucjp, {
+ HTML_ENTITY_MAP(0xa0, 0xff, ent_iso_8859_1 ),
+ HTML_ENTITY_END
+ }},
+ { cs_koi8r, {
+ HTML_ENTITY_MAP(0xa3, 0xff, ent_koi8r ),
+ HTML_ENTITY_END
+ }},
+ { cs_cp1251, {
+ HTML_ENTITY_MAP(0x80, 0xff, ent_cp_1251 ),
+ HTML_ENTITY_END
+ }},
+ { cs_8859_5, {
+ HTML_ENTITY_MAP(0xc0, 0xff, ent_iso_8859_5 ),
+ HTML_ENTITY_END
+ }},
+ { cs_cp866, {
+ HTML_ENTITY_MAP(0xc0, 0xff, ent_cp_866 ),
+ HTML_ENTITY_END
+ }},
+ { cs_macroman, {
+ HTML_ENTITY_MAP(0x0b, 0xff, ent_macroman ),
+ HTML_ENTITY_END
+ }},
{ cs_terminator }
};
@@ -883,29 +932,35 @@
enum entity_charset charset = determine_charset(hint_charset TSRMLS_CC);
unsigned char replacement[15];
int replacement_len;
+ const html_entity_map * charset_entity_map = NULL;
ret = estrndup(old, oldlen);
retlen = oldlen;
if (!retlen) {
goto empty_source;
}
+
+ for(j = 0; entity_map[j].charset != cs_terminator; j++) {
+ if(entity_map[j].charset == charset) {
+ charset_entity_map = &entity_map[j];
+ }
+ }
+
if (all) {
/* look for a match in the maps for this charset */
- for (j = 0; entity_map[j].charset != cs_terminator; j++) {
- if (entity_map[j].charset != charset)
- continue;
+ for (j = 0; HTML_ENTITY_LIST(charset_entity_map)[j].table; j++) {
- for (k = entity_map[j].basechar; k <= entity_map[j].endchar; k++) {
+ for (k = HTML_ENTITY_LIST(charset_entity_map)[j].basechar; k <= HTML_ENTITY_LIST(charset_entity_map)[j].endchar; k++) {
unsigned char entity[32];
int entity_length = 0;
- if (entity_map[j].table[k - entity_map[j].basechar] == NULL)
+ if (HTML_ENTITY_LIST(charset_entity_map)[j].table[k - HTML_ENTITY_LIST(charset_entity_map)[j].basechar] == NULL)
continue;
entity[0] = '&';
- entity_length = strlen(entity_map[j].table[k - entity_map[j].basechar]);
- strncpy(&entity[1], entity_map[j].table[k - entity_map[j].basechar], sizeof(entity) - 2);
+ entity_length = strlen(HTML_ENTITY_LIST(charset_entity_map)[j].table[k - HTML_ENTITY_LIST(charset_entity_map)[j].basechar]);
+ strncpy(&entity[1], HTML_ENTITY_LIST(charset_entity_map)[j].table[k - HTML_ENTITY_LIST(charset_entity_map)[j].basechar], sizeof(entity) - 2);
entity[entity_length+1] = ';';
entity[entity_length+2] = '\0';
entity_length += 2;
@@ -1077,17 +1132,25 @@
*/
PHPAPI char *php_escape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset TSRMLS_DC)
{
- int i, j, maxlen, len;
- char *replaced;
+ int i, j, maxlen, len, prev;
+ char *replaced = NULL;
enum entity_charset charset = determine_charset(hint_charset TSRMLS_CC);
- int matches_map;
+ int matches_map, is_basic;
+ char *entity_value = NULL;
+ int entity_len = 0;
+ const html_entity_map * charset_entity_map = NULL;
- maxlen = 2 * oldlen;
- if (maxlen < 128)
- maxlen = 128;
- replaced = emalloc (maxlen);
+ maxlen = 0;
+ prev = 0;
len = 0;
+ for(j = 0; entity_map[j].charset != cs_terminator; j++) {
+ if(entity_map[j].charset == charset) {
+ charset_entity_map = &entity_map[j];
+ }
+ }
+
+
i = 0;
while (i < oldlen) {
unsigned char mbsequence[16]; /* allow up to 15 characters in a multibyte sequence */
@@ -1095,20 +1158,19 @@
unsigned short this_char = get_next_char(charset, old, &i, mbsequence, &mbseqlen);
matches_map = 0;
-
- if (len + 9 > maxlen)
- replaced = erealloc (replaced, maxlen += 128);
+ is_basic = 0;
+ entity_value = NULL;
+ entity_len = 0;
if (all) {
/* look for a match in the maps for this charset */
unsigned char *rep = NULL;
- for (j = 0; entity_map[j].charset != cs_terminator; j++) {
- if (entity_map[j].charset == charset
- && this_char >= entity_map[j].basechar
- && this_char <= entity_map[j].endchar) {
- rep = (unsigned char*)entity_map[j].table[this_char - entity_map[j].basechar];
+ for (j = 0; HTML_ENTITY_LIST(charset_entity_map)[j].table; j++) {
+ if (this_char >= HTML_ENTITY_LIST(charset_entity_map)[j].basechar
+ && this_char <= HTML_ENTITY_LIST(charset_entity_map)[j].endchar) {
+ rep = (unsigned char*)HTML_ENTITY_LIST(charset_entity_map)[j].table[this_char - HTML_ENTITY_LIST(charset_entity_map)[j].basechar];
if (rep == NULL) {
/* there is no entity for this position; fall through and
* just output the character itself */
@@ -1116,24 +1178,17 @@
}
matches_map = 1;
+ entity_value = rep;
+ entity_len = strlen(rep);
break;
}
}
-
- if (matches_map) {
- replaced[len++] = '&';
- strcpy(replaced + len, rep);
- len += strlen(rep);
- replaced[len++] = ';';
- }
}
if (!matches_map) {
- int is_basic = 0;
-
if (this_char == '&') {
- memcpy(replaced + len, "&", sizeof("&") - 1);
- len += sizeof("&") - 1;
is_basic = 1;
+ entity_value = "&";
+ entity_len = sizeof("&") - 1;
} else {
for (j = 0; basic_entities[j].charcode != 0; j++) {
if ((basic_entities[j].charcode != this_char) ||
@@ -1142,31 +1197,65 @@
continue;
}
- memcpy(replaced + len, basic_entities[j].entity, basic_entities[j].entitylen);
- len += basic_entities[j].entitylen;
-
is_basic = 1;
+
+ entity_value = basic_entities[j].entity;
+ entity_len = basic_entities[j].entitylen;
+
break;
}
}
+ }
- if (!is_basic) {
- /* a wide char without a named entity; pass through the original sequence */
- if (mbseqlen > 1) {
- memcpy(replaced + len, mbsequence, mbseqlen);
- len += mbseqlen;
- } else {
- replaced[len++] = (unsigned char)this_char;
- }
+ if(matches_map || is_basic) {
+ /* approximate new length */
+ int newlen = len + (i - prev) + entity_len + 2;
+ if(replaced == NULL) {
+ maxlen = (oldlen | 0x7f) + 1; /* round to 128 */
+ replaced = emalloc (maxlen);
+ }
+ if (newlen > maxlen) {
+ /* maxlen += 128; */
+ maxlen = (newlen | 0x7f) + 1;
+ replaced = erealloc (replaced, maxlen);
+ }
+ /* copy the unescaped run preceding this char, then advance prev past the char */
+ memcpy(&replaced[len], &old[prev], (i - mbseqlen) - prev);
+ len += ((i - mbseqlen) - prev);
+ prev = i;
+ if(is_basic) {
+ strcpy(replaced + len, entity_value);
+ len += entity_len;
+ } else if(matches_map) {
+ replaced[len++] = '&';
+ strcpy(replaced + len, entity_value);
+ len += entity_len;
+ replaced[len++] = ';';
}
}
}
- replaced[len] = '\0';
- *newlen = len;
-
- return replaced;
+ if(replaced) {
+ if(prev != oldlen) {
+ /* approximate new length */
+ int newlen = len + (i - prev) + 1;
+ if (newlen > maxlen) {
+ /* maxlen += 128; */
+ maxlen = (newlen | 0x7f) + 1;
+ replaced = erealloc (replaced, maxlen);
+ }
+ /* copy the trailing unescaped run that follows the last entity */
+ memcpy(&replaced[len], &old[prev], i - prev);
+ len += i - prev;
+ }
+ replaced[len] = '\0';
+ *newlen = len;
+ } else {
+ replaced = old;
+ *newlen = oldlen;
+ }
+ return replaced;
}
/* }}} */
@@ -1179,13 +1268,26 @@
int len;
long quote_style = ENT_COMPAT;
char *replaced;
+ zval *zstr;
- if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|ls", &str, &str_len, "e_style, &hint_charset, &hint_charset_len) == FAILURE) {
+ if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "z|ls", &zstr, "e_style, &hint_charset, &hint_charset_len) == FAILURE) {
return;
}
+ if(Z_TYPE_P(zstr) != IS_STRING) {
+ convert_to_string_ex(&zstr);
+ }
+
+ str = Z_STRVAL_P(zstr);
+ str_len = Z_STRLEN_P(zstr);
+
replaced = php_escape_html_entities(str, str_len, &len, all, quote_style, hint_charset TSRMLS_CC);
- RETVAL_STRINGL(replaced, len, 0);
+
+ if(replaced == str) {
+ RETVAL_STRINGL(replaced, len, 1);
+ } else {
+ RETVAL_STRINGL(replaced, len, 0);
+ }
}
/* }}} */
@@ -1314,6 +1416,9 @@
int i, j;
char ind[2];
enum entity_charset charset = determine_charset(NULL TSRMLS_CC);
+ const html_entity_map * charset_entity_map = NULL;
+
+
if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "|ll", &which, "e_style) == FAILURE) {
return;
@@ -1322,22 +1427,28 @@
array_init(return_value);
ind[1] = 0;
+
+ for(j = 0; entity_map[j].charset != cs_terminator; j++) {
+ if(entity_map[j].charset == charset) {
+ charset_entity_map = &entity_map[j];
+ }
+ }
switch (which) {
case HTML_ENTITIES:
- for (j=0; entity_map[j].charset != cs_terminator; j++) {
- if (entity_map[j].charset != charset)
- continue;
- for (i = 0; i <= entity_map[j].endchar - entity_map[j].basechar; i++) {
- char buffer[16];
-
- if (entity_map[j].table[i] == NULL)
- continue;
- /* what about wide chars here ?? */
- ind[0] = i + entity_map[j].basechar;
- sprintf(buffer, "&%s;", entity_map[j].table[i]);
- add_assoc_string(return_value, ind, buffer, 1);
+ if(charset_entity_map) {
+ for(j = 0; HTML_ENTITY_LIST(charset_entity_map)[j].table ; j++) {
+ for (i = 0; i <= HTML_ENTITY_LIST(charset_entity_map)[j].endchar - HTML_ENTITY_LIST(charset_entity_map)[j].basechar; i++) {
+ char buffer[16];
+
+ if (HTML_ENTITY_LIST(charset_entity_map)[j].table[i] == NULL)
+ continue;
+ /* what about wide chars here ?? */
+ ind[0] = i + HTML_ENTITY_LIST(charset_entity_map)[j].basechar;
+ sprintf(buffer, "&%s;", HTML_ENTITY_LIST(charset_entity_map)[j].table[i]);
+ add_assoc_string(return_value, ind, buffer, 1);
+ }
}
}
/* break thru */