Sophie

Sophie

distrib > Mandriva > 2009.0 > i586 > by-pkgid > 85e22736dde948b4af8fc20c2ecb80cd > files > 16

php-5.2.6-18.14mdv2009.0.src.rpm


 http://svn.php.net/viewvc/php/php-src/branches/PHP_5_2/ext/standard/html.c?r1=272374&r2=291821&view=patch

--- ext/standard/html.c	2008-04-11 15:01:25.000000000 -0400
+++ ext/standard/html.c.oden	2010-01-15 11:15:50.000000000 -0500
@@ -484,15 +484,31 @@ struct basic_entities_dec {
 			}                        \
 			mbseq[mbpos++] = (mbchar); }
 
+/* skip one byte and return */
+#define MB_FAILURE(pos) do { \
+	*newpos = pos + 1; \
+	*status = FAILURE; \
+	return 0; \
+} while (0)
+
 #define CHECK_LEN(pos, chars_need)			\
-	if((str_len - (pos)) < chars_need) {	\
-		*status = FAILURE;					\
-		return 0;							\
+	if (chars_need < 1) {						\
+		if((str_len - (pos)) < chars_need) {	\
+			*newpos = pos;						\
+			*status = FAILURE;					\
+			return 0;							\
+		}										\
+	} else {									\
+		if((str_len - (pos)) < chars_need) {	\
+			*newpos = pos + 1;					\
+			*status = FAILURE;					\
+			return 0;							\
+		}										\
 	}
 
 /* {{{ get_next_char
  */
-inline static unsigned short get_next_char(enum entity_charset charset,
+inline static unsigned int get_next_char(enum entity_charset charset,
 		unsigned char * str,
 		int str_len,
 		int * newpos,
@@ -503,205 +519,191 @@ inline static unsigned short get_next_ch
 	int pos = *newpos;
 	int mbpos = 0;
 	int mbspace = *mbseqlen;
-	unsigned short this_char = str[pos++];
+	unsigned int this_char = 0;
 	unsigned char next_char;
 
 	*status = SUCCESS;
-	
+
 	if (mbspace <= 0) {
 		*mbseqlen = 0;
-		return this_char;
+		CHECK_LEN(pos, 1);
+		*newpos = pos + 1;
+		return str[pos];
 	}
-	
-	MB_WRITE((unsigned char)this_char);
-	
+
 	switch (charset) {
 		case cs_utf_8:
 			{
-				unsigned long utf = 0;
-				int stat = 0;
-				int more = 1;
-
-				/* unpack utf-8 encoding into a wide char.
-				 * Code stolen from the mbstring extension */
-
-				do {
-					if (this_char < 0x80) {
-						more = 0;
-						if(stat) {
-							/* we didn't finish the UTF sequence correctly */
-							*status = FAILURE;
-						}
-						break;
-					} else if (this_char < 0xc0) {
-						switch (stat) {
-							case 0x10:	/* 2, 2nd */
-							case 0x21:	/* 3, 3rd */
-							case 0x32:	/* 4, 4th */
-							case 0x43:	/* 5, 5th */
-							case 0x54:	/* 6, 6th */
-								/* last byte in sequence */
-								more = 0;
-								utf |= (this_char & 0x3f);
-								this_char = (unsigned short)utf;
-								break;
-							case 0x20:	/* 3, 2nd */
-							case 0x31:	/* 4, 3rd */
-							case 0x42:	/* 5, 4th */
-							case 0x53:	/* 6, 5th */
-								/* penultimate char */
-								utf |= ((this_char & 0x3f) << 6);
-								stat++;
-								break;
-							case 0x30:	/* 4, 2nd */
-							case 0x41:	/* 5, 3rd */
-							case 0x52:	/* 6, 4th */
-								utf |= ((this_char & 0x3f) << 12);
-								stat++;
-								break;
-							case 0x40:	/* 5, 2nd */
-							case 0x51:
-								utf |= ((this_char & 0x3f) << 18);
-								stat++;
-								break;
-							case 0x50:	/* 6, 2nd */
-								utf |= ((this_char & 0x3f) << 24);
-								stat++;
-								break;
-							default:
-								/* invalid */
-								*status = FAILURE;
-								more = 0;
-						}
-					}
-					/* lead byte */
-					else if (this_char < 0xe0) {
-						stat = 0x10;	/* 2 byte */
-						utf = (this_char & 0x1f) << 6;
-						CHECK_LEN(pos, 1);
-					} else if (this_char < 0xf0) {
-						stat = 0x20;	/* 3 byte */
-						utf = (this_char & 0xf) << 12;
-						CHECK_LEN(pos, 2);
-					} else if (this_char < 0xf8) {
-						stat = 0x30;	/* 4 byte */
-						utf = (this_char & 0x7) << 18;
-						CHECK_LEN(pos, 3);
-					} else if (this_char < 0xfc) {
-						stat = 0x40;	/* 5 byte */
-						utf = (this_char & 0x3) << 24;
-						CHECK_LEN(pos, 4);
-					} else if (this_char < 0xfe) {
-						stat = 0x50;	/* 6 byte */
-						utf = (this_char & 0x1) << 30;
-						CHECK_LEN(pos, 5);
-					} else {
-						/* invalid; bail */
-						more = 0;
-						*status = FAILURE;
-						break;
+				unsigned char c;
+				CHECK_LEN(pos, 1);
+				c = str[pos];
+				if (c < 0x80) {
+					MB_WRITE(c);
+					this_char = c;
+					pos++;
+				} else if (c < 0xc0) {
+					MB_FAILURE(pos);
+				} else if (c < 0xe0) {
+					CHECK_LEN(pos, 2);
+					if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
+						MB_FAILURE(pos);
 					}
-
-					if (more) {
-						this_char = str[pos++];
-						MB_WRITE((unsigned char)this_char);
+					this_char = ((c & 0x1f) << 6) | (str[pos + 1] & 0x3f);
+					if (this_char < 0x80) {
+						MB_FAILURE(pos);
 					}
-				} while (more);
+					MB_WRITE((unsigned char)c);
+					MB_WRITE((unsigned char)str[pos + 1]);
+					pos += 2;
+				} else if (c < 0xf0) {
+					CHECK_LEN(pos, 3);
+					if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
+						MB_FAILURE(pos);
+					}
+					if (str[pos + 2] < 0x80 || str[pos + 2] > 0xbf) {
+						MB_FAILURE(pos);
+					}
+					this_char = ((c & 0x0f) << 12) | ((str[pos + 1] & 0x3f) << 6) | (str[pos + 2] & 0x3f);
+					if (this_char < 0x800) {
+						MB_FAILURE(pos);
+					} else if (this_char >= 0xd800 && this_char <= 0xdfff) {
+						MB_FAILURE(pos);
+					}
+					MB_WRITE((unsigned char)c);
+					MB_WRITE((unsigned char)str[pos + 1]);
+					MB_WRITE((unsigned char)str[pos + 2]);
+					pos += 3;
+				} else if (c < 0xf8) {
+					CHECK_LEN(pos, 4);
+					if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
+						MB_FAILURE(pos);
+					}
+					if (str[pos + 2] < 0x80 || str[pos + 2] > 0xbf) {
+						MB_FAILURE(pos);
+					}
+					if (str[pos + 3] < 0x80 || str[pos + 3] > 0xbf) {
+						MB_FAILURE(pos);
+					}
+					this_char = ((c & 0x07) << 18) | ((str[pos + 1] & 0x3f) << 12) | ((str[pos + 2] & 0x3f) << 6) | (str[pos + 3] & 0x3f);
+					if (this_char < 0x10000) {
+						MB_FAILURE(pos);
+					}
+					MB_WRITE((unsigned char)c);
+					MB_WRITE((unsigned char)str[pos + 1]);
+					MB_WRITE((unsigned char)str[pos + 2]);
+					MB_WRITE((unsigned char)str[pos + 3]);
+					pos += 4;
+				} else {
+					MB_FAILURE(pos);
+				}
 			}
 			break;
 		case cs_big5:
 		case cs_gb2312:
 		case cs_big5hkscs:
 			{
+				CHECK_LEN(pos, 1);
+				this_char = str[pos++];
 				/* check if this is the first of a 2-byte sequence */
-				if (this_char >= 0xa1 && this_char <= 0xfe) {
+				if (this_char >= 0x81 && this_char <= 0xfe) {
 					/* peek at the next char */
 					CHECK_LEN(pos, 1);
-					next_char = str[pos];
+					next_char = str[pos++];
 					if ((next_char >= 0x40 && next_char <= 0x7e) ||
 							(next_char >= 0xa1 && next_char <= 0xfe)) {
 						/* yes, this a wide char */
-						this_char <<= 8;
+						MB_WRITE(this_char);
 						MB_WRITE(next_char);
-						this_char |= next_char;
-						pos++;
+						this_char = (this_char << 8) | next_char;
+					} else {
+						MB_FAILURE(pos);
 					}
-					
+				} else {
+					MB_WRITE(this_char);
 				}
-				break;
 			}
+			break;
 		case cs_sjis:
 			{
+				CHECK_LEN(pos, 1);
+				this_char = str[pos++];
 				/* check if this is the first of a 2-byte sequence */
-				if ( (this_char >= 0x81 && this_char <= 0x9f) ||
-					 (this_char >= 0xe0 && this_char <= 0xef)
-					) {
+				if ((this_char >= 0x81 && this_char <= 0x9f) ||
+					(this_char >= 0xe0 && this_char <= 0xfc)) {
 					/* peek at the next char */
 					CHECK_LEN(pos, 1);
-					next_char = str[pos];
+					next_char = str[pos++];
 					if ((next_char >= 0x40 && next_char <= 0x7e) ||
 						(next_char >= 0x80 && next_char <= 0xfc))
 					{
 						/* yes, this a wide char */
-						this_char <<= 8;
+						MB_WRITE(this_char);
 						MB_WRITE(next_char);
-						this_char |= next_char;
-						pos++;
+						this_char = (this_char << 8) | next_char;
+					} else {
+						MB_FAILURE(pos);
 					}
-					
+				} else {
+					MB_WRITE(this_char);
 				}
 				break;
 			}
 		case cs_eucjp:
 			{
+				CHECK_LEN(pos, 1);
+				this_char = str[pos++];
 				/* check if this is the first of a multi-byte sequence */
 				if (this_char >= 0xa1 && this_char <= 0xfe) {
 					/* peek at the next char */
 					CHECK_LEN(pos, 1);
-					next_char = str[pos];
+					next_char = str[pos++];
 					if (next_char >= 0xa1 && next_char <= 0xfe) {
 						/* yes, this a jis kanji char */
-						this_char <<= 8;
+						MB_WRITE(this_char);
 						MB_WRITE(next_char);
-						this_char |= next_char;
-						pos++;
+						this_char = (this_char << 8) | next_char;
+					} else {
+						MB_FAILURE(pos);
 					}
-					
 				} else if (this_char == 0x8e) {
 					/* peek at the next char */
 					CHECK_LEN(pos, 1);
-					next_char = str[pos];
+					next_char = str[pos++];
 					if (next_char >= 0xa1 && next_char <= 0xdf) {
 						/* JIS X 0201 kana */
-						this_char <<= 8;
+						MB_WRITE(this_char);
 						MB_WRITE(next_char);
-						this_char |= next_char;
-						pos++;
+						this_char = (this_char << 8) | next_char;
+					} else {
+						MB_FAILURE(pos);
 					}
-					
 				} else if (this_char == 0x8f) {
 					/* peek at the next two char */
 					unsigned char next2_char;
 					CHECK_LEN(pos, 2);
 					next_char = str[pos];
-					next2_char = str[pos+1];
+					next2_char = str[pos + 1];
+					pos += 2;
 					if ((next_char >= 0xa1 && next_char <= 0xfe) &&
 						(next2_char >= 0xa1 && next2_char <= 0xfe)) {
 						/* JIS X 0212 hojo-kanji */
-						this_char <<= 8;
+						MB_WRITE(this_char);
 						MB_WRITE(next_char);
-						this_char |= next_char;
-						pos++;
-						this_char <<= 8;
 						MB_WRITE(next2_char);
-						this_char |= next2_char;
-						pos++;
+						this_char = (this_char << 16) | (next_char << 8) | next2_char;
+					} else {
+						MB_FAILURE(pos);
 					}
-					
+				} else {
+					MB_WRITE(this_char);
 				}
 				break;
 			}
 		default:
+			/* single-byte charsets */
+			CHECK_LEN(pos, 1);
+			this_char = str[pos++];
+			MB_WRITE(this_char);
 			break;
 	}
 	MB_RETURN;
@@ -1132,7 +1134,7 @@ PHPAPI char *php_escape_html_entities_ex
 		unsigned char mbsequence[16];	/* allow up to 15 characters in a multibyte sequence */
 		int mbseqlen = sizeof(mbsequence);
 		int status = SUCCESS;
-		unsigned short this_char = get_next_char(charset, old, oldlen, &i, mbsequence, &mbseqlen, &status);
+		unsigned int this_char = get_next_char(charset, old, oldlen, &i, mbsequence, &mbseqlen, &status);
 
 		if(status == FAILURE) {
 			/* invalid MB sequence */