Fix a bug in UTF-16/32 detection.

Add the missing buffer length checks before the BOM bytes are read.

Upstream bug: http://bugs.icu-project.org/trac/ticket/10318

While I'm at it, I'm adding two patches (patches/ubrk.patch and
patches/utext.patch) that I forgot to include in previous check-ins
(http://crrev.com/121777, http://crrev.com/158118).

BUG=275803
TEST=SyzyASAN does not complain any more after this version of ICU is rolled.
R=tsepez@chromium.org

Review URL: https://codereview.chromium.org/22911033

git-svn-id: http://src.chromium.org/svn/trunk/deps/third_party/icu46@219032 4ff67af0-8c30-449e-8e8b-ad334ec8d88c
diff --git a/README.chromium b/README.chromium
index 4cabef4..76acd4b 100644
--- a/README.chromium
+++ b/README.chromium
@@ -250,3 +250,10 @@
 
 24. Fix compilation errors on VS2012.
     - patches/vs2012.patch
+
+25. Fix a buffer overflow in UTF-16/32 detection.
+    - patches/csetdet.patch
+    - upstream bug: http://bugs.icu-project.org/trac/ticket/10318
+
+
+
diff --git a/patches/csetdet.patch b/patches/csetdet.patch
new file mode 100644
index 0000000..73df253
--- /dev/null
+++ b/patches/csetdet.patch
@@ -0,0 +1,35 @@
+Index: source/i18n/csrucode.cpp
+===================================================================
+--- source/i18n/csrucode.cpp	(revision 214189)
++++ source/i18n/csrucode.cpp	(working copy)
+@@ -31,8 +31,9 @@
+ int32_t CharsetRecog_UTF_16_BE::match(InputText* textIn)
+ {
+     const uint8_t *input = textIn->fRawInput;
++    int32_t length = textIn->fRawLength;
+ 
+-    if (input[0] == 0xFE && input[1] == 0xFF) {
++    if (length >=2 && input[0] == 0xFE && input[1] == 0xFF) {
+         return 100;
+     }
+ 
+@@ -53,8 +54,9 @@
+ int32_t CharsetRecog_UTF_16_LE::match(InputText* textIn)
+ {
+     const uint8_t *input = textIn->fRawInput;
++    int32_t length = textIn->fRawLength;
+ 
+-    if (input[0] == 0xFF && input[1] == 0xFE && (input[2] != 0x00 || input[3] != 0x00)) {
++    if (length >= 4 && input[0] == 0xFF && input[1] == 0xFE && (input[2] != 0x00 || input[3] != 0x00)) {
+         return 100;
+     }
+ 
+@@ -76,7 +78,7 @@
+     bool hasBOM = FALSE;
+     int32_t confidence = 0;
+ 
+-    if (getChar(input, 0) == 0x0000FEFFUL) {
++    if (limit > 0 && getChar(input, 0) == 0x0000FEFFUL) {
+         hasBOM = TRUE;
+     }
+ 
diff --git a/patches/ubrk.patch b/patches/ubrk.patch
new file mode 100644
index 0000000..51213fd
--- /dev/null
+++ b/patches/ubrk.patch
@@ -0,0 +1,32 @@
+Index: source/common/ubrk.cpp
+===================================================================
+--- source/common/ubrk.cpp	(revision 120256)
++++ source/common/ubrk.cpp	(working copy)
+@@ -166,6 +166,13 @@
+              int32_t         textLength,
+              UErrorCode*     status)
+ {
++    if (bi == NULL) {
++        if (U_SUCCESS(*status)) {
++            *status = U_ILLEGAL_ARGUMENT_ERROR;
++        }
++        return;
++    }
++
+     BreakIterator *brit = (BreakIterator *)bi;
+     UText  ut = UTEXT_INITIALIZER;
+     utext_openUChars(&ut, text, textLength, status);
+@@ -181,6 +188,13 @@
+              UText          *text,
+              UErrorCode     *status)
+ {
++    if (bi == NULL) {
++        if (U_SUCCESS(*status)) {
++            *status = U_ILLEGAL_ARGUMENT_ERROR;
++        }
++        return;
++    }
++
+     RuleBasedBreakIterator *brit = (RuleBasedBreakIterator *)bi;
+     brit->RuleBasedBreakIterator::setText(text, *status);
+ }
diff --git a/patches/utext.patch b/patches/utext.patch
new file mode 100644
index 0000000..d92347f
--- /dev/null
+++ b/patches/utext.patch
@@ -0,0 +1,76 @@
+Index: test/cintltst/utexttst.c
+===================================================================
+--- test/cintltst/utexttst.c	(revision 29355)
++++ test/cintltst/utexttst.c	(revision 29356)
+@@ -1,6 +1,6 @@
+ /********************************************************************
+  * COPYRIGHT: 
+- * Copyright (c) 2005-2009, International Business Machines Corporation and
++ * Copyright (c) 2005-2011, International Business Machines Corporation and
+  * others. All Rights Reserved.
+  ********************************************************************/
+ /*
+@@ -210,6 +210,10 @@
+         UChar     uString[]  = {0x41, 0x42, 0x43, 0};
+         UChar     buf[100];
+         int32_t   i;
++        /* Test pinning of input bounds */
++        UChar     uString2[]  = {0x41, 0x42, 0x43, 0x44, 0x45,
++                                 0x46, 0x47, 0x48, 0x49, 0x4A, 0};
++        UChar *   uString2Ptr = uString2 + 5;
+ 
+         status = U_ZERO_ERROR;
+         uta = utext_openUChars(NULL, uString, -1, &status);
+@@ -228,6 +232,20 @@
+         i = u_strcmp(uString, buf);
+         TEST_ASSERT(i == 0);
+         utext_close(uta);
++
++        /* Test pinning of input bounds */
++        status = U_ZERO_ERROR;
++        uta = utext_openUChars(NULL, uString2Ptr, -1, &status);
++        TEST_SUCCESS(status);
++
++        status = U_ZERO_ERROR;
++        memset(buf, 0, sizeof(buf));
++        i = utext_extract(uta, -3, 20, buf, 100, &status);
++        TEST_SUCCESS(status);
++        TEST_ASSERT(i == u_strlen(uString2Ptr));
++        i = u_strcmp(uString2Ptr, buf);
++        TEST_ASSERT(i == 0);
++        utext_close(uta);
+     }
+ 
+     {
+Index: common/utext.cpp
+===================================================================
+--- common/utext.cpp	(revision 29355)
++++ common/utext.cpp	(revision 29356)
+@@ -1,7 +1,7 @@
+ /*
+ *******************************************************************************
+ *
+-*   Copyright (C) 2005-2010, International Business Machines
++*   Copyright (C) 2005-2011, International Business Machines
+ *   Corporation and others.  All Rights Reserved.
+ *
+ *******************************************************************************
+@@ -2846,7 +2846,6 @@
+         return 0;
+     }
+ 
+-    const UChar *s=(const UChar *)ut->context;
+     int32_t si, di;
+ 
+     int32_t start32;
+@@ -2856,8 +2855,8 @@
+     //   Pins 'start' to the length of the string, if it came in out-of-bounds.
+     //   Snaps 'start' to the beginning of a code point.
+     ucstrTextAccess(ut, start, TRUE);
+-    U_ASSERT(start <= INT32_MAX);
+-    start32 = (int32_t)start;
++    const UChar *s=ut->chunkContents;
++    start32 = ut->chunkOffset;
+ 
+     int32_t strLength=(int32_t)ut->a;
+     if (strLength >= 0) {
diff --git a/source/i18n/csrucode.cpp b/source/i18n/csrucode.cpp
index 99a76d8..3789fa9 100644
--- a/source/i18n/csrucode.cpp
+++ b/source/i18n/csrucode.cpp
@@ -31,8 +31,9 @@
 int32_t CharsetRecog_UTF_16_BE::match(InputText* textIn)
 {
     const uint8_t *input = textIn->fRawInput;
+    int32_t length = textIn->fRawLength;
 
-    if (input[0] == 0xFE && input[1] == 0xFF) {
+    if (length >=2 && input[0] == 0xFE && input[1] == 0xFF) {
         return 100;
     }
 
@@ -53,8 +54,9 @@
 int32_t CharsetRecog_UTF_16_LE::match(InputText* textIn)
 {
     const uint8_t *input = textIn->fRawInput;
+    int32_t length = textIn->fRawLength;
 
-    if (input[0] == 0xFF && input[1] == 0xFE && (input[2] != 0x00 || input[3] != 0x00)) {
+    if (length >= 4 && input[0] == 0xFF && input[1] == 0xFE && (input[2] != 0x00 || input[3] != 0x00)) {
         return 100;
     }
 
@@ -76,7 +78,7 @@
     bool hasBOM = FALSE;
     int32_t confidence = 0;
 
-    if (getChar(input, 0) == 0x0000FEFFUL) {
+    if (limit > 0 && getChar(input, 0) == 0x0000FEFFUL) {
         hasBOM = TRUE;
     }