add LOCAL_CLANG_PREREQ and avoid WORK_AROUND_GCC w/clang 3.8+

This results in a 15-20% speedup for lossy decoding on an N5/S6/CM1.

BUG=webp:339

Change-Id: Icdeb84c3e0b8908147ac276b4d8f76c3d565b735
(cherry picked from commit f78da3dea6b2e02974a647122e96777667875d21)
diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h
index 65140e3..457db24 100644
--- a/src/dsp/dsp.h
+++ b/src/dsp/dsp.h
@@ -38,6 +38,15 @@
 # define LOCAL_GCC_PREREQ(maj, min) 0
 #endif
 
+#if defined(__clang__)
+# define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__)
+# define LOCAL_CLANG_PREREQ(maj, min) \
+    (LOCAL_CLANG_VERSION >= (((maj) << 8) | (min)))
+#else
+# define LOCAL_CLANG_VERSION 0
+# define LOCAL_CLANG_PREREQ(maj, min) 0
+#endif
+
 #ifndef __has_builtin
 # define __has_builtin(x) 0
 #endif
diff --git a/src/dsp/neon.h b/src/dsp/neon.h
index 1c181c8..67554b1 100644
--- a/src/dsp/neon.h
+++ b/src/dsp/neon.h
@@ -17,9 +17,9 @@
 #include "./dsp.h"
 
 // Right now, some intrinsics functions seem slower, so we disable them
-// everywhere except newer gcc or aarch64 where the inline assembly is
+// everywhere except newer clang/gcc or aarch64 where the inline assembly is
 // incompatible.
-#if LOCAL_GCC_PREREQ(4,9) || defined(__aarch64__)
+#if LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,9) || defined(__aarch64__)
 #define WEBP_USE_INTRINSICS   // use intrinsics when possible
 #endif
 
@@ -44,7 +44,7 @@
 // if using intrinsics, this flag avoids some functions that make gcc-4.6.3
 // crash ("internal compiler error: in immed_double_const, at emit-rtl.").
 // (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183)
-#if !(LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
+#if !(LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
 #define WORK_AROUND_GCC
 #endif