[smooth] Faster bitmap sweeping.

Selecting the fill rule or checking the direct mode each time we call
`gray_hline' is sub-optimal.  This effectively splits the direct mode
into a separate code path while inlining `gray_hline' and saving 5-7%
of rendering time.

* src/smooth/ftgrays.c (gray_hline): Eliminated in favor of...
(FT_FILL_RULE, FT_GRAY_SET): ... these new macros...
(gray_sweep): ... inlined here.
(gray_sweep_direct): New function that handles the direct span buffer.
(gray_TWorker): Remove the span buffer.
(gray_raster_render, gray_convert_glyph): Updated.
diff --git a/ChangeLog b/ChangeLog
index a8542d2..d482631 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,19 @@
+2021-05-07  Alexei Podtelezhnikov  <apodtele@gmail.com>
+
+	[smooth] Faster bitmap sweeping.
+
+	Selecting the fill rule or checking the direct mode each time we call
+	`gray_hline' is sub-optimal.  This effectively splits the direct mode
+	into a separate code path while inlining `gray_hline' and saving 5-7%
+	of rendering time.
+
+	* src/smooth/ftgrays.c (gray_hline): Eliminated in favor of...
+	(FT_FILL_RULE, FT_GRAY_SET): ... these new macros...
+	(gray_sweep): ... inlined here.
+	(gray_sweep_direct): New function that handles the direct span buffer.
+	(gray_TWorker): Remove the span buffer.
+	(gray_raster_render, gray_convert_glyph): Updated.
+
 2021-05-10  Alexei Podtelezhnikov  <apodtele@gmail.com>
 
 	* src/smooth/ftgrays.c (gray_hline): Simplify even-odd computations.
diff --git a/src/smooth/ftgrays.c b/src/smooth/ftgrays.c
index 94169a0..a40877e 100644
--- a/src/smooth/ftgrays.c
+++ b/src/smooth/ftgrays.c
@@ -392,6 +392,42 @@
             ( sizeof( long ) * FT_CHAR_BIT - PIXEL_BITS ) )
 
 
+  /* Scale area and apply fill rule to calculate the coverage byte. */
+  /* The top fill bit is used for the non-zero rule. The eighth     */
+  /* fill bit is used for the even-odd rule.  The higher coverage   */
+  /* bytes are either clamped for the non-zero-rule or discarded    */
+  /* later for the even-odd rule.                                   */
+#define FT_FILL_RULE( coverage, area, fill )                \
+  FT_BEGIN_STMNT                                            \
+    coverage = (int)( area >> ( PIXEL_BITS * 2 + 1 - 8 ) ); \
+    if ( coverage & fill )                                  \
+      coverage = ~coverage;                                 \
+    if ( coverage > 255 && fill & INT_MIN )                 \
+      coverage = 255;                                       \
+  FT_END_STMNT
+
+
+  /* It is faster to write small spans byte-by-byte than calling     */
+  /* `memset'.  This is mainly due to the cost of the function call. */
+#define FT_GRAY_SET( d, s, count )           \
+  FT_BEGIN_STMNT                             \
+    unsigned char* q = d;                    \
+    unsigned char  c = (unsigned char)s;     \
+    switch ( count )                         \
+    {                                        \
+      case 7: *q++ = c; /* fall through */   \
+      case 6: *q++ = c; /* fall through */   \
+      case 5: *q++ = c; /* fall through */   \
+      case 4: *q++ = c; /* fall through */   \
+      case 3: *q++ = c; /* fall through */   \
+      case 2: *q++ = c; /* fall through */   \
+      case 1: *q   = c; /* fall through */   \
+      case 0: break;                         \
+      default: FT_MEM_SET( d, s, count );    \
+    }                                        \
+  FT_END_STMNT
+
+
   /**************************************************************************
    *
    * TYPE DEFINITIONS
@@ -463,8 +499,6 @@
 
     FT_Raster_Span_Func  render_span;
     void*                render_span_data;
-    FT_Span              spans[FT_MAX_GRAY_SPANS];
-    int                  num_spans;
 
   } gray_TWorker, *gray_PWorker;
 
@@ -1171,94 +1205,62 @@
 
 
   static void
-  gray_hline( RAS_ARG_ TCoord  x,
-                       TCoord  y,
-                       TArea   coverage,
-                       TCoord  acount )
+  gray_sweep( RAS_ARG )
   {
-    /* scale the coverage from 0..(ONE_PIXEL*ONE_PIXEL*2) to 0..256  */
-    coverage >>= PIXEL_BITS * 2 + 1 - 8;
+    int  fill = ras.outline.flags & FT_OUTLINE_EVEN_ODD_FILL ? 0x100 : INT_MIN;
+    int  coverage;
+    int  y;
 
-    /* compute the line's coverage depending on the outline fill rule */
-    if ( ras.outline.flags & FT_OUTLINE_EVEN_ODD_FILL )
+
+    for ( y = ras.min_ey; y < ras.max_ey; y++ )
     {
-      if ( coverage & 256 )  /* odd bit */
-        coverage = ~coverage;
+      PCell   cell  = ras.ycells[y - ras.min_ey];
+      TCoord  x     = ras.min_ex;
+      TArea   cover = 0;
+      TArea   area;
 
-      /* higher bits discarded below */
-    }
-    else  /* default non-zero winding rule */
-    {
-      if ( coverage < 0 )
-        coverage = ~coverage;  /* the same as -coverage - 1 */
-
-      if ( coverage >= 256 )
-        coverage = 255;
-    }
-
-    if ( ras.num_spans >= 0 )  /* for FT_RASTER_FLAG_DIRECT only */
-    {
-      FT_Span*  span = ras.spans + ras.num_spans++;
+      unsigned char*  line = ras.target.origin - ras.target.pitch * y;
 
 
-      span->x        = (short)x;
-      span->len      = (unsigned short)acount;
-      span->coverage = (unsigned char)coverage;
-
-      if ( ras.num_spans == FT_MAX_GRAY_SPANS )
+      for ( ; cell != NULL; cell = cell->next )
       {
-        /* flush the span buffer and reset the count */
-        ras.render_span( y, ras.num_spans, ras.spans, ras.render_span_data );
-        ras.num_spans = 0;
+        if ( cover != 0 && cell->x > x )
+        {
+          FT_FILL_RULE( coverage, cover, fill );
+          FT_GRAY_SET( line + x, coverage, cell->x - x );
+        }
+
+        cover += (TArea)cell->cover * ( ONE_PIXEL * 2 );
+        area   = cover - cell->area;
+
+        if ( area != 0 && cell->x >= ras.min_ex )
+        {
+          FT_FILL_RULE( coverage, area, fill );
+          line[cell->x] = (unsigned char)coverage;
+        }
+
+        x = cell->x + 1;
       }
-    }
-    else
-    {
-      unsigned char*  q = ras.target.origin - ras.target.pitch * y + x;
-      unsigned char   c = (unsigned char)coverage;
 
-
-      /* For small-spans it is faster to do it by ourselves than
-       * calling `memset'.  This is mainly due to the cost of the
-       * function call.
-       */
-      switch ( acount )
+      if ( cover != 0 )  /* only if cropped */
       {
-      case 7:
-        *q++ = c;
-        /* fall through */
-      case 6:
-        *q++ = c;
-        /* fall through */
-      case 5:
-        *q++ = c;
-        /* fall through */
-      case 4:
-        *q++ = c;
-        /* fall through */
-      case 3:
-        *q++ = c;
-        /* fall through */
-      case 2:
-        *q++ = c;
-        /* fall through */
-      case 1:
-        *q = c;
-        /* fall through */
-      case 0:
-        break;
-      default:
-        FT_MEM_SET( q, c, acount );
+        FT_FILL_RULE( coverage, cover, fill );
+        FT_GRAY_SET( line + x, coverage, ras.max_ex - x );
       }
     }
   }
 
 
   static void
-  gray_sweep( RAS_ARG )
+  gray_sweep_direct( RAS_ARG )
   {
+    int  fill = ras.outline.flags & FT_OUTLINE_EVEN_ODD_FILL ? 0x100 : INT_MIN;
+    int  coverage;
     int  y;
 
+    FT_Span  span[FT_MAX_GRAY_SPANS];
+    int      n = 0;
+
 
     for ( y = ras.min_ey; y < ras.max_ey; y++ )
     {
@@ -1271,25 +1273,59 @@
       for ( ; cell != NULL; cell = cell->next )
       {
         if ( cover != 0 && cell->x > x )
-          gray_hline( RAS_VAR_ x, y, cover, cell->x - x );
+        {
+          FT_FILL_RULE( coverage, cover, fill );
+
+          span[n].coverage = (unsigned char)coverage;
+          span[n].x        = (short)x;
+          span[n].len      = (unsigned short)( cell->x - x );
+
+          if ( ++n == FT_MAX_GRAY_SPANS )
+          {
+            /* flush the span buffer and reset the count */
+            ras.render_span( y, n, span, ras.render_span_data );
+            n = 0;
+          }
+        }
 
         cover += (TArea)cell->cover * ( ONE_PIXEL * 2 );
         area   = cover - cell->area;
 
         if ( area != 0 && cell->x >= ras.min_ex )
-          gray_hline( RAS_VAR_ cell->x, y, area, 1 );
+        {
+          FT_FILL_RULE( coverage, area, fill );
+
+          span[n].coverage = (unsigned char)coverage;
+          span[n].x        = (short)cell->x;
+          span[n].len      = 1;
+
+          if ( ++n == FT_MAX_GRAY_SPANS )
+          {
+            /* flush the span buffer and reset the count */
+            ras.render_span( y, n, span, ras.render_span_data );
+            n = 0;
+          }
+        }
 
         x = cell->x + 1;
       }
 
-      if ( cover != 0 )
-        gray_hline( RAS_VAR_ x, y, cover, ras.max_ex - x );
+      if ( cover != 0 )  /* only if cropped */
+      {
+        FT_FILL_RULE( coverage, cover, fill );
 
-      if ( ras.num_spans > 0 )  /* for FT_RASTER_FLAG_DIRECT only */
+        span[n].coverage = (unsigned char)coverage;
+        span[n].x        = (short)x;
+        span[n].len      = (unsigned short)( ras.max_ex - x );
+
+        ++n;
+      }
+
+      if ( n )
       {
         /* flush the span buffer and reset the count */
-        ras.render_span( y, ras.num_spans, ras.spans, ras.render_span_data );
-        ras.num_spans = 0;
+        ras.render_span( y, n, span, ras.render_span_data );
+        n = 0;
       }
     }
   }
@@ -1688,7 +1724,10 @@
 
         if ( !error )
         {
-          gray_sweep( RAS_VAR );
+          if ( ras.render_span )  /* for FT_RASTER_FLAG_DIRECT only */
+            gray_sweep_direct( RAS_VAR );
+          else
+            gray_sweep( RAS_VAR );
           band--;
           continue;
         }
@@ -1757,7 +1796,6 @@
 
       ras.render_span      = (FT_Raster_Span_Func)params->gray_spans;
       ras.render_span_data = params->user;
-      ras.num_spans        = 0;
 
       ras.min_ex = params->clip_box.xMin;
       ras.min_ey = params->clip_box.yMin;
@@ -1787,7 +1825,6 @@
 
       ras.render_span      = (FT_Raster_Span_Func)NULL;
       ras.render_span_data = NULL;
-      ras.num_spans        = -1;  /* invalid */
 
       ras.min_ex = 0;
       ras.min_ey = 0;