gtk/roaring/roaring.c - external/github.com/GNOME/gtk - Git at Google

 /*
  * Amalgamated copy of CRoaring 0.2.66, modified for GTK to reduce compiler
  * warnings.
  *
  * Copyright 2016-2020 The CRoaring authors
  * Copyright 2020 Benjamin Otte
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *    http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  *
  * SPDX-License-Identifier: Apache-2.0
  */

 #include "roaring.h"

 /* used for http://dmalloc.com/ Dmalloc - Debug Malloc Library */
 #ifdef DMALLOC
 #include "dmalloc.h"
 #endif

 /* begin file src/array_util.c */
 #include <assert.h>
 #include <stdbool.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>


 #ifdef USESSE4
 // used by intersect_vector16
 //
 // Byte-shuffle control table: 256 rows of 16 bytes, one row per possible
 // 8-bit match mask r. Row r lists, in increasing order, the byte pair
 // (2*i, 2*i+1) for every bit i that is set in r and pads the remainder of
 // the row with 0xFF. intersect_vector16 loads row r as a __m128i and passes
 // it to _mm_shuffle_epi8, which packs the matching 16-bit lanes to the front
 // of the vector (a control byte with its high bit set, such as 0xFF, yields
 // a zero byte). The rows are read with the aligned load _mm_load_si128, so
 // the table must be at least 16-byte aligned; ALIGNED is defined elsewhere
 // and presumably requests 0x1000-byte alignment here.
 ALIGNED(0x1000)
 static const uint8_t shuffle_mask16[] = {
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0,    1,    2,    3,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     2,    3,    4,    5,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 6,    7,    0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0,    1,    6,    7,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,    0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
     6,    7,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     4,    5,    6,    7,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    6,    7,    0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,
     6,    7,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0,    1,    2,    3,    4,    5,    6,    7,    0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 8,    9,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    8,    9,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     2,    3,    8,    9,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    8,    9,    0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    8,    9,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0,    1,    4,    5,    8,    9,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,    8,    9,    0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
     4,    5,    8,    9,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     6,    7,    8,    9,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    6,    7,    8,    9,    0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,
     8,    9,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0,    1,    2,    3,    6,    7,    8,    9,    0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    6,    7,    8,    9,    0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
     6,    7,    8,    9,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     2,    3,    4,    5,    6,    7,    8,    9,    0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    6,    7,
     8,    9,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 10,   11,   0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0,    1,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
     10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     4,    5,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    10,   11,   0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,
     10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0,    1,    2,    3,    4,    5,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 6,    7,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    6,    7,
     10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     2,    3,    6,    7,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    6,    7,    10,   11,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    6,    7,
     10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0,    1,    4,    5,    6,    7,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,    6,    7,    10,   11,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
     4,    5,    6,    7,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     8,    9,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    8,    9,    10,   11,   0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    8,    9,
     10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0,    1,    2,    3,    8,    9,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    8,    9,    10,   11,   0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
     8,    9,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     2,    3,    4,    5,    8,    9,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    8,    9,
     10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 6,    7,    8,    9,
     10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0,    1,    6,    7,    8,    9,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,    8,    9,    10,   11,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
     6,    7,    8,    9,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     4,    5,    6,    7,    8,    9,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    6,    7,    8,    9,
     10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,
     6,    7,    8,    9,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0,    1,    2,    3,    4,    5,    6,    7,    8,    9,    10,   11,
     0xFF, 0xFF, 0xFF, 0xFF, 12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    12,   13,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     2,    3,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    12,   13,   0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    12,   13,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0,    1,    4,    5,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,    12,   13,   0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
     4,    5,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     6,    7,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    6,    7,    12,   13,   0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,
     12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0,    1,    2,    3,    6,    7,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    6,    7,    12,   13,   0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
     6,    7,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     2,    3,    4,    5,    6,    7,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    6,    7,
     12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 8,    9,    12,   13,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0,    1,    8,    9,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    8,    9,    12,   13,   0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
     8,    9,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     4,    5,    8,    9,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    8,    9,    12,   13,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,
     8,    9,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0,    1,    2,    3,    4,    5,    8,    9,    12,   13,   0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 6,    7,    8,    9,    12,   13,   0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    6,    7,
     8,    9,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     2,    3,    6,    7,    8,    9,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    6,    7,    8,    9,
     12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    6,    7,
     8,    9,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0,    1,    4,    5,    6,    7,    8,    9,    12,   13,   0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,    6,    7,    8,    9,
     12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
     4,    5,    6,    7,    8,    9,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
     10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    10,   11,   12,   13,   0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    10,   11,
     12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0,    1,    2,    3,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    10,   11,   12,   13,   0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
     10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     2,    3,    4,    5,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    10,   11,
     12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 6,    7,    10,   11,
     12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0,    1,    6,    7,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,    10,   11,   12,   13,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
     6,    7,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     4,    5,    6,    7,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    6,    7,    10,   11,
     12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,
     6,    7,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0,    1,    2,    3,    4,    5,    6,    7,    10,   11,   12,   13,
     0xFF, 0xFF, 0xFF, 0xFF, 8,    9,    10,   11,   12,   13,   0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    8,    9,
     10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     2,    3,    8,    9,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    8,    9,    10,   11,
     12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    8,    9,
     10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0,    1,    4,    5,    8,    9,    10,   11,   12,   13,   0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,    8,    9,    10,   11,
     12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
     4,    5,    8,    9,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
     6,    7,    8,    9,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    6,    7,    8,    9,    10,   11,
     12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,
     8,    9,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0,    1,    2,    3,    6,    7,    8,    9,    10,   11,   12,   13,
     0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    6,    7,    8,    9,    10,   11,
     12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
     6,    7,    8,    9,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
     2,    3,    4,    5,    6,    7,    8,    9,    10,   11,   12,   13,
     0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    6,    7,
     8,    9,    10,   11,   12,   13,   0xFF, 0xFF, 14,   15,   0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0,    1,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
     14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     4,    5,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    14,   15,   0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,
     14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0,    1,    2,    3,    4,    5,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 6,    7,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    6,    7,
     14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     2,    3,    6,    7,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    6,    7,    14,   15,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    6,    7,
     14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0,    1,    4,    5,    6,    7,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,    6,    7,    14,   15,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
     4,    5,    6,    7,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     8,    9,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    8,    9,    14,   15,   0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    8,    9,
     14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0,    1,    2,    3,    8,    9,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    8,    9,    14,   15,   0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
     8,    9,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     2,    3,    4,    5,    8,    9,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    8,    9,
     14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 6,    7,    8,    9,
     14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0,    1,    6,    7,    8,    9,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,    8,    9,    14,   15,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
     6,    7,    8,    9,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     4,    5,    6,    7,    8,    9,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    6,    7,    8,    9,
     14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,
     6,    7,    8,    9,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0,    1,    2,    3,    4,    5,    6,    7,    8,    9,    14,   15,
     0xFF, 0xFF, 0xFF, 0xFF, 10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    10,   11,
     14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     2,    3,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    10,   11,   14,   15,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    10,   11,
     14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0,    1,    4,    5,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,    10,   11,   14,   15,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
     4,    5,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     6,    7,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    6,    7,    10,   11,   14,   15,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,
     10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0,    1,    2,    3,    6,    7,    10,   11,   14,   15,   0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    6,    7,    10,   11,   14,   15,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
     6,    7,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     2,    3,    4,    5,    6,    7,    10,   11,   14,   15,   0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    6,    7,
     10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 8,    9,    10,   11,
     14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0,    1,    8,    9,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    8,    9,    10,   11,   14,   15,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
     8,    9,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     4,    5,    8,    9,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    8,    9,    10,   11,
     14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,
     8,    9,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0,    1,    2,    3,    4,    5,    8,    9,    10,   11,   14,   15,
     0xFF, 0xFF, 0xFF, 0xFF, 6,    7,    8,    9,    10,   11,   14,   15,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    6,    7,
     8,    9,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     2,    3,    6,    7,    8,    9,    10,   11,   14,   15,   0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    6,    7,    8,    9,
     10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    6,    7,
     8,    9,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0,    1,    4,    5,    6,    7,    8,    9,    10,   11,   14,   15,
     0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,    6,    7,    8,    9,
     10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
     4,    5,    6,    7,    8,    9,    10,   11,   14,   15,   0xFF, 0xFF,
     12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    12,   13,   14,   15,   0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    12,   13,
     14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0,    1,    2,    3,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    12,   13,   14,   15,   0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
     12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     2,    3,    4,    5,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    12,   13,
     14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 6,    7,    12,   13,
     14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0,    1,    6,    7,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,    12,   13,   14,   15,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
     6,    7,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     4,    5,    6,    7,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    6,    7,    12,   13,
     14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,
     6,    7,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0,    1,    2,    3,    4,    5,    6,    7,    12,   13,   14,   15,
     0xFF, 0xFF, 0xFF, 0xFF, 8,    9,    12,   13,   14,   15,   0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    8,    9,
     12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     2,    3,    8,    9,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    8,    9,    12,   13,
     14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    8,    9,
     12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0,    1,    4,    5,    8,    9,    12,   13,   14,   15,   0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,    8,    9,    12,   13,
     14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
     4,    5,    8,    9,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
     6,    7,    8,    9,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    6,    7,    8,    9,    12,   13,
     14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,
     8,    9,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0,    1,    2,    3,    6,    7,    8,    9,    12,   13,   14,   15,
     0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    6,    7,    8,    9,    12,   13,
     14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
     6,    7,    8,    9,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
     2,    3,    4,    5,    6,    7,    8,    9,    12,   13,   14,   15,
     0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    6,    7,
     8,    9,    12,   13,   14,   15,   0xFF, 0xFF, 10,   11,   12,   13,
     14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0,    1,    10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    10,   11,   12,   13,   14,   15,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
     10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     4,    5,    10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    10,   11,   12,   13,
     14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,
     10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0,    1,    2,    3,    4,    5,    10,   11,   12,   13,   14,   15,
     0xFF, 0xFF, 0xFF, 0xFF, 6,    7,    10,   11,   12,   13,   14,   15,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    6,    7,
     10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     2,    3,    6,    7,    10,   11,   12,   13,   14,   15,   0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    6,    7,    10,   11,
     12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    6,    7,
     10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0,    1,    4,    5,    6,    7,    10,   11,   12,   13,   14,   15,
     0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,    6,    7,    10,   11,
     12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
     4,    5,    6,    7,    10,   11,   12,   13,   14,   15,   0xFF, 0xFF,
     8,    9,    10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    8,    9,    10,   11,   12,   13,
     14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    8,    9,
     10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0,    1,    2,    3,    8,    9,    10,   11,   12,   13,   14,   15,
     0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    8,    9,    10,   11,   12,   13,
     14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
     8,    9,    10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
     2,    3,    4,    5,    8,    9,    10,   11,   12,   13,   14,   15,
     0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    8,    9,
     10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 6,    7,    8,    9,
     10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0,    1,    6,    7,    8,    9,    10,   11,   12,   13,   14,   15,
     0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,    8,    9,    10,   11,
     12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
     6,    7,    8,    9,    10,   11,   12,   13,   14,   15,   0xFF, 0xFF,
     4,    5,    6,    7,    8,    9,    10,   11,   12,   13,   14,   15,
     0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    6,    7,    8,    9,
     10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 2,    3,    4,    5,
     6,    7,    8,    9,    10,   11,   12,   13,   14,   15,   0xFF, 0xFF,
     0,    1,    2,    3,    4,    5,    6,    7,    8,    9,    10,   11,
     12,   13,   14,   15};

 /**
  * From Schlegel et al., Fast Sorted-Set Intersection using SIMD Instructions
  * Optimized by D. Lemire on May 3rd 2013
  */
 int32_t intersect_vector16(const uint16_t *__restrict__ A, size_t s_a,
                            const uint16_t *__restrict__ B, size_t s_b,
                            uint16_t *C) {
     size_t count = 0;
     size_t i_a = 0, i_b = 0;
     const int vectorlength = sizeof(__m128i) / sizeof(uint16_t);
     const size_t st_a = (s_a / vectorlength) * vectorlength;
     const size_t st_b = (s_b / vectorlength) * vectorlength;
     __m128i v_a, v_b;
     if ((i_a < st_a) && (i_b < st_b)) {
         v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
         v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
         while ((A[i_a] == 0) || (B[i_b] == 0)) {
             const __m128i res_v = _mm_cmpestrm(
                 v_b, vectorlength, v_a, vectorlength,
                 _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK);
             const int r = _mm_extract_epi32(res_v, 0);
             __m128i sm16 = _mm_load_si128((const __m128i *)shuffle_mask16 + r);
             __m128i p = _mm_shuffle_epi8(v_a, sm16);
             _mm_storeu_si128((__m128i *)&C[count], p);  // can overflow
             count += _mm_popcnt_u32(r);
             const uint16_t a_max = A[i_a + vectorlength - 1];
             const uint16_t b_max = B[i_b + vectorlength - 1];
             if (a_max <= b_max) {
                 i_a += vectorlength;
                 if (i_a == st_a) break;
                 v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
             }
             if (b_max <= a_max) {
                 i_b += vectorlength;
                 if (i_b == st_b) break;
                 v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
             }
         }
         if ((i_a < st_a) && (i_b < st_b))
             while (true) {
                 const __m128i res_v = _mm_cmpistrm(
                     v_b, v_a,
                     _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK);
                 const int r = _mm_extract_epi32(res_v, 0);
                 __m128i sm16 =
                     _mm_load_si128((const __m128i *)shuffle_mask16 + r);
                 __m128i p = _mm_shuffle_epi8(v_a, sm16);
                 _mm_storeu_si128((__m128i *)&C[count], p);  // can overflow
                 count += _mm_popcnt_u32(r);
                 const uint16_t a_max = A[i_a + vectorlength - 1];
                 const uint16_t b_max = B[i_b + vectorlength - 1];
                 if (a_max <= b_max) {
                     i_a += vectorlength;
                     if (i_a == st_a) break;
                     v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
                 }
                 if (b_max <= a_max) {
                     i_b += vectorlength;
                     if (i_b == st_b) break;
                     v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
                 }
             }
     }
     // intersect the tail using scalar intersection
     while (i_a < s_a && i_b < s_b) {
         uint16_t a = A[i_a];
         uint16_t b = B[i_b];
         if (a < b) {
             i_a++;
         } else if (b < a) {
             i_b++;
         } else {
             C[count] = a;  //==b;
             count++;
             i_a++;
             i_b++;
         }
     }
     return (int32_t)count;
 }

 // Counts the values that occur in both A (s_a elements) and B (s_b elements)
 // without writing the intersection anywhere.
 //
 // Blocks of 8 uint16_t are compared with the SSE4.2 string-compare
 // instructions; whatever is left over is finished by a scalar merge walk.
 // Both the block-advance logic (which compares the last element of each
 // block) and the scalar walk presume that A and B are sorted ascending.
 //
 // Returns the number of common values.
 int32_t intersect_vector16_cardinality(const uint16_t *__restrict__ A,
                                        size_t s_a,
                                        const uint16_t *__restrict__ B,
                                        size_t s_b) {
     size_t count = 0;
     size_t i_a = 0, i_b = 0;
     // number of uint16_t lanes in one 128-bit register
     const int vectorlength = sizeof(__m128i) / sizeof(uint16_t);
     // st_a / st_b: lengths rounded down to a whole number of vectors
     const size_t st_a = (s_a / vectorlength) * vectorlength;
     const size_t st_b = (s_b / vectorlength) * vectorlength;
     __m128i v_a, v_b;
     if ((i_a < st_a) && (i_b < st_b)) {
         v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
         v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
         // While either current block starts with the value 0, use the
         // explicit-length compare (_mm_cmpestrm): the implicit-length variant
         // used below treats a zero element as the end of the data.
         while ((A[i_a] == 0) || (B[i_b] == 0)) {
             // bit k of the mask is set when lane k of v_a equals any lane
             // of v_b
             const __m128i res_v = _mm_cmpestrm(
                 v_b, vectorlength, v_a, vectorlength,
                 _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK);
             const int r = _mm_extract_epi32(res_v, 0);
             count += _mm_popcnt_u32(r);
             // the block whose last element is smaller (or both, on a tie)
             // cannot produce further matches and is replaced by the next one
             const uint16_t a_max = A[i_a + vectorlength - 1];
             const uint16_t b_max = B[i_b + vectorlength - 1];
             if (a_max <= b_max) {
                 i_a += vectorlength;
                 if (i_a == st_a) break;
                 v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
             }
             if (b_max <= a_max) {
                 i_b += vectorlength;
                 if (i_b == st_b) break;
                 v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
             }
         }
         // Main loop: same block-advance scheme, but with the
         // implicit-length compare (_mm_cmpistrm). Only entered if the loop
         // above did not already exhaust the whole-vector part of A or B.
         if ((i_a < st_a) && (i_b < st_b))
             while (true) {
                 const __m128i res_v = _mm_cmpistrm(
                     v_b, v_a,
                     _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK);
                 const int r = _mm_extract_epi32(res_v, 0);
                 count += _mm_popcnt_u32(r);
                 const uint16_t a_max = A[i_a + vectorlength - 1];
                 const uint16_t b_max = B[i_b + vectorlength - 1];
                 if (a_max <= b_max) {
                     i_a += vectorlength;
                     if (i_a == st_a) break;
                     v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
                 }
                 if (b_max <= a_max) {
                     i_b += vectorlength;
                     if (i_b == st_b) break;
                     v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
                 }
             }
     }
     // intersect the tail using scalar intersection
     // (also the only path taken when either input is shorter than one vector)
     while (i_a < s_a && i_b < s_b) {
         uint16_t a = A[i_a];
         uint16_t b = B[i_b];
         if (a < b) {
             i_a++;
         } else if (b < a) {
             i_b++;
         } else {
             count++;
             i_a++;
             i_b++;
         }
     }
     return (int32_t)count;
 }

 /////////
 // Computes the set difference A \ B: the values of A (s_a elements) that do
 // not occur in B (s_b elements) are written to C, and the number of values
 // written is returned. The block-advance logic and the scalar tail presume
 // that A and B are sorted ascending.
 //
 // The vectorized path always stores a full 8-lane vector at C[count], even
 // when fewer lanes are kept, so C needs room for that over-wide store (see
 // the "can overflow" stores below).
 //
 // Warning:
 // This function may not be safe if A == C or B == C.
 // NOTE(review): the final copy below has an explicit C == A branch, so some
 // in-place use appears to be intended - confirm against callers.
 /////////
 int32_t difference_vector16(const uint16_t *__restrict__ A, size_t s_a,
                             const uint16_t *__restrict__ B, size_t s_b,
                             uint16_t *C) {
     // we handle the degenerate case
     if (s_a == 0) return 0;
     if (s_b == 0) {
         if (A != C) memcpy(C, A, sizeof(uint16_t) * s_a);
         return (int32_t)s_a;
     }
     // handle the leading zeroes, it is messy but it allows us to use the fast
     // _mm_cmpistrm intrinsic safely
     int32_t count = 0;
     if ((A[0] == 0) || (B[0] == 0)) {
         if ((A[0] == 0) && (B[0] == 0)) {
             // 0 is in both sets: drop it from both, nothing is emitted
             A++;
             s_a--;
             B++;
             s_b--;
         } else if (A[0] == 0) {
             // 0 is only in A: it belongs to the difference
             C[count++] = 0;
             A++;
             s_a--;
         } else {
             // 0 is only in B: it cannot remove anything from A
             B++;
             s_b--;
         }
     }
     // at this point, we have two non-empty arrays, made of non-zero
     // increasing values.
     // NOTE(review): after the adjustment above s_a or s_b can be 0 (e.g. when
     // A was just {0}); both loops below are then skipped because their
     // bounds checks fail.
     size_t i_a = 0, i_b = 0;
     const size_t vectorlength = sizeof(__m128i) / sizeof(uint16_t);
     // st_a / st_b: lengths rounded down to a whole number of vectors
     const size_t st_a = (s_a / vectorlength) * vectorlength;
     const size_t st_b = (s_b / vectorlength) * vectorlength;
     if ((i_a < st_a) && (i_b < st_b)) {  // this is the vectorized code path
         __m128i v_a, v_b;                //, v_bmax;
         // we load a vector from A and a vector from B
         v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
         v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
         // we have a runningmask which indicates which values from A have been
         // spotted in B, these don't get written out.
         __m128i runningmask_a_found_in_b = _mm_setzero_si128();
         /****
         * start of the main vectorized loop
         *****/
         while (true) {
             // a_found_in_b holds a bit mask telling, for each lane of v_a,
             // whether that value was seen in v_b
             const __m128i a_found_in_b =
                 _mm_cmpistrm(v_b, v_a, _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY |
                                            _SIDD_BIT_MASK);
             // accumulate: the same v_a may be compared against several
             // consecutive blocks of B before it is written out
             runningmask_a_found_in_b =
                 _mm_or_si128(runningmask_a_found_in_b, a_found_in_b);
             // we always compare the last values of A and B
             const uint16_t a_max = A[i_a + vectorlength - 1];
             const uint16_t b_max = B[i_b + vectorlength - 1];
             if (a_max <= b_max) {
                 // Ok. In this code path, we are ready to write our v_a
                 // because there is no need to read more from B, they will
                 // all be large values.
                 // XOR with 0xFF flips the low 8 bits: "found in B" becomes
                 // "keep in the result"
                 const int bitmask_belongs_to_difference =
                     _mm_extract_epi32(runningmask_a_found_in_b, 0) ^ 0xFF;
                 /*** next few lines are probably expensive *****/
                 // shuffle_mask16 is indexed in 16-byte entries by the 8-bit
                 // keep mask; the shuffle packs the kept lanes to the front
                 __m128i sm16 = _mm_load_si128((const __m128i *)shuffle_mask16 +
                                               bitmask_belongs_to_difference);
                 __m128i p = _mm_shuffle_epi8(v_a, sm16);
                 _mm_storeu_si128((__m128i *)&C[count], p);  // can overflow
                 // only the kept lanes count; the rest of the store is
                 // overwritten by later output
                 count += _mm_popcnt_u32(bitmask_belongs_to_difference);
                 // we advance a
                 i_a += vectorlength;
                 if (i_a == st_a)  // no more
                     break;
                 runningmask_a_found_in_b = _mm_setzero_si128();
                 v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
             }
             if (b_max <= a_max) {
                 // in this code path, the current v_b has become useless
                 i_b += vectorlength;
                 if (i_b == st_b) break;
                 v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
             }
         }
         // at this point, either we have i_a == st_a, which is the end of the
         // vectorized processing,
         // or we have i_b == st_b,  and we are not done processing the vector...
         // so we need to finish it off.
         if (i_a < st_a) {        // we have unfinished business...
             uint16_t buffer[8];  // buffer to do a masked load
             // the remaining (fewer than 8) values of B, zero-padded to a
             // full vector
             memset(buffer, 0, 8 * sizeof(uint16_t));
             memcpy(buffer, B + i_b, (s_b - i_b) * sizeof(uint16_t));
             v_b = _mm_lddqu_si128((__m128i *)buffer);
             const __m128i a_found_in_b =
                 _mm_cmpistrm(v_b, v_a, _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY |
                                            _SIDD_BIT_MASK);
             runningmask_a_found_in_b =
                 _mm_or_si128(runningmask_a_found_in_b, a_found_in_b);
             const int bitmask_belongs_to_difference =
                 _mm_extract_epi32(runningmask_a_found_in_b, 0) ^ 0xFF;
             __m128i sm16 = _mm_load_si128((const __m128i *)shuffle_mask16 +
                                           bitmask_belongs_to_difference);
             __m128i p = _mm_shuffle_epi8(v_a, sm16);
             _mm_storeu_si128((__m128i *)&C[count], p);  // can overflow
             count += _mm_popcnt_u32(bitmask_belongs_to_difference);
             i_a += vectorlength;
         }
         // at this point we should have i_a == st_a and i_b == st_b
     }
     // do the tail using scalar code
     while (i_a < s_a && i_b < s_b) {
         uint16_t a = A[i_a];
         uint16_t b = B[i_b];
         if (b < a) {
             i_b++;
         } else if (a < b) {
             C[count] = a;
             count++;
             i_a++;
         } else {  //==
             i_a++;
             i_b++;
         }
     }
     // B is exhausted: everything still left in A belongs to the difference
     if (i_a < s_a) {
         if(C == A) {
           // in-place: the output never runs ahead of the read position, and
           // the ranges may overlap, hence memmove
           assert((size_t)count <= i_a);
           if((size_t)count < i_a) {
             memmove(C + count, A + i_a, sizeof(uint16_t) * (s_a - i_a));
           }
         } else {
            for(size_t i = 0; i < (s_a - i_a); i++) {
                 C[count + i] = A[i + i_a];
            }
         }
         count += (int32_t)(s_a - i_a);
     }
     return count;
 }

 #endif  // USESSE4


 #ifdef USE_OLD_SKEW_INTERSECT
 // TODO: given enough experience with the new skew intersect, drop the old one from the code base.


 /* Computes the intersection between one small and one large set of uint16_t.
  * Stores the result into buffer and return the number of elements.
  *
  * small/size_s and large/size_l are the two input sets; the merge below
  * presumes both are sorted ascending. buffer receives the common values in
  * the order they are met in small. If either set is empty the result is
  * empty and 0 is returned without touching the inputs.
  *
  * advanceUntil is defined elsewhere in this file; from its use here it is
  * expected to return an index into large that is at least the starting
  * index, or size_l when no suitable element exists - confirm against its
  * definition. */
 int32_t intersect_skewed_uint16(const uint16_t *small, size_t size_s,
                                 const uint16_t *large, size_t size_l,
                                 uint16_t *buffer) {
     size_t pos = 0, idx_l = 0, idx_s = 0;

     // An empty operand yields an empty intersection. Checking size_l as well
     // keeps the initial large[0] read below inside the array.
     if (0 == size_s || 0 == size_l) {
         return 0;
     }

     uint16_t val_l = large[idx_l], val_s = small[idx_s];

     while (true) {
         if (val_l < val_s) {
             // large is behind: skip ahead in large towards val_s
             idx_l = advanceUntil(large, (int32_t)idx_l, (int32_t)size_l, val_s);
             if (idx_l == size_l) break;
             val_l = large[idx_l];
         } else if (val_s < val_l) {
             // small is behind: step to its next value
             idx_s++;
             if (idx_s == size_s) break;
             val_s = small[idx_s];
         } else {
             // common value: emit it, then move both sides forward
             buffer[pos++] = val_s;
             idx_s++;
             if (idx_s == size_s) break;
             val_s = small[idx_s];
             idx_l = advanceUntil(large, (int32_t)idx_l, (int32_t)size_l, val_s);
             if (idx_l == size_l) break;
             val_l = large[idx_l];
         }
     }

     return (int32_t)pos;
 }
 #else // USE_OLD_SKEW_INTERSECT


 /**
 * Branchless binary search going after 4 values at once.
 * Assumes that array is sorted.
 * You have that array[*index1] >= target1, array[*index2] >= target2, ...
 * except when *index1 = n, in which case you know that all values in array are
 * smaller than target1, and so forth.
 * It has logarithmic complexity.
 *
 * array/n: the sorted array and its length. A non-positive n is treated as an
 * empty array: every index is set to 0 (which equals n for n == 0), so the
 * outputs are always written and array is never dereferenced.
 * target1..target4: the values searched for.
 * index1..index4: outputs, one per target, as described above.
 */
 static void binarySearch4(const uint16_t *array, int32_t n, uint16_t target1,
                    uint16_t target2, uint16_t target3, uint16_t target4,
                    int32_t *index1, int32_t *index2, int32_t *index3,
                    int32_t *index4) {
   const uint16_t *base1 = array;
   const uint16_t *base2 = array;
   const uint16_t *base3 = array;
   const uint16_t *base4 = array;
   if (n <= 0) {
     // Nothing to search: report "all values are smaller" for every target.
     *index1 = 0;
     *index2 = 0;
     *index3 = 0;
     *index4 = 0;
     return;
   }
   // Each round keeps, per target, a window of n elements starting at baseK
   // and moves the start forward by half when the middle element is still
   // too small. The four searches share n, so they advance in lockstep.
   while (n > 1) {
     int32_t half = n >> 1;
     base1 = (base1[half] < target1) ? &base1[half] : base1;
     base2 = (base2[half] < target2) ? &base2[half] : base2;
     base3 = (base3[half] < target3) ? &base3[half] : base3;
     base4 = (base4[half] < target4) ? &base4[half] : base4;
     n -= half;
   }
   // One candidate is left per target; step past it if it is still too small.
   *index1 = (int32_t)((*base1 < target1) + base1 - array);
   *index2 = (int32_t)((*base2 < target2) + base2 - array);
   *index3 = (int32_t)((*base3 < target3) + base3 - array);
   *index4 = (int32_t)((*base4 < target4) + base4 - array);
 }

 /**
 * Branchless binary search going after 2 values at once.
 * Assumes that array is sorted.
 * You have that array[*index1] >= target1, array[*index2] >= target2.
 * except when *index1 = n, in which case you know that all values in array are
 * smaller than target1, and so forth.
 * It has logarithmic complexity.
 *
 * array/n: the sorted array and its length. A non-positive n is treated as an
 * empty array: both indexes are set to 0 (which equals n for n == 0), so the
 * outputs are always written and array is never dereferenced.
 * target1, target2: the values searched for.
 * index1, index2: outputs, one per target, as described above.
 */
 static void binarySearch2(const uint16_t *array, int32_t n, uint16_t target1,
                    uint16_t target2, int32_t *index1, int32_t *index2) {
   const uint16_t *base1 = array;
   const uint16_t *base2 = array;
   if (n <= 0) {
     // Nothing to search: report "all values are smaller" for both targets.
     *index1 = 0;
     *index2 = 0;
     return;
   }
   // Both searches share the window length n and advance in lockstep; each
   // moves its window start forward by half when the middle is too small.
   while (n > 1) {
     int32_t half = n >> 1;
     base1 = (base1[half] < target1) ? &base1[half] : base1;
     base2 = (base2[half] < target2) ? &base2[half] : base2;
     n -= half;
   }
   // One candidate is left per target; step past it if it is still too small.
   *index1 = (int32_t)((*base1 < target1) + base1 - array);
   *index2 = (int32_t)((*base2 < target2) + base2 - array);
 }

 /* Intersection of one short and one long set of uint16_t.
  * The common values are written to buffer and their number is returned.
  * The short set is consumed in batches: groups of four values go through
  * binarySearch4, a leftover pair through binarySearch2 and a last single
  * value through binarySearch. After each batch the long set is narrowed so
  * that it starts at the position found for the batch's last value.
  * This approach can be slightly superior to a conventional galloping search
  * in some instances.
  */
 int32_t intersect_skewed_uint16(const uint16_t *small, size_t size_s,
                                          const uint16_t *large, size_t size_l,
                                          uint16_t *buffer) {
   size_t n_out = 0, off_l = 0, off_s = 0;
   int32_t hit1 = 0, hit2 = 0, hit3 = 0, hit4 = 0;

   if (size_s == 0) {
     return 0;
   }

   // Batches of four probes while both sets still have data.
   for (; (off_s + 4 <= size_s) && (off_l < size_l);
        off_s += 4, off_l += hit4) {
     // All four probes are fetched before anything is stored to buffer.
     const uint16_t probe1 = small[off_s];
     const uint16_t probe2 = small[off_s + 1];
     const uint16_t probe3 = small[off_s + 2];
     const uint16_t probe4 = small[off_s + 3];
     binarySearch4(large + off_l, (int32_t)(size_l - off_l), probe1, probe2,
                   probe3, probe4, &hit1, &hit2, &hit3, &hit4);
     // A probe is common when its position is inside large and holds exactly
     // that value.
     if ((hit1 + off_l < size_l) && (large[off_l + hit1] == probe1)) {
       buffer[n_out++] = probe1;
     }
     if ((hit2 + off_l < size_l) && (large[off_l + hit2] == probe2)) {
       buffer[n_out++] = probe2;
     }
     if ((hit3 + off_l < size_l) && (large[off_l + hit3] == probe3)) {
       buffer[n_out++] = probe3;
     }
     if ((hit4 + off_l < size_l) && (large[off_l + hit4] == probe4)) {
       buffer[n_out++] = probe4;
     }
   }

   // At most three probes remain; take a pair if there is one.
   if ((off_s + 2 <= size_s) && (off_l < size_l)) {
     const uint16_t probe1 = small[off_s];
     const uint16_t probe2 = small[off_s + 1];
     binarySearch2(large + off_l, (int32_t)(size_l - off_l), probe1, probe2,
                   &hit1, &hit2);
     if ((hit1 + off_l < size_l) && (large[off_l + hit1] == probe1)) {
       buffer[n_out++] = probe1;
     }
     if ((hit2 + off_l < size_l) && (large[off_l + hit2] == probe2)) {
       buffer[n_out++] = probe2;
     }
     off_s += 2;
     off_l += hit2;
   }

   // Possibly one last probe; a non-negative result is taken as "found".
   if ((off_s < size_s) && (off_l < size_l)) {
     const uint16_t last = small[off_s];
     const int32_t found =
         binarySearch(large + off_l, (int32_t)(size_l - off_l), last);
     if (found >= 0) {
       buffer[n_out++] = last;
     }
   }
   return (int32_t)n_out;
 }


 #endif //USE_OLD_SKEW_INTERSECT


 // Counts the values shared by one small and one large set of uint16_t
 // without storing them. The merge below presumes both sets are sorted
 // ascending. Returns 0 when either set is empty.
 //
 // advanceUntil is defined elsewhere in this file; from its use here it is
 // expected to return an index into large that is at least the starting
 // index, or size_l when no suitable element exists - confirm against its
 // definition.
 //
 // TODO: this could be accelerated, possibly, by using binarySearch4 as above.
 int32_t intersect_skewed_uint16_cardinality(const uint16_t *small,
                                             size_t size_s,
                                             const uint16_t *large,
                                             size_t size_l) {
     size_t pos = 0, idx_l = 0, idx_s = 0;

     // An empty operand yields an empty intersection. Checking size_l as well
     // keeps the initial large[0] read below inside the array.
     if (0 == size_s || 0 == size_l) {
         return 0;
     }

     uint16_t val_l = large[idx_l], val_s = small[idx_s];

     while (true) {
         if (val_l < val_s) {
             // large is behind: skip ahead in large towards val_s
             idx_l = advanceUntil(large, (int32_t)idx_l, (int32_t)size_l, val_s);
             if (idx_l == size_l) break;
             val_l = large[idx_l];
         } else if (val_s < val_l) {
             // small is behind: step to its next value
             idx_s++;
             if (idx_s == size_s) break;
             val_s = small[idx_s];
         } else {
             // common value: count it, then move both sides forward
             pos++;
             idx_s++;
             if (idx_s == size_s) break;
             val_s = small[idx_s];
             idx_l = advanceUntil(large, (int32_t)idx_l, (int32_t)size_l, val_s);
             if (idx_l == size_l) break;
             val_l = large[idx_l];
         }
     }

     return (int32_t)pos;
 }

 // Tells whether one small and one large set of uint16_t share at least one
 // value, stopping at the first common value. The merge below presumes both
 // sets are sorted ascending. Returns false when either set is empty.
 //
 // advanceUntil is defined elsewhere in this file; from its use here it is
 // expected to return an index into large that is at least the starting
 // index, or size_l when no suitable element exists - confirm against its
 // definition.
 bool intersect_skewed_uint16_nonempty(const uint16_t *small, size_t size_s,
                                 const uint16_t *large, size_t size_l) {
     size_t idx_l = 0, idx_s = 0;

     // An empty operand cannot intersect anything. Checking size_l as well
     // keeps the initial large[0] read below inside the array.
     if (0 == size_s || 0 == size_l) {
         return false;
     }

     uint16_t val_l = large[idx_l], val_s = small[idx_s];

     while (true) {
         if (val_l < val_s) {
             // large is behind: skip ahead in large towards val_s
             idx_l = advanceUntil(large, (int32_t)idx_l, (int32_t)size_l, val_s);
             if (idx_l == size_l) break;
             val_l = large[idx_l];
         } else if (val_s < val_l) {
             // small is behind: step to its next value
             idx_s++;
             if (idx_s == size_s) break;
             val_s = small[idx_s];
         } else {
             return true;
         }
     }

     return false;
 }

 /**
  * Scalar intersection of two uint16_t arrays by a two-cursor merge walk.
  * The walk advances the side holding the smaller value, so it yields the
  * set intersection when both inputs are sorted ascending.
  * Common values are written to out; the return value is how many were
  * written (0 when either input is empty).
  */
 int32_t intersect_uint16(const uint16_t *A, const size_t lenA,
                          const uint16_t *B, const size_t lenB, uint16_t *out) {
     size_t pos_a = 0, pos_b = 0, written = 0;

     while (pos_a < lenA && pos_b < lenB) {
         const uint16_t a = A[pos_a];
         const uint16_t b = B[pos_b];
         if (a < b) {
             ++pos_a;
         } else if (a > b) {
             ++pos_b;
         } else {
             out[written++] = a;
             ++pos_a;
             ++pos_b;
         }
     }
     return (int32_t)written;
 }

 /**
  * Counts the values common to two uint16_t arrays by a two-cursor merge
  * walk, without storing them. The walk advances the side holding the
  * smaller value, so the count is the intersection size when both inputs
  * are sorted ascending. Returns 0 when either input is empty.
  */
 int32_t intersect_uint16_cardinality(const uint16_t *A, const size_t lenA,
                                      const uint16_t *B, const size_t lenB) {
     int32_t matches = 0;
     size_t pos_a = 0, pos_b = 0;

     while (pos_a < lenA && pos_b < lenB) {
         const uint16_t a = A[pos_a];
         const uint16_t b = B[pos_b];
         if (a == b) {
             ++matches;
             ++pos_a;
             ++pos_b;
         } else if (a < b) {
             ++pos_a;
         } else {
             ++pos_b;
         }
     }
     return matches;
 }


 /**
  * Tells whether two uint16_t arrays share at least one value, using a
  * two-cursor merge walk that stops at the first match. The walk advances
  * the side holding the smaller value, so the answer is exact when both
  * inputs are sorted ascending. Returns false when either input is empty.
  */
 bool intersect_uint16_nonempty(const uint16_t *A, const size_t lenA,
                          const uint16_t *B, const size_t lenB) {
     size_t pos_a = 0, pos_b = 0;

     while (pos_a < lenA && pos_b < lenB) {
         const uint16_t a = A[pos_a];
         const uint16_t b = B[pos_b];
         if (a == b) {
             return true;
         }
         if (a < b) {
             ++pos_a;
         } else {
             ++pos_b;
         }
     }
     return false;
 }


 /**
  * Scalar intersection of two uint32_t arrays by a two-cursor merge walk.
  * The walk advances the side holding the smaller value, so it yields the
  * set intersection when both inputs are sorted ascending.
  * Common values are written to out; the return value is how many were
  * written (0 when either input is empty).
  */
 size_t intersection_uint32(const uint32_t *A, const size_t lenA,
                            const uint32_t *B, const size_t lenB,
                            uint32_t *out) {
     if (lenA == 0 || lenB == 0) {
         return 0;
     }

     const uint32_t *const stopA = A + lenA;
     const uint32_t *const stopB = B + lenB;
     uint32_t *cursor = out;

     while (A != stopA && B != stopB) {
         const uint32_t a = *A;
         const uint32_t b = *B;
         if (a < b) {
             A++;
         } else if (b < a) {
             B++;
         } else {
             *cursor++ = a;
             A++;
             B++;
         }
     }
     return (size_t)(cursor - out);
 }

 /**
  * Counts the values common to two uint32_t arrays by a two-cursor merge
  * walk, without storing them. The walk advances the side holding the
  * smaller value, so the count is the intersection size when both inputs
  * are sorted ascending. Returns 0 when either input is empty.
  */
 size_t intersection_uint32_card(const uint32_t *A, const size_t lenA,
                                 const uint32_t *B, const size_t lenB) {
     size_t matches = 0;
     size_t pos_a = 0, pos_b = 0;

     while (pos_a < lenA && pos_b < lenB) {
         const uint32_t a = A[pos_a];
         const uint32_t b = B[pos_b];
         if (a < b) {
             ++pos_a;
         } else if (b < a) {
             ++pos_b;
         } else {
             ++matches;
             ++pos_a;
             ++pos_b;
         }
     }
     return matches;
 }

 // Scalar union of two uint16_t sets (a vectorized counterpart exists, see
 // union_vector16).
 //
 // Merges set_1 (size_1 values) and set_2 (size_2 values) into buffer,
 // writing a value present in both inputs only once, and returns the number
 // of values written. The merge presumes both inputs are sorted ascending.
 // When one input is empty the other is copied to buffer unchanged.
 size_t union_uint16(const uint16_t *set_1, size_t size_1, const uint16_t *set_2,
                     size_t size_2, uint16_t *buffer) {
     if (size_2 == 0) {
         memmove(buffer, set_1, size_1 * sizeof(uint16_t));
         return size_1;
     }
     if (size_1 == 0) {
         memmove(buffer, set_2, size_2 * sizeof(uint16_t));
         return size_2;
     }

     size_t written = 0, i = 0, j = 0;
     // current heads of both inputs, refreshed only when a side advances
     uint16_t head_1 = set_1[0];
     uint16_t head_2 = set_2[0];

     for (;;) {
         if (head_1 == head_2) {
             // shared value: emit once, advance both sides
             buffer[written++] = head_1;
             ++i;
             ++j;
             if (i >= size_1 || j >= size_2) {
                 break;
             }
             head_1 = set_1[i];
             head_2 = set_2[j];
         } else if (head_1 < head_2) {
             buffer[written++] = head_1;
             if (++i >= size_1) {
                 break;
             }
             head_1 = set_1[i];
         } else {
             buffer[written++] = head_2;
             if (++j >= size_2) {
                 break;
             }
             head_2 = set_2[j];
         }
     }

     // At most one input still has values; append them as they are.
     if (i < size_1) {
         const size_t rest = size_1 - i;
         memmove(buffer + written, set_1 + i, rest * sizeof(uint16_t));
         written += rest;
     } else if (j < size_2) {
         const size_t rest = size_2 - j;
         memmove(buffer + written, set_2 + j, rest * sizeof(uint16_t));
         written += rest;
     }

     return written;
 }

 // Scalar set difference: writes to a_out the values of a1 (length1 values)
 // that do not occur in a2 (length2 values) and returns how many were
 // written. The merge presumes both inputs are sorted ascending.
 // When a2 is empty, a1 is copied as is (skipped if a_out is a1 itself).
 int difference_uint16(const uint16_t *a1, int length1, const uint16_t *a2,
                       int length2, uint16_t *a_out) {
     if (length1 == 0) {
         return 0;
     }
     if (length2 == 0) {
         if (a1 != a_out) {
             memcpy(a_out, a1, sizeof(uint16_t) * length1);
         }
         return length1;
     }

     int kept = 0;
     int i = 0, j = 0;
     // current heads of both inputs, refreshed only when a side advances
     uint16_t head_1 = a1[0];
     uint16_t head_2 = a2[0];

     for (;;) {
         if (head_1 < head_2) {
             // not in a2: keep it
             a_out[kept++] = head_1;
             if (++i >= length1) {
                 return kept;
             }
             head_1 = a1[i];
             continue;
         }

         // From here a2 advances; on a match a1 advances as well and its
         // value is dropped.
         const bool matched = (head_1 == head_2);
         if (matched && ++i >= length1) {
             return kept;
         }
         if (++j >= length2) {
             break;
         }
         if (matched) {
             head_1 = a1[i];
         }
         head_2 = a2[j];
     }

     // a2 ran out first: whatever is left of a1 belongs to the result.
     memmove(a_out + kept, a1 + i, sizeof(uint16_t) * (length1 - i));
     return kept + length1 - i;
 }

 // Scalar symmetric difference: writes to out the values that occur in
 // exactly one of array_1 (card_1 values) and array_2 (card_2 values) and
 // returns how many were written. The merge presumes both inputs are sorted
 // ascending.
 int32_t xor_uint16(const uint16_t *array_1, int32_t card_1,
                    const uint16_t *array_2, int32_t card_2, uint16_t *out) {
     int32_t i = 0, j = 0, written = 0;

     while (i < card_1 && j < card_2) {
         const uint16_t x = array_1[i];
         const uint16_t y = array_2[j];
         if (x < y) {
             out[written++] = x;
             ++i;
         } else if (y < x) {
             out[written++] = y;
             ++j;
         } else {
             // present on both sides: drop it
             ++i;
             ++j;
         }
     }

     // At most one input still has values; all of them belong to the result.
     if (i < card_1) {
         const size_t rest = card_1 - i;
         memcpy(out + written, array_1 + i, rest * sizeof(uint16_t));
         written += (int32_t)rest;
     } else if (j < card_2) {
         const size_t rest = card_2 - j;
         memcpy(out + written, array_2 + j, rest * sizeof(uint16_t));
         written += (int32_t)rest;
     }
     return written;
 }

 #ifdef USESSE4

 /***
  * start of the SIMD 16-bit union code
  *
  */

 // Assuming that vInput1 and vInput2 are sorted, produces a sorted output going
 // from vecMin all the way to vecMax
 // developed originally for merge sort using SIMD instructions.
 // Standard merge. See, e.g., Inoue and Taura, SIMD- and Cache-Friendly
 // Algorithm for Sorting an Array of Structures
 //
 // vInput1, vInput2: the two input vectors of eight unsigned 16-bit lanes.
 // vecMin, vecMax:   outputs; per the description above, vecMin receives the
 //                   lower half of the merged sequence and vecMax the upper
 //                   half.
 //
 // The body is a fixed sequence of lane-wise unsigned min/max steps. Between
 // steps the "min" vector is rotated by 2 bytes (one 16-bit lane) with
 // _mm_alignr_epi8(x, x, 2), so each of its lanes is compared against a
 // different lane of the "max" vector in every round.
 static inline void sse_merge(const __m128i *vInput1,
                              const __m128i *vInput2,              // input 1 & 2
                              __m128i *vecMin, __m128i *vecMax) {  // output
     __m128i vecTmp;
     // initial lane-wise split into smaller and larger values
     vecTmp = _mm_min_epu16(*vInput1, *vInput2);
     *vecMax = _mm_max_epu16(*vInput1, *vInput2);
     // seven rotate-then-compare rounds follow
     vecTmp = _mm_alignr_epi8(vecTmp, vecTmp, 2);
     *vecMin = _mm_min_epu16(vecTmp, *vecMax);
     *vecMax = _mm_max_epu16(vecTmp, *vecMax);
     vecTmp = _mm_alignr_epi8(*vecMin, *vecMin, 2);
     *vecMin = _mm_min_epu16(vecTmp, *vecMax);
     *vecMax = _mm_max_epu16(vecTmp, *vecMax);
     vecTmp = _mm_alignr_epi8(*vecMin, *vecMin, 2);
     *vecMin = _mm_min_epu16(vecTmp, *vecMax);
     *vecMax = _mm_max_epu16(vecTmp, *vecMax);
     vecTmp = _mm_alignr_epi8(*vecMin, *vecMin, 2);
     *vecMin = _mm_min_epu16(vecTmp, *vecMax);
     *vecMax = _mm_max_epu16(vecTmp, *vecMax);
     vecTmp = _mm_alignr_epi8(*vecMin, *vecMin, 2);
     *vecMin = _mm_min_epu16(vecTmp, *vecMax);
     *vecMax = _mm_max_epu16(vecTmp, *vecMax);
     vecTmp = _mm_alignr_epi8(*vecMin, *vecMin, 2);
     *vecMin = _mm_min_epu16(vecTmp, *vecMax);
     *vecMax = _mm_max_epu16(vecTmp, *vecMax);
     vecTmp = _mm_alignr_epi8(*vecMin, *vecMin, 2);
     *vecMin = _mm_min_epu16(vecTmp, *vecMax);
     *vecMax = _mm_max_epu16(vecTmp, *vecMax);
     // eighth one-lane rotation: the min vector has now been rotated through
     // all eight lanes, i.e. a full turn
     *vecMin = _mm_alignr_epi8(*vecMin, *vecMin, 2);
 }

 // used by store_unique, generated by simdunion.py
 static uint8_t uniqshuf[] = {
     0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,
     0xc,  0xd,  0xe,  0xf,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,
     0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
     0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF,
     0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0x8,  0x9,
     0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,
     0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x1,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,
     0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,
     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
     0x4,  0x5,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF,
     0x2,  0x3,  0x4,  0x5,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0x8,  0x9,  0xa,  0xb,
     0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x8,  0x9,
     0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x1,  0x2,  0x3,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,
     0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,
     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x8,  0x9,
     0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,
     0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,
     0x6,  0x7,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x1,  0x4,  0x5,  0x6,  0x7,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,
     0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x6,  0x7,  0xa,  0xb,  0xc,  0xd,
     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
     0x6,  0x7,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
     0x2,  0x3,  0x6,  0x7,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x6,  0x7,  0xa,  0xb,  0xc,  0xd,
     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0xa,  0xb,
     0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,
     0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,  0xa,  0xb,  0xc,  0xd,
     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
     0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x4,  0x5,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0xa,  0xb,  0xc,  0xd,
     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0xa,  0xb,
     0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x1,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
     0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF,
     0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xc,  0xd,  0xe,  0xf,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,
     0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x6,  0x7,
     0x8,  0x9,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0x8,  0x9,  0xc,  0xd,  0xe,  0xf,
     0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,  0x8,  0x9,  0xc,  0xd,
     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x6,  0x7,
     0x8,  0x9,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x6,  0x7,  0x8,  0x9,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x8,  0x9,
     0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,
     0x8,  0x9,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x1,  0x4,  0x5,  0x8,  0x9,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x8,  0x9,  0xc,  0xd,  0xe,  0xf,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
     0x8,  0x9,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x2,  0x3,  0x8,  0x9,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x8,  0x9,  0xc,  0xd,  0xe,  0xf,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x8,  0x9,  0xc,  0xd,
     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0xc,  0xd,  0xe,  0xf,
     0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0xc,  0xd,
     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
     0x6,  0x7,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x4,  0x5,  0x6,  0x7,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0xc,  0xd,
     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,
     0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x1,  0x6,  0x7,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
     0x4,  0x5,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x2,  0x3,  0x4,  0x5,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0xc,  0xd,  0xe,  0xf,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0xc,  0xd,
     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x1,  0x2,  0x3,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0xc,  0xd,
     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,
     0x8,  0x9,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,
     0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x1,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xe,  0xf,
     0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,
     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
     0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
     0x2,  0x3,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,
     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0x8,  0x9,
     0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x8,  0x9,  0xa,  0xb,  0xe,  0xf,
     0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,  0x8,  0x9,  0xa,  0xb,
     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
     0x8,  0x9,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x4,  0x5,  0x8,  0x9,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x8,  0x9,  0xa,  0xb,
     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x8,  0x9,
     0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x1,  0x8,  0x9,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x8,  0x9,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
     0x4,  0x5,  0x6,  0x7,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
     0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0x6,  0x7,  0xa,  0xb,
     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x6,  0x7,
     0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,  0xa,  0xb,  0xe,  0xf,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x6,  0x7,
     0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x6,  0x7,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0xa,  0xb,
     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,
     0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x1,  0x4,  0x5,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
     0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x2,  0x3,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xa,  0xb,  0xe,  0xf,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xe,  0xf,
     0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,
     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
     0x6,  0x7,  0x8,  0x9,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0x8,  0x9,
     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,
     0x8,  0x9,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x1,  0x6,  0x7,  0x8,  0x9,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0x8,  0x9,  0xe,  0xf,  0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
     0x4,  0x5,  0x8,  0x9,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x2,  0x3,  0x4,  0x5,  0x8,  0x9,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0x8,  0x9,  0xe,  0xf,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x8,  0x9,
     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x1,  0x2,  0x3,  0x8,  0x9,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x8,  0x9,  0xe,  0xf,  0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x8,  0x9,
     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x8,  0x9,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,
     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,
     0x6,  0x7,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x1,  0x4,  0x5,  0x6,  0x7,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x6,  0x7,  0xe,  0xf,  0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
     0x6,  0x7,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x2,  0x3,  0x6,  0x7,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x6,  0x7,  0xe,  0xf,  0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0xe,  0xf,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,  0xe,  0xf,  0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x4,  0x5,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0xe,  0xf,  0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0xe,  0xf,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x1,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
     0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF,
     0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,
     0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x6,  0x7,
     0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,
     0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,
     0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x6,  0x7,
     0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x8,  0x9,
     0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,
     0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x1,  0x4,  0x5,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
     0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x2,  0x3,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x8,  0x9,  0xa,  0xb,
     0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0xa,  0xb,  0xc,  0xd,
     0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0xa,  0xb,
     0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
     0x6,  0x7,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x4,  0x5,  0x6,  0x7,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0xa,  0xb,
     0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,
     0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x1,  0x6,  0x7,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
     0x4,  0x5,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x2,  0x3,  0x4,  0x5,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0xa,  0xb,  0xc,  0xd,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0xa,  0xb,
     0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x1,  0x2,  0x3,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0xa,  0xb,
     0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,
     0x8,  0x9,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,
     0x6,  0x7,  0x8,  0x9,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x1,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xc,  0xd,  0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xc,  0xd,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
     0x6,  0x7,  0x8,  0x9,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x2,  0x3,  0x6,  0x7,  0x8,  0x9,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x6,  0x7,  0x8,  0x9,  0xc,  0xd,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0x8,  0x9,
     0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x8,  0x9,  0xc,  0xd,  0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,  0x8,  0x9,  0xc,  0xd,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
     0x8,  0x9,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x4,  0x5,  0x8,  0x9,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x8,  0x9,  0xc,  0xd,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x8,  0x9,
     0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x1,  0x8,  0x9,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x8,  0x9,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
     0x4,  0x5,  0x6,  0x7,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0x6,  0x7,  0xc,  0xd,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x6,  0x7,
     0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,  0xc,  0xd,  0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x6,  0x7,
     0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x6,  0x7,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0xc,  0xd,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,
     0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x1,  0x4,  0x5,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
     0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x2,  0x3,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xc,  0xd,  0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,
     0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,
     0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
     0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0x8,  0x9,
     0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,
     0x8,  0x9,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x1,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
     0x4,  0x5,  0x8,  0x9,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x2,  0x3,  0x4,  0x5,  0x8,  0x9,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0x8,  0x9,  0xa,  0xb,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x8,  0x9,
     0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x1,  0x2,  0x3,  0x8,  0x9,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x8,  0x9,  0xa,  0xb,  0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x8,  0x9,
     0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x8,  0x9,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,
     0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,
     0x6,  0x7,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x1,  0x4,  0x5,  0x6,  0x7,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x6,  0x7,  0xa,  0xb,  0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
     0x6,  0x7,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x2,  0x3,  0x6,  0x7,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x6,  0x7,  0xa,  0xb,  0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0xa,  0xb,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,  0xa,  0xb,  0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
     0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x4,  0x5,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0xa,  0xb,  0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0xa,  0xb,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x1,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
     0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x6,  0x7,
     0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,  0x8,  0x9,  0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x6,  0x7,
     0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x6,  0x7,  0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x8,  0x9,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,
     0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x1,  0x4,  0x5,  0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
     0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x2,  0x3,  0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x8,  0x9,  0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
     0x6,  0x7,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x4,  0x5,  0x6,  0x7,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x1,  0x6,  0x7,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
     0x4,  0x5,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x2,  0x3,  0x4,  0x5,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x1,  0x2,  0x3,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF};

 // Compacts `newval` into `output`, dropping every lane that repeats the value
 // just before it in the stream. `old` is the previously written vector and
 // supplies the predecessor of lane 0. Sixteen bytes are always stored; only
 // the returned number of leading values is meaningful.
 static inline int store_unique(__m128i old, __m128i newval, uint16_t *output) {
     // lane i holds the predecessor of newval's lane i:
     // [old[7], newval[0], ..., newval[6]]
     const __m128i predecessors = _mm_alignr_epi8(newval, old, 16 - 2);
     // lots of high latency instructions follow (optimize?)
     const __m128i is_repeat = _mm_cmpeq_epi16(predecessors, newval);
     // narrow the eight 16-bit compare results down to one mask bit per lane
     const __m128i narrowed = _mm_packs_epi16(is_repeat, _mm_setzero_si128());
     const int M = _mm_movemask_epi8(narrowed);
     // uniqshuf is indexed by that mask and provides the compaction shuffle
     const __m128i *shuffles = (const __m128i *)uniqshuf;
     const __m128i key = _mm_lddqu_si128(shuffles + M);
     _mm_storeu_si128((__m128i *)output, _mm_shuffle_epi8(newval, key));
     // every set mask bit is one dropped lane
     return 8 - _mm_popcnt_u32(M);
 }

 // Removes consecutive duplicates from out[0..len) in place (could be
 // avoided?).
 //
 // out: array to compact; for the result to be a set it must be sorted
 // len: number of valid elements in out
 // Returns the number of elements kept at the front of out. An empty input
 // yields 0 (it used to be reported as 1).
 static inline uint32_t unique(uint16_t *out, uint32_t len) {
     if (len == 0) {
         return 0;
     }
     // the first element is always kept, so writing starts at index 1
     uint32_t pos = 1;
     for (uint32_t i = 1; i < len; ++i) {
         if (out[i] != out[i - 1]) {
             out[pos++] = out[i];
         }
     }
     return pos;
 }

 // qsort() comparator ordering uint16_t elements ascending (could be avoided).
 // Returns the difference between the two pointed-to values.
 static int uint16_compare(const void *a, const void *b) {
     const uint16_t lhs = *(const uint16_t *)a;
     const uint16_t rhs = *(const uint16_t *)b;
     // where int is wider than 16 bits both operands promote to int, so the
     // subtraction cannot wrap
     return lhs - rhs;
 }

 // a one-pass SSE union algorithm
 // This function may not be safe if array1 == output or array2 == output.
 //
 // array1/length1, array2/length2: the two inputs (presumably sorted and
 //     duplicate-free, as the merge logic relies on ordering -- confirm
 //     against callers)
 // output: destination; store_unique always stores a whole 16-byte vector, so
 //     it needs room beyond the last meaningful value
 // Returns the number of values written to output.
 uint32_t union_vector16(const uint16_t *__restrict__ array1, uint32_t length1,
                         const uint16_t *__restrict__ array2, uint32_t length2,
                         uint16_t *__restrict__ output) {
     // the vector path needs at least one full 8-lane block from each input
     if ((length1 < 8) || (length2 < 8)) {
         return (uint32_t)union_uint16(array1, length1, array2, length2, output);
     }
     __m128i vA, vB, V, vecMin, vecMax;
     __m128i laststore;
     uint16_t *initoutput = output;
     // number of whole 8-element blocks in each input
     uint32_t len1 = length1 / 8;
     uint32_t len2 = length2 / 8;
     // block indices (not element indices) into array1 / array2
     uint32_t pos1 = 0;
     uint32_t pos2 = 0;
     // we start the machine
     vA = _mm_lddqu_si128((const __m128i *)array1 + pos1);
     pos1++;
     vB = _mm_lddqu_si128((const __m128i *)array2 + pos2);
     pos2++;
     // sse_merge is defined elsewhere; presumably it leaves the 8 smallest of
     // the 16 values in vecMin and the 8 largest in vecMax -- confirm
     sse_merge(&vA, &vB, &vecMin, &vecMax);
     // seed the "previously stored" vector with 0xFFFF in every lane
     // NOTE(review): presumably 0xFFFF can never be the smallest merged value
     // here, so the first lane is never dropped by mistake -- confirm
     laststore = _mm_set1_epi16(-1);
     output += store_unique(laststore, vecMin, output);
     laststore = vecMin;
     if ((pos1 < len1) && (pos2 < len2)) {
         uint16_t curA, curB;
         // first value of the next unread block of each input
         curA = array1[8 * pos1];
         curB = array2[8 * pos2];
         while (true) {
             // consume the block whose first value is smaller
             if (curA <= curB) {
                 V = _mm_lddqu_si128((const __m128i *)array1 + pos1);
                 pos1++;
                 if (pos1 < len1) {
                     curA = array1[8 * pos1];
                 } else {
                     // array1 has no whole block left; V is merged after the loop
                     break;
                 }
             } else {
                 V = _mm_lddqu_si128((const __m128i *)array2 + pos2);
                 pos2++;
                 if (pos2 < len2) {
                     curB = array2[8 * pos2];
                 } else {
                     // array2 has no whole block left; V is merged after the loop
                     break;
                 }
             }
             sse_merge(&V, &vecMax, &vecMin, &vecMax);
             output += store_unique(laststore, vecMin, output);
             laststore = vecMin;
         }
         // merge and emit the block that was loaded just before the break
         sse_merge(&V, &vecMax, &vecMin, &vecMax);
         output += store_unique(laststore, vecMin, output);
         laststore = vecMin;
     }
     // we finish the rest off using a scalar algorithm
     // could be improved?
     //
     // copy the small end on a tmp buffer
     // values emitted so far by the vector phase
     uint32_t len = (uint32_t)(output - initoutput);
     // room for the 8 lanes store_unique writes plus a tail of at most 7 values
     uint16_t buffer[16];
     // vecMax has not been emitted yet: deduplicate it against the last store
     uint32_t leftoversize = store_unique(laststore, vecMax, buffer);
     if (pos1 == len1) {
         // array1 has no whole block left: append its tail (< 8 values)
         memcpy(buffer + leftoversize, array1 + 8 * pos1,
                (length1 - 8 * len1) * sizeof(uint16_t));
         leftoversize += length1 - 8 * len1;
         // the appended tail may interleave with the vecMax values: sort, dedup
         qsort(buffer, leftoversize, sizeof(uint16_t), uint16_compare);

         leftoversize = unique(buffer, leftoversize);
         // scalar union of the buffer with whatever is unread in array2
         len += (uint32_t)union_uint16(buffer, leftoversize, array2 + 8 * pos2,
                                       length2 - 8 * pos2, output);
     } else {
         // otherwise array2 is the input that ran out of whole blocks
         memcpy(buffer + leftoversize, array2 + 8 * pos2,
                (length2 - 8 * len2) * sizeof(uint16_t));
         leftoversize += length2 - 8 * len2;
         qsort(buffer, leftoversize, sizeof(uint16_t), uint16_compare);
         leftoversize = unique(buffer, leftoversize);
         // scalar union of the buffer with whatever is unread in array1
         len += (uint32_t)union_uint16(buffer, leftoversize, array1 + 8 * pos1,
                                       length1 - 8 * pos1, output);
     }
     return len;
 }

 /**
  * End of the SIMD 16-bit union code
  *
  */

 /**
  * Start of SIMD 16-bit XOR code
  */

 // XOR flavour of the compacting store. The candidates are the last lane of
 // `old` followed by the first seven lanes of `newval`; a candidate is kept
 // only when it differs from both of its neighbours in the stream. The last
 // lane of `newval` is not emitted by this call. Sixteen bytes are always
 // stored; the return value is the number of meaningful leading values.
 static inline int store_unique_xor(__m128i old, __m128i newval,
                                    uint16_t *output) {
     // [old[6], old[7], newval[0], ..., newval[5]]: left neighbours
     const __m128i left = _mm_alignr_epi8(newval, old, 16 - 4);
     // [old[7], newval[0], ..., newval[6]]: the candidates themselves
     const __m128i candidates = _mm_alignr_epi8(newval, old, 16 - 2);
     const __m128i same_as_left = _mm_cmpeq_epi16(candidates, left);
     // newval itself provides the right neighbour of every candidate
     const __m128i same_as_right = _mm_cmpeq_epi16(candidates, newval);
     const __m128i paired = _mm_or_si128(same_as_left, same_as_right);
     // one mask bit per candidate lane that belongs to an equal pair
     const __m128i narrowed = _mm_packs_epi16(paired, _mm_setzero_si128());
     const int M = _mm_movemask_epi8(narrowed);
     const __m128i *shuffles = (const __m128i *)uniqshuf;
     const __m128i key = _mm_lddqu_si128(shuffles + M);
     _mm_storeu_si128((__m128i *)output, _mm_shuffle_epi8(candidates, key));
     return 8 - _mm_popcnt_u32(M);
 }

 // In-place XOR-style deduplication (could be avoided?): a value that occurs
 // twice in a row is removed entirely, a value that occurs once is kept.
 //
 // out: array to compact; equal values must be adjacent and occur at most
 //      twice (a third occurrence would make pos underflow)
 // len: number of valid elements in out
 // Returns the number of elements kept at the front of out. An empty input
 // yields 0 (it used to be reported as 1).
 static inline uint32_t unique_xor(uint16_t *out, uint32_t len) {
     if (len == 0) {
         return 0;
     }
     uint32_t pos = 1;
     for (uint32_t i = 1; i < len; ++i) {
         if (out[i] != out[i - 1]) {
             out[pos++] = out[i];
         } else {
             pos--;  // if it is identical to previous, delete it
         }
     }
     return pos;
 }

 // a one-pass SSE xor algorithm
 //
 // array1/length1, array2/length2: the two inputs (presumably sorted and
 //     duplicate-free, as the merge logic relies on ordering -- confirm
 //     against callers)
 // output: destination; store_unique_xor always stores a whole 16-byte
 //     vector, so it needs room beyond the last meaningful value
 // Returns the number of values written to output.
 uint32_t xor_vector16(const uint16_t *__restrict__ array1, uint32_t length1,
                       const uint16_t *__restrict__ array2, uint32_t length2,
                       uint16_t *__restrict__ output) {
     // the vector path needs at least one full 8-lane block from each input
     if ((length1 < 8) || (length2 < 8)) {
         return xor_uint16(array1, length1, array2, length2, output);
     }
     __m128i vA, vB, V, vecMin, vecMax;
     __m128i laststore;
     uint16_t *initoutput = output;
     // number of whole 8-element blocks in each input
     uint32_t len1 = length1 / 8;
     uint32_t len2 = length2 / 8;
     // block indices (not element indices) into array1 / array2
     uint32_t pos1 = 0;
     uint32_t pos2 = 0;
     // we start the machine
     vA = _mm_lddqu_si128((const __m128i *)array1 + pos1);
     pos1++;
     vB = _mm_lddqu_si128((const __m128i *)array2 + pos2);
     pos2++;
     // sse_merge is defined elsewhere; presumably it leaves the 8 smallest of
     // the 16 values in vecMin and the 8 largest in vecMax -- confirm
     sse_merge(&vA, &vB, &vecMin, &vecMax);
     // seed the "previously stored" vector with 0xFFFF in every lane
     laststore = _mm_set1_epi16(-1);
     // scratch space for the scalar tail: up to 8 lanes from store_unique_xor,
     // one extra value (vec7) and a tail of at most 7 input values
     uint16_t buffer[17];
     output += store_unique_xor(laststore, vecMin, output);

     laststore = vecMin;
     if ((pos1 < len1) && (pos2 < len2)) {
         uint16_t curA, curB;
         // first value of the next unread block of each input
         curA = array1[8 * pos1];
         curB = array2[8 * pos2];
         while (true) {
             // consume the block whose first value is smaller
             if (curA <= curB) {
                 V = _mm_lddqu_si128((const __m128i *)array1 + pos1);
                 pos1++;
                 if (pos1 < len1) {
                     curA = array1[8 * pos1];
                 } else {
                     // array1 has no whole block left; V is merged after the loop
                     break;
                 }
             } else {
                 V = _mm_lddqu_si128((const __m128i *)array2 + pos2);
                 pos2++;
                 if (pos2 < len2) {
                     curB = array2[8 * pos2];
                 } else {
                     // array2 has no whole block left; V is merged after the loop
                     break;
                 }
             }
             sse_merge(&V, &vecMax, &vecMin, &vecMax);
             // conditionally stores the last value of laststore as well as all
             // but the
             // last value of vecMin
             output += store_unique_xor(laststore, vecMin, output);
             laststore = vecMin;
         }
         // merge and emit the block that was loaded just before the break
         sse_merge(&V, &vecMax, &vecMin, &vecMax);
         // conditionally stores the last value of laststore as well as all but
         // the
         // last value of vecMin
         output += store_unique_xor(laststore, vecMin, output);
         laststore = vecMin;
     }
     // values emitted so far by the vector phase
     uint32_t len = (uint32_t)(output - initoutput);

     // we finish the rest off using a scalar algorithm
     // could be improved?
     // conditionally stores the last value of laststore as well as all but the
     // last value of vecMax,
     // we store to "buffer"
     int leftoversize = store_unique_xor(laststore, vecMax, buffer);
     // the last lane of vecMax was not considered by the store above: keep it
     // unless it repeats the lane before it
     uint16_t vec7 = _mm_extract_epi16(vecMax, 7);
     uint16_t vec6 = _mm_extract_epi16(vecMax, 6);
     if (vec7 != vec6) buffer[leftoversize++] = vec7;
     if (pos1 == len1) {
         // array1 has no whole block left: append its tail (< 8 values)
         memcpy(buffer + leftoversize, array1 + 8 * pos1,
                (length1 - 8 * len1) * sizeof(uint16_t));
         leftoversize += length1 - 8 * len1;
         if (leftoversize == 0) {  // trivial case
             // nothing pending: the unread part of array2 is copied verbatim
             memcpy(output, array2 + 8 * pos2,
                    (length2 - 8 * pos2) * sizeof(uint16_t));
             len += (length2 - 8 * pos2);
         } else {
             // sort the pending values, cancel equal pairs, then finish with
             // a scalar xor against the unread part of array2
             qsort(buffer, leftoversize, sizeof(uint16_t), uint16_compare);
             leftoversize = unique_xor(buffer, leftoversize);
             len += xor_uint16(buffer, leftoversize, array2 + 8 * pos2,
                               length2 - 8 * pos2, output);
         }
     } else {
         // otherwise array2 is the input that ran out of whole blocks
         memcpy(buffer + leftoversize, array2 + 8 * pos2,
                (length2 - 8 * len2) * sizeof(uint16_t));
         leftoversize += length2 - 8 * len2;
         if (leftoversize == 0) {  // trivial case
             // nothing pending: the unread part of array1 is copied verbatim
             memcpy(output, array1 + 8 * pos1,
                    (length1 - 8 * pos1) * sizeof(uint16_t));
             len += (length1 - 8 * pos1);
         } else {
             // sort the pending values, cancel equal pairs, then finish with
             // a scalar xor against the unread part of array1
             qsort(buffer, leftoversize, sizeof(uint16_t), uint16_compare);
             leftoversize = unique_xor(buffer, leftoversize);
             len += xor_uint16(buffer, leftoversize, array1 + 8 * pos1,
                               length1 - 8 * pos1, output);
         }
     }
     return len;
 }

 /**
  * End of SIMD 16-bit XOR code
  */

 #endif  // USESSE4

 // Merges two ascending uint32_t arrays into `buffer`, writing a value found
 // in both inputs only once. Returns the number of values written; `buffer`
 // must have room for up to size_1 + size_2 values.
 size_t union_uint32(const uint32_t *set_1, size_t size_1, const uint32_t *set_2,
                     size_t size_2, uint32_t *buffer) {
     // with one side empty the union is a plain copy of the other side
     if (size_2 == 0) {
         memmove(buffer, set_1, size_1 * sizeof(uint32_t));
         return size_1;
     }
     if (size_1 == 0) {
         memmove(buffer, set_2, size_2 * sizeof(uint32_t));
         return size_2;
     }

     size_t i = 0;        // read cursor in set_1
     size_t j = 0;        // read cursor in set_2
     size_t written = 0;  // write cursor in buffer
     uint32_t a = set_1[0];
     uint32_t b = set_2[0];

     // a and b cache the current head of each input; a head is reloaded only
     // after its cursor has advanced
     for (;;) {
         if (a < b) {
             buffer[written++] = a;
             if (++i >= size_1) {
                 break;
             }
             a = set_1[i];
         } else if (b < a) {
             buffer[written++] = b;
             if (++j >= size_2) {
                 break;
             }
             b = set_2[j];
         } else {
             // common value: emit once and advance both inputs
             buffer[written++] = a;
             ++i;
             ++j;
             if (i >= size_1 || j >= size_2) {
                 break;
             }
             a = set_1[i];
             b = set_2[j];
         }
     }

     // at most one input still has unread values; append them unchanged
     if (i < size_1) {
         const size_t remaining = size_1 - i;
         memmove(buffer + written, set_1 + i, remaining * sizeof(uint32_t));
         written += remaining;
     } else if (j < size_2) {
         const size_t remaining = size_2 - j;
         memmove(buffer + written, set_2 + j, remaining * sizeof(uint32_t));
         written += remaining;
     }

     return written;
 }

 // Counts the values a merge of two ascending uint32_t arrays would produce,
 // counting a value present in both inputs once. Nothing is written.
 size_t union_uint32_card(const uint32_t *set_1, size_t size_1,
                          const uint32_t *set_2, size_t size_2) {
     // with one side empty the result is the size of the other side
     if (size_2 == 0) {
         return size_1;
     }
     if (size_1 == 0) {
         return size_2;
     }

     size_t i = 0;         // read cursor in set_1
     size_t j = 0;         // read cursor in set_2
     size_t distinct = 0;  // values counted so far
     uint32_t a = set_1[0];
     uint32_t b = set_2[0];

     for (;;) {
         // every step accounts for exactly one output value
         ++distinct;
         if (a < b) {
             if (++i >= size_1) {
                 break;
             }
             a = set_1[i];
         } else if (b < a) {
             if (++j >= size_2) {
                 break;
             }
             b = set_2[j];
         } else {
             ++i;
             ++j;
             if (i >= size_1 || j >= size_2) {
                 break;
             }
             a = set_1[i];
             b = set_2[j];
         }
     }

     // whatever is left unread in one input counts in full
     if (i < size_1) {
         distinct += size_1 - i;
     } else if (j < size_2) {
         distinct += size_2 - j;
     }
     return distinct;
 }


 // Union of two uint16_t arrays written to `buffer`, dispatching to
 // union_vector16 when ROARING_VECTOR_OPERATIONS_ENABLED is defined and to
 // union_uint16 otherwise. The shorter input is always passed first; on equal
 // sizes set_2 goes first. Returns the value reported by the routine used.
 size_t fast_union_uint16(const uint16_t *set_1, size_t size_1, const uint16_t *set_2,
                     size_t size_2, uint16_t *buffer) {
     // compute union with smallest array first
     const uint16_t *shorter = set_2;
     const uint16_t *longer = set_1;
     size_t shorter_len = size_2;
     size_t longer_len = size_1;
     if (size_1 < size_2) {
         shorter = set_1;
         shorter_len = size_1;
         longer = set_2;
         longer_len = size_2;
     }
 #ifdef ROARING_VECTOR_OPERATIONS_ENABLED
     return union_vector16(shorter, (uint32_t)shorter_len,
                           longer, (uint32_t)longer_len, buffer);
 #else
     return union_uint16(shorter, shorter_len, longer, longer_len, buffer);
 #endif
 }

 /*
  * Returns true when the first n bytes at s1 and s2 are identical.
  * n == 0 always compares equal; the pointers are not dereferenced then.
  *
  * With USEAVX the buffers are compared in 32-byte, then 8-byte, then
  * single-byte steps; otherwise the work is delegated to memcmp().
  */
 bool memequals(const void *s1, const void *s2, size_t n) {
     if (n == 0) {
         return true;
     }
 #ifdef USEAVX
     const uint8_t *ptr1 = (const uint8_t *)s1;
     const uint8_t *ptr2 = (const uint8_t *)s2;
     const uint8_t *end1 = ptr1 + n;
     const uint8_t *end8 = ptr1 + n/8*8;
     const uint8_t *end32 = ptr1 + n/32*32;

     // 32 bytes at a time; unaligned loads are explicitly allowed by loadu.
     while (ptr1 < end32) {
         __m256i r1 = _mm256_loadu_si256((const __m256i*)ptr1);
         __m256i r2 = _mm256_loadu_si256((const __m256i*)ptr2);
         int mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(r1, r2));
         // one mask bit per byte lane: all 32 set means all bytes equal
         if ((uint32_t)mask != UINT32_MAX) {
             return false;
         }
         ptr1 += 32;
         ptr2 += 32;
     }

     // 8 bytes at a time. The words are fetched with memcpy rather than
     // through a uint64_t* cast: s1/s2 carry no alignment or effective-type
     // guarantee, so a direct 64-bit dereference would be undefined behavior.
     while (ptr1 < end8) {
         uint64_t v1, v2;
         memcpy(&v1, ptr1, sizeof(v1));
         memcpy(&v2, ptr2, sizeof(v2));
         if (v1 != v2) {
             return false;
         }
         ptr1 += 8;
         ptr2 += 8;
     }

     // trailing bytes, one at a time
     while (ptr1 < end1) {
         if (*ptr1 != *ptr2) {
             return false;
         }
         ptr1++;
         ptr2++;
     }

     return true;
 #else
     return memcmp(s1, s2, n) == 0;
 #endif
 }
 /* end file src/array_util.c */
 /* begin file src/bitset_util.c */
 #include <assert.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>


 #ifdef IS_X64
 // lengthTable[b] is the number of bits set in the byte value b (its
 // population count), for every b in 0..255.
 static uint8_t lengthTable[256] = {
     0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4,
     2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4,
     2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6,
     4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5,
     3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6,
     4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
     4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8};
 #endif

 #ifdef USEAVX
 // Byte-to-positions lookup table: row b lists, in increasing order, the
 // 1-based positions of the bits set in the byte value b (bit 0 -> 1, ...,
 // bit 7 -> 8), padded with zeros up to 8 entries. Each row is 8 x 32 bits,
 // i.e. 32 bytes, matching the ALIGNED(32) on the table.
 ALIGNED(32)
 static uint32_t vecDecodeTable[256][8] = {
     {0, 0, 0, 0, 0, 0, 0, 0}, /* 0x00 (00000000) */
     {1, 0, 0, 0, 0, 0, 0, 0}, /* 0x01 (00000001) */
     {2, 0, 0, 0, 0, 0, 0, 0}, /* 0x02 (00000010) */
     {1, 2, 0, 0, 0, 0, 0, 0}, /* 0x03 (00000011) */
     {3, 0, 0, 0, 0, 0, 0, 0}, /* 0x04 (00000100) */
     {1, 3, 0, 0, 0, 0, 0, 0}, /* 0x05 (00000101) */
     {2, 3, 0, 0, 0, 0, 0, 0}, /* 0x06 (00000110) */
     {1, 2, 3, 0, 0, 0, 0, 0}, /* 0x07 (00000111) */
     {4, 0, 0, 0, 0, 0, 0, 0}, /* 0x08 (00001000) */
     {1, 4, 0, 0, 0, 0, 0, 0}, /* 0x09 (00001001) */
     {2, 4, 0, 0, 0, 0, 0, 0}, /* 0x0A (00001010) */
     {1, 2, 4, 0, 0, 0, 0, 0}, /* 0x0B (00001011) */
     {3, 4, 0, 0, 0, 0, 0, 0}, /* 0x0C (00001100) */
     {1, 3, 4, 0, 0, 0, 0, 0}, /* 0x0D (00001101) */
     {2, 3, 4, 0, 0, 0, 0, 0}, /* 0x0E (00001110) */
     {1, 2, 3, 4, 0, 0, 0, 0}, /* 0x0F (00001111) */
     {5, 0, 0, 0, 0, 0, 0, 0}, /* 0x10 (00010000) */
     {1, 5, 0, 0, 0, 0, 0, 0}, /* 0x11 (00010001) */
     {2, 5, 0, 0, 0, 0, 0, 0}, /* 0x12 (00010010) */
     {1, 2, 5, 0, 0, 0, 0, 0}, /* 0x13 (00010011) */
     {3, 5, 0, 0, 0, 0, 0, 0}, /* 0x14 (00010100) */
     {1, 3, 5, 0, 0, 0, 0, 0}, /* 0x15 (00010101) */
     {2, 3, 5, 0, 0, 0, 0, 0}, /* 0x16 (00010110) */
     {1, 2, 3, 5, 0, 0, 0, 0}, /* 0x17 (00010111) */
     {4, 5, 0, 0, 0, 0, 0, 0}, /* 0x18 (00011000) */
     {1, 4, 5, 0, 0, 0, 0, 0}, /* 0x19 (00011001) */
     {2, 4, 5, 0, 0, 0, 0, 0}, /* 0x1A (00011010) */
     {1, 2, 4, 5, 0, 0, 0, 0}, /* 0x1B (00011011) */
     {3, 4, 5, 0, 0, 0, 0, 0}, /* 0x1C (00011100) */
     {1, 3, 4, 5, 0, 0, 0, 0}, /* 0x1D (00011101) */
     {2, 3, 4, 5, 0, 0, 0, 0}, /* 0x1E (00011110) */
     {1, 2, 3, 4, 5, 0, 0, 0}, /* 0x1F (00011111) */
     {6, 0, 0, 0, 0, 0, 0, 0}, /* 0x20 (00100000) */
     {1, 6, 0, 0, 0, 0, 0, 0}, /* 0x21 (00100001) */
     {2, 6, 0, 0, 0, 0, 0, 0}, /* 0x22 (00100010) */
     {1, 2, 6, 0, 0, 0, 0, 0}, /* 0x23 (00100011) */
     {3, 6, 0, 0, 0, 0, 0, 0}, /* 0x24 (00100100) */
     {1, 3, 6, 0, 0, 0, 0, 0}, /* 0x25 (00100101) */
     {2, 3, 6, 0, 0, 0, 0, 0}, /* 0x26 (00100110) */
     {1, 2, 3, 6, 0, 0, 0, 0}, /* 0x27 (00100111) */
     {4, 6, 0, 0, 0, 0, 0, 0}, /* 0x28 (00101000) */
     {1, 4, 6, 0, 0, 0, 0, 0}, /* 0x29 (00101001) */
     {2, 4, 6, 0, 0, 0, 0, 0}, /* 0x2A (00101010) */
     {1, 2, 4, 6, 0, 0, 0, 0}, /* 0x2B (00101011) */
     {3, 4, 6, 0, 0, 0, 0, 0}, /* 0x2C (00101100) */
     {1, 3, 4, 6, 0, 0, 0, 0}, /* 0x2D (00101101) */
     {2, 3, 4, 6, 0, 0, 0, 0}, /* 0x2E (00101110) */
     {1, 2, 3, 4, 6, 0, 0, 0}, /* 0x2F (00101111) */
     {5, 6, 0, 0, 0, 0, 0, 0}, /* 0x30 (00110000) */
     {1, 5, 6, 0, 0, 0, 0, 0}, /* 0x31 (00110001) */
     {2, 5, 6, 0, 0, 0, 0, 0}, /* 0x32 (00110010) */
     {1, 2, 5, 6, 0, 0, 0, 0}, /* 0x33 (00110011) */
     {3, 5, 6, 0, 0, 0, 0, 0}, /* 0x34 (00110100) */
     {1, 3, 5, 6, 0, 0, 0, 0}, /* 0x35 (00110101) */
     {2, 3, 5, 6, 0, 0, 0, 0}, /* 0x36 (00110110) */
     {1, 2, 3, 5, 6, 0, 0, 0}, /* 0x37 (00110111) */
     {4, 5, 6, 0, 0, 0, 0, 0}, /* 0x38 (00111000) */
     {1, 4, 5, 6, 0, 0, 0, 0}, /* 0x39 (00111001) */
     {2, 4, 5, 6, 0, 0, 0, 0}, /* 0x3A (00111010) */
     {1, 2, 4, 5, 6, 0, 0, 0}, /* 0x3B (00111011) */
     {3, 4, 5, 6, 0, 0, 0, 0}, /* 0x3C (00111100) */
     {1, 3, 4, 5, 6, 0, 0, 0}, /* 0x3D (00111101) */
     {2, 3, 4, 5, 6, 0, 0, 0}, /* 0x3E (00111110) */
     {1, 2, 3, 4, 5, 6, 0, 0}, /* 0x3F (00111111) */
     {7, 0, 0, 0, 0, 0, 0, 0}, /* 0x40 (01000000) */
     {1, 7, 0, 0, 0, 0, 0, 0}, /* 0x41 (01000001) */
     {2, 7, 0, 0, 0, 0, 0, 0}, /* 0x42 (01000010) */
     {1, 2, 7, 0, 0, 0, 0, 0}, /* 0x43 (01000011) */
     {3, 7, 0, 0, 0, 0, 0, 0}, /* 0x44 (01000100) */
     {1, 3, 7, 0, 0, 0, 0, 0}, /* 0x45 (01000101) */
     {2, 3, 7, 0, 0, 0, 0, 0}, /* 0x46 (01000110) */
     {1, 2, 3, 7, 0, 0, 0, 0}, /* 0x47 (01000111) */
     {4, 7, 0, 0, 0, 0, 0, 0}, /* 0x48 (01001000) */
     {1, 4, 7, 0, 0, 0, 0, 0}, /* 0x49 (01001001) */
     {2, 4, 7, 0, 0, 0, 0, 0}, /* 0x4A (01001010) */
     {1, 2, 4, 7, 0, 0, 0, 0}, /* 0x4B (01001011) */
     {3, 4, 7, 0, 0, 0, 0, 0}, /* 0x4C (01001100) */
     {1, 3, 4, 7, 0, 0, 0, 0}, /* 0x4D (01001101) */
     {2, 3, 4, 7, 0, 0, 0, 0}, /* 0x4E (01001110) */
     {1, 2, 3, 4, 7, 0, 0, 0}, /* 0x4F (01001111) */
     {5, 7, 0, 0, 0, 0, 0, 0}, /* 0x50 (01010000) */
     {1, 5, 7, 0, 0, 0, 0, 0}, /* 0x51 (01010001) */
     {2, 5, 7, 0, 0, 0, 0, 0}, /* 0x52 (01010010) */
     {1, 2, 5, 7, 0, 0, 0, 0}, /* 0x53 (01010011) */
     {3, 5, 7, 0, 0, 0, 0, 0}, /* 0x54 (01010100) */
     {1, 3, 5, 7, 0, 0, 0, 0}, /* 0x55 (01010101) */
     {2, 3, 5, 7, 0, 0, 0, 0}, /* 0x56 (01010110) */
     {1, 2, 3, 5, 7, 0, 0, 0}, /* 0x57 (01010111) */
     {4, 5, 7, 0, 0, 0, 0, 0}, /* 0x58 (01011000) */
     {1, 4, 5, 7, 0, 0, 0, 0}, /* 0x59 (01011001) */
     {2, 4, 5, 7, 0, 0, 0, 0}, /* 0x5A (01011010) */
     {1, 2, 4, 5, 7, 0, 0, 0}, /* 0x5B (01011011) */
     {3, 4, 5, 7, 0, 0, 0, 0}, /* 0x5C (01011100) */
     {1, 3, 4, 5, 7, 0, 0, 0}, /* 0x5D (01011101) */
     {2, 3, 4, 5, 7, 0, 0, 0}, /* 0x5E (01011110) */
     {1, 2, 3, 4, 5, 7, 0, 0}, /* 0x5F (01011111) */
     {6, 7, 0, 0, 0, 0, 0, 0}, /* 0x60 (01100000) */
     {1, 6, 7, 0, 0, 0, 0, 0}, /* 0x61 (01100001) */
     {2, 6, 7, 0, 0, 0, 0, 0}, /* 0x62 (01100010) */
     {1, 2, 6, 7, 0, 0, 0, 0}, /* 0x63 (01100011) */
     {3, 6, 7, 0, 0, 0, 0, 0}, /* 0x64 (01100100) */
     {1, 3, 6, 7, 0, 0, 0, 0}, /* 0x65 (01100101) */
     {2, 3, 6, 7, 0, 0, 0, 0}, /* 0x66 (01100110) */
     {1, 2, 3, 6, 7, 0, 0, 0}, /* 0x67 (01100111) */
     {4, 6, 7, 0, 0, 0, 0, 0}, /* 0x68 (01101000) */
     {1, 4, 6, 7, 0, 0, 0, 0}, /* 0x69 (01101001) */
     {2, 4, 6, 7, 0, 0, 0, 0}, /* 0x6A (01101010) */
     {1, 2, 4, 6, 7, 0, 0, 0}, /* 0x6B (01101011) */
     {3, 4, 6, 7, 0, 0, 0, 0}, /* 0x6C (01101100) */
     {1, 3, 4, 6, 7, 0, 0, 0}, /* 0x6D (01101101) */
     {2, 3, 4, 6, 7, 0, 0, 0}, /* 0x6E (01101110) */
     {1, 2, 3, 4, 6, 7, 0, 0}, /* 0x6F (01101111) */
     {5, 6, 7, 0, 0, 0, 0, 0}, /* 0x70 (01110000) */
     {1, 5, 6, 7, 0, 0, 0, 0}, /* 0x71 (01110001) */
     {2, 5, 6, 7, 0, 0, 0, 0}, /* 0x72 (01110010) */
     {1, 2, 5, 6, 7, 0, 0, 0}, /* 0x73 (01110011) */
     {3, 5, 6, 7, 0, 0, 0, 0}, /* 0x74 (01110100) */
     {1, 3, 5, 6, 7, 0, 0, 0}, /* 0x75 (01110101) */
     {2, 3, 5, 6, 7, 0, 0, 0}, /* 0x76 (01110110) */
     {1, 2, 3, 5, 6, 7, 0, 0}, /* 0x77 (01110111) */
     {4, 5, 6, 7, 0, 0, 0, 0}, /* 0x78 (01111000) */
     {1, 4, 5, 6, 7, 0, 0, 0}, /* 0x79 (01111001) */
     {2, 4, 5, 6, 7, 0, 0, 0}, /* 0x7A (01111010) */
     {1, 2, 4, 5, 6, 7, 0, 0}, /* 0x7B (01111011) */
     {3, 4, 5, 6, 7, 0, 0, 0}, /* 0x7C (01111100) */
     {1, 3, 4, 5, 6, 7, 0, 0}, /* 0x7D (01111101) */
     {2, 3, 4, 5, 6, 7, 0, 0}, /* 0x7E (01111110) */
     {1, 2, 3, 4, 5, 6, 7, 0}, /* 0x7F (01111111) */
     {8, 0, 0, 0, 0, 0, 0, 0}, /* 0x80 (10000000) */
     {1, 8, 0, 0, 0, 0, 0, 0}, /* 0x81 (10000001) */
     {2, 8, 0, 0, 0, 0, 0, 0}, /* 0x82 (10000010) */
     {1, 2, 8, 0, 0, 0, 0, 0}, /* 0x83 (10000011) */
     {3, 8, 0, 0, 0, 0, 0, 0}, /* 0x84 (10000100) */
     {1, 3, 8, 0, 0, 0, 0, 0}, /* 0x85 (10000101) */
     {2, 3, 8, 0, 0, 0, 0, 0}, /* 0x86 (10000110) */
     {1, 2, 3, 8, 0, 0, 0, 0}, /* 0x87 (10000111) */
     {4, 8, 0, 0, 0, 0, 0, 0}, /* 0x88 (10001000) */
     {1, 4, 8, 0, 0, 0, 0, 0}, /* 0x89 (10001001) */
     {2, 4, 8, 0, 0, 0, 0, 0}, /* 0x8A (10001010) */
     {1, 2, 4, 8, 0, 0, 0, 0}, /* 0x8B (10001011) */
     {3, 4, 8, 0, 0, 0, 0, 0}, /* 0x8C (10001100) */
     {1, 3, 4, 8, 0, 0, 0, 0}, /* 0x8D (10001101) */
     {2, 3, 4, 8, 0, 0, 0, 0}, /* 0x8E (10001110) */
     {1, 2, 3, 4, 8, 0, 0, 0}, /* 0x8F (10001111) */
     {5, 8, 0, 0, 0, 0, 0, 0}, /* 0x90 (10010000) */
     {1, 5, 8, 0, 0, 0, 0, 0}, /* 0x91 (10010001) */
     {2, 5, 8, 0, 0, 0, 0, 0}, /* 0x92 (10010010) */
     {1, 2, 5, 8, 0, 0, 0, 0}, /* 0x93 (10010011) */
     {3, 5, 8, 0, 0, 0, 0, 0}, /* 0x94 (10010100) */
     {1, 3, 5, 8, 0, 0, 0, 0}, /* 0x95 (10010101) */
     {2, 3, 5, 8, 0, 0, 0, 0}, /* 0x96 (10010110) */
     {1, 2, 3, 5, 8, 0, 0, 0}, /* 0x97 (10010111) */
     {4, 5, 8, 0, 0, 0, 0, 0}, /* 0x98 (10011000) */
     {1, 4, 5, 8, 0, 0, 0, 0}, /* 0x99 (10011001) */
     {2, 4, 5, 8, 0, 0, 0, 0}, /* 0x9A (10011010) */
     {1, 2, 4, 5, 8, 0, 0, 0}, /* 0x9B (10011011) */
     {3, 4, 5, 8, 0, 0, 0, 0}, /* 0x9C (10011100) */
     {1, 3, 4, 5, 8, 0, 0, 0}, /* 0x9D (10011101) */
     {2, 3, 4, 5, 8, 0, 0, 0}, /* 0x9E (10011110) */
     {1, 2, 3, 4, 5, 8, 0, 0}, /* 0x9F (10011111) */
     {6, 8, 0, 0, 0, 0, 0, 0}, /* 0xA0 (10100000) */
     {1, 6, 8, 0, 0, 0, 0, 0}, /* 0xA1 (10100001) */
     {2, 6, 8, 0, 0, 0, 0, 0}, /* 0xA2 (10100010) */
     {1, 2, 6, 8, 0, 0, 0, 0}, /* 0xA3 (10100011) */
     {3, 6, 8, 0, 0, 0, 0, 0}, /* 0xA4 (10100100) */
     {1, 3, 6, 8, 0, 0, 0, 0}, /* 0xA5 (10100101) */
     {2, 3, 6, 8, 0, 0, 0, 0}, /* 0xA6 (10100110) */
     {1, 2, 3, 6, 8, 0, 0, 0}, /* 0xA7 (10100111) */
     {4, 6, 8, 0, 0, 0, 0, 0}, /* 0xA8 (10101000) */
     {1, 4, 6, 8, 0, 0, 0, 0}, /* 0xA9 (10101001) */
     {2, 4, 6, 8, 0, 0, 0, 0}, /* 0xAA (10101010) */
     {1, 2, 4, 6, 8, 0, 0, 0}, /* 0xAB (10101011) */
     {3, 4, 6, 8, 0, 0, 0, 0}, /* 0xAC (10101100) */
     {1, 3, 4, 6, 8, 0, 0, 0}, /* 0xAD (10101101) */
     {2, 3, 4, 6, 8, 0, 0, 0}, /* 0xAE (10101110) */
     {1, 2, 3, 4, 6, 8, 0, 0}, /* 0xAF (10101111) */
     {5, 6, 8, 0, 0, 0, 0, 0}, /* 0xB0 (10110000) */
     {1, 5, 6, 8, 0, 0, 0, 0}, /* 0xB1 (10110001) */
     {2, 5, 6, 8, 0, 0, 0, 0}, /* 0xB2 (10110010) */
     {1, 2, 5, 6, 8, 0, 0, 0}, /* 0xB3 (10110011) */
     {3, 5, 6, 8, 0, 0, 0, 0}, /* 0xB4 (10110100) */
     {1, 3, 5, 6, 8, 0, 0, 0}, /* 0xB5 (10110101) */
     {2, 3, 5, 6, 8, 0, 0, 0}, /* 0xB6 (10110110) */
     {1, 2, 3, 5, 6, 8, 0, 0}, /* 0xB7 (10110111) */
     {4, 5, 6, 8, 0, 0, 0, 0}, /* 0xB8 (10111000) */
     {1, 4, 5, 6, 8, 0, 0, 0}, /* 0xB9 (10111001) */
     {2, 4, 5, 6, 8, 0, 0, 0}, /* 0xBA (10111010) */
     {1, 2, 4, 5, 6, 8, 0, 0}, /* 0xBB (10111011) */
     {3, 4, 5, 6, 8, 0, 0, 0}, /* 0xBC (10111100) */
     {1, 3, 4, 5, 6, 8, 0, 0}, /* 0xBD (10111101) */
     {2, 3, 4, 5, 6, 8, 0, 0}, /* 0xBE (10111110) */
     {1, 2, 3, 4, 5, 6, 8, 0}, /* 0xBF (10111111) */
     {7, 8, 0, 0, 0, 0, 0, 0}, /* 0xC0 (11000000) */
     {1, 7, 8, 0, 0, 0, 0, 0}, /* 0xC1 (11000001) */
     {2, 7, 8, 0, 0, 0, 0, 0}, /* 0xC2 (11000010) */
     {1, 2, 7, 8, 0, 0, 0, 0}, /* 0xC3 (11000011) */
     {3, 7, 8, 0, 0, 0, 0, 0}, /* 0xC4 (11000100) */
     {1, 3, 7, 8, 0, 0, 0, 0}, /* 0xC5 (11000101) */
     {2, 3, 7, 8, 0, 0, 0, 0}, /* 0xC6 (11000110) */
     {1, 2, 3, 7, 8, 0, 0, 0}, /* 0xC7 (11000111) */
     {4, 7, 8, 0, 0, 0, 0, 0}, /* 0xC8 (11001000) */
     {1, 4, 7, 8, 0, 0, 0, 0}, /* 0xC9 (11001001) */
     {2, 4, 7, 8, 0, 0, 0, 0}, /* 0xCA (11001010) */
     {1, 2, 4, 7, 8, 0, 0, 0}, /* 0xCB (11001011) */
     {3, 4, 7, 8, 0, 0, 0, 0}, /* 0xCC (11001100) */
     {1, 3, 4, 7, 8, 0, 0, 0}, /* 0xCD (11001101) */
     {2, 3, 4, 7, 8, 0, 0, 0}, /* 0xCE (11001110) */
     {1, 2, 3, 4, 7, 8, 0, 0}, /* 0xCF (11001111) */
     {5, 7, 8, 0, 0, 0, 0, 0}, /* 0xD0 (11010000) */
     {1, 5, 7, 8, 0, 0, 0, 0}, /* 0xD1 (11010001) */
     {2, 5, 7, 8, 0, 0, 0, 0}, /* 0xD2 (11010010) */
     {1, 2, 5, 7, 8, 0, 0, 0}, /* 0xD3 (11010011) */
     {3, 5, 7, 8, 0, 0, 0, 0}, /* 0xD4 (11010100) */
     {1, 3, 5, 7, 8, 0, 0, 0}, /* 0xD5 (11010101) */
     {2, 3, 5, 7, 8, 0, 0, 0}, /* 0xD6 (11010110) */
     {1, 2, 3, 5, 7, 8, 0, 0}, /* 0xD7 (11010111) */
     {4, 5, 7, 8, 0, 0, 0, 0}, /* 0xD8 (11011000) */
     {1, 4, 5, 7, 8, 0, 0, 0}, /* 0xD9 (11011001) */
     {2, 4, 5, 7, 8, 0, 0, 0}, /* 0xDA (11011010) */
     {1, 2, 4, 5, 7, 8, 0, 0}, /* 0xDB (11011011) */
     {3, 4, 5, 7, 8, 0, 0, 0}, /* 0xDC (11011100) */
     {1, 3, 4, 5, 7, 8, 0, 0}, /* 0xDD (11011101) */
     {2, 3, 4, 5, 7, 8, 0, 0}, /* 0xDE (11011110) */
     {1, 2, 3, 4, 5, 7, 8, 0}, /* 0xDF (11011111) */
     {6, 7, 8, 0, 0, 0, 0, 0}, /* 0xE0 (11100000) */
     {1, 6, 7, 8, 0, 0, 0, 0}, /* 0xE1 (11100001) */
     {2, 6, 7, 8, 0, 0, 0, 0}, /* 0xE2 (11100010) */
     {1, 2, 6, 7, 8, 0, 0, 0}, /* 0xE3 (11100011) */
     {3, 6, 7, 8, 0, 0, 0, 0}, /* 0xE4 (11100100) */
     {1, 3, 6, 7, 8, 0, 0, 0}, /* 0xE5 (11100101) */
     {2, 3, 6, 7, 8, 0, 0, 0}, /* 0xE6 (11100110) */
     {1, 2, 3, 6, 7, 8, 0, 0}, /* 0xE7 (11100111) */
     {4, 6, 7, 8, 0, 0, 0, 0}, /* 0xE8 (11101000) */
     {1, 4, 6, 7, 8, 0, 0, 0}, /* 0xE9 (11101001) */
     {2, 4, 6, 7, 8, 0, 0, 0}, /* 0xEA (11101010) */
     {1, 2, 4, 6, 7, 8, 0, 0}, /* 0xEB (11101011) */
     {3, 4, 6, 7, 8, 0, 0, 0}, /* 0xEC (11101100) */
     {1, 3, 4, 6, 7, 8, 0, 0}, /* 0xED (11101101) */
     {2, 3, 4, 6, 7, 8, 0, 0}, /* 0xEE (11101110) */
     {1, 2, 3, 4, 6, 7, 8, 0}, /* 0xEF (11101111) */
     {5, 6, 7, 8, 0, 0, 0, 0}, /* 0xF0 (11110000) */
     {1, 5, 6, 7, 8, 0, 0, 0}, /* 0xF1 (11110001) */
     {2, 5, 6, 7, 8, 0, 0, 0}, /* 0xF2 (11110010) */
     {1, 2, 5, 6, 7, 8, 0, 0}, /* 0xF3 (11110011) */
     {3, 5, 6, 7, 8, 0, 0, 0}, /* 0xF4 (11110100) */
     {1, 3, 5, 6, 7, 8, 0, 0}, /* 0xF5 (11110101) */
     {2, 3, 5, 6, 7, 8, 0, 0}, /* 0xF6 (11110110) */
     {1, 2, 3, 5, 6, 7, 8, 0}, /* 0xF7 (11110111) */
     {4, 5, 6, 7, 8, 0, 0, 0}, /* 0xF8 (11111000) */
     {1, 4, 5, 6, 7, 8, 0, 0}, /* 0xF9 (11111001) */
     {2, 4, 5, 6, 7, 8, 0, 0}, /* 0xFA (11111010) */
     {1, 2, 4, 5, 6, 7, 8, 0}, /* 0xFB (11111011) */
     {3, 4, 5, 6, 7, 8, 0, 0}, /* 0xFC (11111100) */
     {1, 3, 4, 5, 6, 7, 8, 0}, /* 0xFD (11111101) */
     {2, 3, 4, 5, 6, 7, 8, 0}, /* 0xFE (11111110) */
     {1, 2, 3, 4, 5, 6, 7, 8}  /* 0xFF (11111111) */
 };

 #endif  // #ifdef USEAVX

 #ifdef IS_X64
 // Same contents as vecDecodeTable, but with 16-bit entries: row b lists, in
 // increasing order, the 1-based positions of the bits set in the byte value
 // b (bit 0 -> 1, ..., bit 7 -> 8), padded with zeros up to 8 entries.
 // Each row is 8 x 16 bits, i.e. 16 bytes.
 ALIGNED(32)
 static uint16_t vecDecodeTable_uint16[256][8] = {
     {0, 0, 0, 0, 0, 0, 0, 0}, /* 0x00 (00000000) */
     {1, 0, 0, 0, 0, 0, 0, 0}, /* 0x01 (00000001) */
     {2, 0, 0, 0, 0, 0, 0, 0}, /* 0x02 (00000010) */
     {1, 2, 0, 0, 0, 0, 0, 0}, /* 0x03 (00000011) */
     {3, 0, 0, 0, 0, 0, 0, 0}, /* 0x04 (00000100) */
     {1, 3, 0, 0, 0, 0, 0, 0}, /* 0x05 (00000101) */
     {2, 3, 0, 0, 0, 0, 0, 0}, /* 0x06 (00000110) */
     {1, 2, 3, 0, 0, 0, 0, 0}, /* 0x07 (00000111) */
     {4, 0, 0, 0, 0, 0, 0, 0}, /* 0x08 (00001000) */
     {1, 4, 0, 0, 0, 0, 0, 0}, /* 0x09 (00001001) */
     {2, 4, 0, 0, 0, 0, 0, 0}, /* 0x0A (00001010) */
     {1, 2, 4, 0, 0, 0, 0, 0}, /* 0x0B (00001011) */
     {3, 4, 0, 0, 0, 0, 0, 0}, /* 0x0C (00001100) */
     {1, 3, 4, 0, 0, 0, 0, 0}, /* 0x0D (00001101) */
     {2, 3, 4, 0, 0, 0, 0, 0}, /* 0x0E (00001110) */
     {1, 2, 3, 4, 0, 0, 0, 0}, /* 0x0F (00001111) */
     {5, 0, 0, 0, 0, 0, 0, 0}, /* 0x10 (00010000) */
     {1, 5, 0, 0, 0, 0, 0, 0}, /* 0x11 (00010001) */
     {2, 5, 0, 0, 0, 0, 0, 0}, /* 0x12 (00010010) */
     {1, 2, 5, 0, 0, 0, 0, 0}, /* 0x13 (00010011) */
     {3, 5, 0, 0, 0, 0, 0, 0}, /* 0x14 (00010100) */
     {1, 3, 5, 0, 0, 0, 0, 0}, /* 0x15 (00010101) */
     {2, 3, 5, 0, 0, 0, 0, 0}, /* 0x16 (00010110) */
     {1, 2, 3, 5, 0, 0, 0, 0}, /* 0x17 (00010111) */
     {4, 5, 0, 0, 0, 0, 0, 0}, /* 0x18 (00011000) */
     {1, 4, 5, 0, 0, 0, 0, 0}, /* 0x19 (00011001) */
     {2, 4, 5, 0, 0, 0, 0, 0}, /* 0x1A (00011010) */
     {1, 2, 4, 5, 0, 0, 0, 0}, /* 0x1B (00011011) */
     {3, 4, 5, 0, 0, 0, 0, 0}, /* 0x1C (00011100) */
     {1, 3, 4, 5, 0, 0, 0, 0}, /* 0x1D (00011101) */
     {2, 3, 4, 5, 0, 0, 0, 0}, /* 0x1E (00011110) */
     {1, 2, 3, 4, 5, 0, 0, 0}, /* 0x1F (00011111) */
     {6, 0, 0, 0, 0, 0, 0, 0}, /* 0x20 (00100000) */
     {1, 6, 0, 0, 0, 0, 0, 0}, /* 0x21 (00100001) */
     {2, 6, 0, 0, 0, 0, 0, 0}, /* 0x22 (00100010) */
     {1, 2, 6, 0, 0, 0, 0, 0}, /* 0x23 (00100011) */
     {3, 6, 0, 0, 0, 0, 0, 0}, /* 0x24 (00100100) */
     {1, 3, 6, 0, 0, 0, 0, 0}, /* 0x25 (00100101) */
     {2, 3, 6, 0, 0, 0, 0, 0}, /* 0x26 (00100110) */
     {1, 2, 3, 6, 0, 0, 0, 0}, /* 0x27 (00100111) */
     {4, 6, 0, 0, 0, 0, 0, 0}, /* 0x28 (00101000) */
     {1, 4, 6, 0, 0, 0, 0, 0}, /* 0x29 (00101001) */
     {2, 4, 6, 0, 0, 0, 0, 0}, /* 0x2A (00101010) */
     {1, 2, 4, 6, 0, 0, 0, 0}, /* 0x2B (00101011) */
     {3, 4, 6, 0, 0, 0, 0, 0}, /* 0x2C (00101100) */
     {1, 3, 4, 6, 0, 0, 0, 0}, /* 0x2D (00101101) */
     {2, 3, 4, 6, 0, 0, 0, 0}, /* 0x2E (00101110) */
     {1, 2, 3, 4, 6, 0, 0, 0}, /* 0x2F (00101111) */
     {5, 6, 0, 0, 0, 0, 0, 0}, /* 0x30 (00110000) */
     {1, 5, 6, 0, 0, 0, 0, 0}, /* 0x31 (00110001) */
     {2, 5, 6, 0, 0, 0, 0, 0}, /* 0x32 (00110010) */
     {1, 2, 5, 6, 0, 0, 0, 0}, /* 0x33 (00110011) */
     {3, 5, 6, 0, 0, 0, 0, 0}, /* 0x34 (00110100) */
     {1, 3, 5, 6, 0, 0, 0, 0}, /* 0x35 (00110101) */
     {2, 3, 5, 6, 0, 0, 0, 0}, /* 0x36 (00110110) */
     {1, 2, 3, 5, 6, 0, 0, 0}, /* 0x37 (00110111) */
     {4, 5, 6, 0, 0, 0, 0, 0}, /* 0x38 (00111000) */
     {1, 4, 5, 6, 0, 0, 0, 0}, /* 0x39 (00111001) */
     {2, 4, 5, 6, 0, 0, 0, 0}, /* 0x3A (00111010) */
     {1, 2, 4, 5, 6, 0, 0, 0}, /* 0x3B (00111011) */
     {3, 4, 5, 6, 0, 0, 0, 0}, /* 0x3C (00111100) */
     {1, 3, 4, 5, 6, 0, 0, 0}, /* 0x3D (00111101) */
     {2, 3, 4, 5, 6, 0, 0, 0}, /* 0x3E (00111110) */
     {1, 2, 3, 4, 5, 6, 0, 0}, /* 0x3F (00111111) */
     {7, 0, 0, 0, 0, 0, 0, 0}, /* 0x40 (01000000) */
     {1, 7, 0, 0, 0, 0, 0, 0}, /* 0x41 (01000001) */
     {2, 7, 0, 0, 0, 0, 0, 0}, /* 0x42 (01000010) */
     {1, 2, 7, 0, 0, 0, 0, 0}, /* 0x43 (01000011) */
     {3, 7, 0, 0, 0, 0, 0, 0}, /* 0x44 (01000100) */
     {1, 3, 7, 0, 0, 0, 0, 0}, /* 0x45 (01000101) */
     {2, 3, 7, 0, 0, 0, 0, 0}, /* 0x46 (01000110) */
     {1, 2, 3, 7, 0, 0, 0, 0}, /* 0x47 (01000111) */
     {4, 7, 0, 0, 0, 0, 0, 0}, /* 0x48 (01001000) */
     {1, 4, 7, 0, 0, 0, 0, 0}, /* 0x49 (01001001) */
     {2, 4, 7, 0, 0, 0, 0, 0}, /* 0x4A (01001010) */
     {1, 2, 4, 7, 0, 0, 0, 0}, /* 0x4B (01001011) */
     {3, 4, 7, 0, 0, 0, 0, 0}, /* 0x4C (01001100) */
     {1, 3, 4, 7, 0, 0, 0, 0}, /* 0x4D (01001101) */
     {2, 3, 4, 7, 0, 0, 0, 0}, /* 0x4E (01001110) */
     {1, 2, 3, 4, 7, 0, 0, 0}, /* 0x4F (01001111) */
     {5, 7, 0, 0, 0, 0, 0, 0}, /* 0x50 (01010000) */
     {1, 5, 7, 0, 0, 0, 0, 0}, /* 0x51 (01010001) */
     {2, 5, 7, 0, 0, 0, 0, 0}, /* 0x52 (01010010) */
     {1, 2, 5, 7, 0, 0, 0, 0}, /* 0x53 (01010011) */
     {3, 5, 7, 0, 0, 0, 0, 0}, /* 0x54 (01010100) */
     {1, 3, 5, 7, 0, 0, 0, 0}, /* 0x55 (01010101) */
     {2, 3, 5, 7, 0, 0, 0, 0}, /* 0x56 (01010110) */
     {1, 2, 3, 5, 7, 0, 0, 0}, /* 0x57 (01010111) */
     {4, 5, 7, 0, 0, 0, 0, 0}, /* 0x58 (01011000) */
     {1, 4, 5, 7, 0, 0, 0, 0}, /* 0x59 (01011001) */
     {2, 4, 5, 7, 0, 0, 0, 0}, /* 0x5A (01011010) */
     {1, 2, 4, 5, 7, 0, 0, 0}, /* 0x5B (01011011) */
     {3, 4, 5, 7, 0, 0, 0, 0}, /* 0x5C (01011100) */
     {1, 3, 4, 5, 7, 0, 0, 0}, /* 0x5D (01011101) */
     {2, 3, 4, 5, 7, 0, 0, 0}, /* 0x5E (01011110) */
     {1, 2, 3, 4, 5, 7, 0, 0}, /* 0x5F (01011111) */
     {6, 7, 0, 0, 0, 0, 0, 0}, /* 0x60 (01100000) */
     {1, 6, 7, 0, 0, 0, 0, 0}, /* 0x61 (01100001) */
     {2, 6, 7, 0, 0, 0, 0, 0}, /* 0x62 (01100010) */
     {1, 2, 6, 7, 0, 0, 0, 0}, /* 0x63 (01100011) */
     {3, 6, 7, 0, 0, 0, 0, 0}, /* 0x64 (01100100) */
     {1, 3, 6, 7, 0, 0, 0, 0}, /* 0x65 (01100101) */
     {2, 3, 6, 7, 0, 0, 0, 0}, /* 0x66 (01100110) */
     {1, 2, 3, 6, 7, 0, 0, 0}, /* 0x67 (01100111) */
     {4, 6, 7, 0, 0, 0, 0, 0}, /* 0x68 (01101000) */
     {1, 4, 6, 7, 0, 0, 0, 0}, /* 0x69 (01101001) */
     {2, 4, 6, 7, 0, 0, 0, 0}, /* 0x6A (01101010) */
     {1, 2, 4, 6, 7, 0, 0, 0}, /* 0x6B (01101011) */
     {3, 4, 6, 7, 0, 0, 0, 0}, /* 0x6C (01101100) */
     {1, 3, 4, 6, 7, 0, 0, 0}, /* 0x6D (01101101) */
     {2, 3, 4, 6, 7, 0, 0, 0}, /* 0x6E (01101110) */
     {1, 2, 3, 4, 6, 7, 0, 0}, /* 0x6F (01101111) */
     {5, 6, 7, 0, 0, 0, 0, 0}, /* 0x70 (01110000) */
     {1, 5, 6, 7, 0, 0, 0, 0}, /* 0x71 (01110001) */
     {2, 5, 6, 7, 0, 0, 0, 0}, /* 0x72 (01110010) */
     {1, 2, 5, 6, 7, 0, 0, 0}, /* 0x73 (01110011) */
     {3, 5, 6, 7, 0, 0, 0, 0}, /* 0x74 (01110100) */
     {1, 3, 5, 6, 7, 0, 0, 0}, /* 0x75 (01110101) */
     {2, 3, 5, 6, 7, 0, 0, 0}, /* 0x76 (01110110) */
     {1, 2, 3, 5, 6, 7, 0, 0}, /* 0x77 (01110111) */
     {4, 5, 6, 7, 0, 0, 0, 0}, /* 0x78 (01111000) */
     {1, 4, 5, 6, 7, 0, 0, 0}, /* 0x79 (01111001) */
     {2, 4, 5, 6, 7, 0, 0, 0}, /* 0x7A (01111010) */
     {1, 2, 4, 5, 6, 7, 0, 0}, /* 0x7B (01111011) */
     {3, 4, 5, 6, 7, 0, 0, 0}, /* 0x7C (01111100) */
     {1, 3, 4, 5, 6, 7, 0, 0}, /* 0x7D (01111101) */
     {2, 3, 4, 5, 6, 7, 0, 0}, /* 0x7E (01111110) */
     {1, 2, 3, 4, 5, 6, 7, 0}, /* 0x7F (01111111) */
     {8, 0, 0, 0, 0, 0, 0, 0}, /* 0x80 (10000000) */
     {1, 8, 0, 0, 0, 0, 0, 0}, /* 0x81 (10000001) */
     {2, 8, 0, 0, 0, 0, 0, 0}, /* 0x82 (10000010) */
     {1, 2, 8, 0, 0, 0, 0, 0}, /* 0x83 (10000011) */
     {3, 8, 0, 0, 0, 0, 0, 0}, /* 0x84 (10000100) */
     {1, 3, 8, 0, 0, 0, 0, 0}, /* 0x85 (10000101) */
     {2, 3, 8, 0, 0, 0, 0, 0}, /* 0x86 (10000110) */
     {1, 2, 3, 8, 0, 0, 0, 0}, /* 0x87 (10000111) */
     {4, 8, 0, 0, 0, 0, 0, 0}, /* 0x88 (10001000) */
     {1, 4, 8, 0, 0, 0, 0, 0}, /* 0x89 (10001001) */
     {2, 4, 8, 0, 0, 0, 0, 0}, /* 0x8A (10001010) */
     {1, 2, 4, 8, 0, 0, 0, 0}, /* 0x8B (10001011) */
     {3, 4, 8, 0, 0, 0, 0, 0}, /* 0x8C (10001100) */
     {1, 3, 4, 8, 0, 0, 0, 0}, /* 0x8D (10001101) */
     {2, 3, 4, 8, 0, 0, 0, 0}, /* 0x8E (10001110) */
     {1, 2, 3, 4, 8, 0, 0, 0}, /* 0x8F (10001111) */
     {5, 8, 0, 0, 0, 0, 0, 0}, /* 0x90 (10010000) */
     {1, 5, 8, 0, 0, 0, 0, 0}, /* 0x91 (10010001) */
     {2, 5, 8, 0, 0, 0, 0, 0}, /* 0x92 (10010010) */
     {1, 2, 5, 8, 0, 0, 0, 0}, /* 0x93 (10010011) */
     {3, 5, 8, 0, 0, 0, 0, 0}, /* 0x94 (10010100) */
     {1, 3, 5, 8, 0, 0, 0, 0}, /* 0x95 (10010101) */
     {2, 3, 5, 8, 0, 0, 0, 0}, /* 0x96 (10010110) */
     {1, 2, 3, 5, 8, 0, 0, 0}, /* 0x97 (10010111) */
     {4, 5, 8, 0, 0, 0, 0, 0}, /* 0x98 (10011000) */
     {1, 4, 5, 8, 0, 0, 0, 0}, /* 0x99 (10011001) */
     {2, 4, 5, 8, 0, 0, 0, 0}, /* 0x9A (10011010) */
     {1, 2, 4, 5, 8, 0, 0, 0}, /* 0x9B (10011011) */
     {3, 4, 5, 8, 0, 0, 0, 0}, /* 0x9C (10011100) */
     {1, 3, 4, 5, 8, 0, 0, 0}, /* 0x9D (10011101) */
     {2, 3, 4, 5, 8, 0, 0, 0}, /* 0x9E (10011110) */
     {1, 2, 3, 4, 5, 8, 0, 0}, /* 0x9F (10011111) */
     {6, 8, 0, 0, 0, 0, 0, 0}, /* 0xA0 (10100000) */
     {1, 6, 8, 0, 0, 0, 0, 0}, /* 0xA1 (10100001) */
     {2, 6, 8, 0, 0, 0, 0, 0}, /* 0xA2 (10100010) */
     {1, 2, 6, 8, 0, 0, 0, 0}, /* 0xA3 (10100011) */
     {3, 6, 8, 0, 0, 0, 0, 0}, /* 0xA4 (10100100) */
     {1, 3, 6, 8, 0, 0, 0, 0}, /* 0xA5 (10100101) */
     {2, 3, 6, 8, 0, 0, 0, 0}, /* 0xA6 (10100110) */
     {1, 2, 3, 6, 8, 0, 0, 0}, /* 0xA7 (10100111) */
     {4, 6, 8, 0, 0, 0, 0, 0}, /* 0xA8 (10101000) */
     {1, 4, 6, 8, 0, 0, 0, 0}, /* 0xA9 (10101001) */
     {2, 4, 6, 8, 0, 0, 0, 0}, /* 0xAA (10101010) */
     {1, 2, 4, 6, 8, 0, 0, 0}, /* 0xAB (10101011) */
     {3, 4, 6, 8, 0, 0, 0, 0}, /* 0xAC (10101100) */
     {1, 3, 4, 6, 8, 0, 0, 0}, /* 0xAD (10101101) */
     {2, 3, 4, 6, 8, 0, 0, 0}, /* 0xAE (10101110) */
     {1, 2, 3, 4, 6, 8, 0, 0}, /* 0xAF (10101111) */
     {5, 6, 8, 0, 0, 0, 0, 0}, /* 0xB0 (10110000) */
     {1, 5, 6, 8, 0, 0, 0, 0}, /* 0xB1 (10110001) */
     {2, 5, 6, 8, 0, 0, 0, 0}, /* 0xB2 (10110010) */
     {1, 2, 5, 6, 8, 0, 0, 0}, /* 0xB3 (10110011) */
     {3, 5, 6, 8, 0, 0, 0, 0}, /* 0xB4 (10110100) */
     {1, 3, 5, 6, 8, 0, 0, 0}, /* 0xB5 (10110101) */
     {2, 3, 5, 6, 8, 0, 0, 0}, /* 0xB6 (10110110) */
     {1, 2, 3, 5, 6, 8, 0, 0}, /* 0xB7 (10110111) */
     {4, 5, 6, 8, 0, 0, 0, 0}, /* 0xB8 (10111000) */
     {1, 4, 5, 6, 8, 0, 0, 0}, /* 0xB9 (10111001) */
     {2, 4, 5, 6, 8, 0, 0, 0}, /* 0xBA (10111010) */
     {1, 2, 4, 5, 6, 8, 0, 0}, /* 0xBB (10111011) */
     {3, 4, 5, 6, 8, 0, 0, 0}, /* 0xBC (10111100) */
     {1, 3, 4, 5, 6, 8, 0, 0}, /* 0xBD (10111101) */
     {2, 3, 4, 5, 6, 8, 0, 0}, /* 0xBE (10111110) */
     {1, 2, 3, 4, 5, 6, 8, 0}, /* 0xBF (10111111) */
     {7, 8, 0, 0, 0, 0, 0, 0}, /* 0xC0 (11000000) */
     {1, 7, 8, 0, 0, 0, 0, 0}, /* 0xC1 (11000001) */
     {2, 7, 8, 0, 0, 0, 0, 0}, /* 0xC2 (11000010) */
     {1, 2, 7, 8, 0, 0, 0, 0}, /* 0xC3 (11000011) */
     {3, 7, 8, 0, 0, 0, 0, 0}, /* 0xC4 (11000100) */
     {1, 3, 7, 8, 0, 0, 0, 0}, /* 0xC5 (11000101) */
     {2, 3, 7, 8, 0, 0, 0, 0}, /* 0xC6 (11000110) */
     {1, 2, 3, 7, 8, 0, 0, 0}, /* 0xC7 (11000111) */
     {4, 7, 8, 0, 0, 0, 0, 0}, /* 0xC8 (11001000) */
     {1, 4, 7, 8, 0, 0, 0, 0}, /* 0xC9 (11001001) */
     {2, 4, 7, 8, 0, 0, 0, 0}, /* 0xCA (11001010) */
     {1, 2, 4, 7, 8, 0, 0, 0}, /* 0xCB (11001011) */
     {3, 4, 7, 8, 0, 0, 0, 0}, /* 0xCC (11001100) */
     {1, 3, 4, 7, 8, 0, 0, 0}, /* 0xCD (11001101) */
     {2, 3, 4, 7, 8, 0, 0, 0}, /* 0xCE (11001110) */
     {1, 2, 3, 4, 7, 8, 0, 0}, /* 0xCF (11001111) */
     {5, 7, 8, 0, 0, 0, 0, 0}, /* 0xD0 (11010000) */
     {1, 5, 7, 8, 0, 0, 0, 0}, /* 0xD1 (11010001) */
     {2, 5, 7, 8, 0, 0, 0, 0}, /* 0xD2 (11010010) */
     {1, 2, 5, 7, 8, 0, 0, 0}, /* 0xD3 (11010011) */
     {3, 5, 7, 8, 0, 0, 0, 0}, /* 0xD4 (11010100) */
     {1, 3, 5, 7, 8, 0, 0, 0}, /* 0xD5 (11010101) */
     {2, 3, 5, 7, 8, 0, 0, 0}, /* 0xD6 (11010110) */
     {1, 2, 3, 5, 7, 8, 0, 0}, /* 0xD7 (11010111) */
     {4, 5, 7, 8, 0, 0, 0, 0}, /* 0xD8 (11011000) */
     {1, 4, 5, 7, 8, 0, 0, 0}, /* 0xD9 (11011001) */
     {2, 4, 5, 7, 8, 0, 0, 0}, /* 0xDA (11011010) */
     {1, 2, 4, 5, 7, 8, 0, 0}, /* 0xDB (11011011) */
     {3, 4, 5, 7, 8, 0, 0, 0}, /* 0xDC (11011100) */
     {1, 3, 4, 5, 7, 8, 0, 0}, /* 0xDD (11011101) */
     {2, 3, 4, 5, 7, 8, 0, 0}, /* 0xDE (11011110) */
     {1, 2, 3, 4, 5, 7, 8, 0}, /* 0xDF (11011111) */
     {6, 7, 8, 0, 0, 0, 0, 0}, /* 0xE0 (11100000) */
     {1, 6, 7, 8, 0, 0, 0, 0}, /* 0xE1 (11100001) */
     {2, 6, 7, 8, 0, 0, 0, 0}, /* 0xE2 (11100010) */
     {1, 2, 6, 7, 8, 0, 0, 0}, /* 0xE3 (11100011) */
     {3, 6, 7, 8, 0, 0, 0, 0}, /* 0xE4 (11100100) */
     {1, 3, 6, 7, 8, 0, 0, 0}, /* 0xE5 (11100101) */
     {2, 3, 6, 7, 8, 0, 0, 0}, /* 0xE6 (11100110) */
     {1, 2, 3, 6, 7, 8, 0, 0}, /* 0xE7 (11100111) */
     {4, 6, 7, 8, 0, 0, 0, 0}, /* 0xE8 (11101000) */
     {1, 4, 6, 7, 8, 0, 0, 0}, /* 0xE9 (11101001) */
     {2, 4, 6, 7, 8, 0, 0, 0}, /* 0xEA (11101010) */
     {1, 2, 4, 6, 7, 8, 0, 0}, /* 0xEB (11101011) */
     {3, 4, 6, 7, 8, 0, 0, 0}, /* 0xEC (11101100) */
     {1, 3, 4, 6, 7, 8, 0, 0}, /* 0xED (11101101) */
     {2, 3, 4, 6, 7, 8, 0, 0}, /* 0xEE (11101110) */
     {1, 2, 3, 4, 6, 7, 8, 0}, /* 0xEF (11101111) */
     {5, 6, 7, 8, 0, 0, 0, 0}, /* 0xF0 (11110000) */
     {1, 5, 6, 7, 8, 0, 0, 0}, /* 0xF1 (11110001) */
     {2, 5, 6, 7, 8, 0, 0, 0}, /* 0xF2 (11110010) */
     {1, 2, 5, 6, 7, 8, 0, 0}, /* 0xF3 (11110011) */
     {3, 5, 6, 7, 8, 0, 0, 0}, /* 0xF4 (11110100) */
     {1, 3, 5, 6, 7, 8, 0, 0}, /* 0xF5 (11110101) */
     {2, 3, 5, 6, 7, 8, 0, 0}, /* 0xF6 (11110110) */
     {1, 2, 3, 5, 6, 7, 8, 0}, /* 0xF7 (11110111) */
     {4, 5, 6, 7, 8, 0, 0, 0}, /* 0xF8 (11111000) */
     {1, 4, 5, 6, 7, 8, 0, 0}, /* 0xF9 (11111001) */
     {2, 4, 5, 6, 7, 8, 0, 0}, /* 0xFA (11111010) */
     {1, 2, 4, 5, 6, 7, 8, 0}, /* 0xFB (11111011) */
     {3, 4, 5, 6, 7, 8, 0, 0}, /* 0xFC (11111100) */
     {1, 3, 4, 5, 6, 7, 8, 0}, /* 0xFD (11111101) */
     {2, 3, 4, 5, 6, 7, 8, 0}, /* 0xFE (11111110) */
     {1, 2, 3, 4, 5, 6, 7, 8}  /* 0xFF (11111111) */
 };

 #endif

 #ifdef USEAVX

 /*
  * Writes the positions of the bits set in array[0..length) to vout as
  * 32-bit integers, each offset by "base", and returns how many values were
  * written. "outcapacity" is the number of 32-bit slots available at vout.
  *
  * Words are decoded with AVX2 for as long as at least 64 output slots
  * remain; any remaining words are decoded one bit at a time, and extraction
  * stops once the end of the output area is reached.
  */
 size_t bitset_extract_setbits_avx2(uint64_t *array, size_t length, void *vout,
                                    size_t outcapacity, uint32_t base) {
     uint32_t *out = (uint32_t *)vout;
     uint32_t *initout = out;
     // vecDecodeTable stores 1-based bit positions, hence the "- 1" here
     __m256i baseVec = _mm256_set1_epi32(base - 1);
     __m256i incVec = _mm256_set1_epi32(64);
     __m256i add8 = _mm256_set1_epi32(8);
     // one past the last usable output slot
     uint32_t *safeout = out + outcapacity;
     size_t i = 0;
     // Vector path: a 64-bit word yields at most 64 values, so only run it
     // while 64 slots are still free.
     for (; (i < length) && (out + 64 <= safeout); ++i) {
         uint64_t w = array[i];
         if (w == 0) {
             // empty word: just skip its 64 positions
             baseVec = _mm256_add_epi32(baseVec, incVec);
         } else {
             // 4 iterations x 2 bytes = the 8 bytes of the word
             for (int k = 0; k < 4; ++k) {
                 uint8_t byteA = (uint8_t)w;
                 uint8_t byteB = (uint8_t)(w >> 8);
                 w >>= 16;
                 __m256i vecA =
                     _mm256_load_si256((const __m256i *)vecDecodeTable[byteA]);
                 __m256i vecB =
                     _mm256_load_si256((const __m256i *)vecDecodeTable[byteB]);
                 // number of output slots each byte actually contributes
                 uint8_t advanceA = lengthTable[byteA];
                 uint8_t advanceB = lengthTable[byteB];
                 vecA = _mm256_add_epi32(baseVec, vecA);
                 baseVec = _mm256_add_epi32(baseVec, add8);
                 vecB = _mm256_add_epi32(baseVec, vecB);
                 baseVec = _mm256_add_epi32(baseVec, add8);
                 // Each store writes all 8 lanes, but "out" only moves by the
                 // advance value; the surplus lanes are overwritten by the
                 // next store.
                 _mm256_storeu_si256((__m256i *)out, vecA);
                 out += advanceA;
                 _mm256_storeu_si256((__m256i *)out, vecB);
                 out += advanceB;
             }
         }
     }
     // account for the words consumed by the vector path
     base += i * 64;
     // Scalar path for the remaining words, bounded by the output capacity.
     for (; (i < length) && (out < safeout); ++i) {
         uint64_t w = array[i];
         while ((w != 0) && (out < safeout)) {
             uint64_t t = w & (~w + 1); // on x64, should compile to BLSI (careful: the Intel compiler seems to fail)
             int r = __builtin_ctzll(w); // on x64, should compile to TZCNT
             uint32_t val = r + base;
             memcpy(out, &val,
                    sizeof(uint32_t));  // should be compiled as a MOV on x64
             out++;
             w ^= t;
         }
         base += 64;
     }
     return out - initout;
 }
 #endif  // USEAVX

 /*
  * Writes the position of every set bit found in the first "length" words of
  * "bitset" to "vout" as 32-bit integers, each offset by "base".
  *
  * Values are stored with memcpy, so "vout" does not need to be aligned, but
  * it must have room for one value per set bit.
  *
  * Returns the number of values written.
  */
 size_t bitset_extract_setbits(uint64_t *bitset, size_t length, void *vout,
                               uint32_t base) {
     uint32_t *dst = (uint32_t *)vout;
     int count = 0;
     for (size_t word = 0; word < length; ++word, base += 64) {
         uint64_t bits = bitset[word];
         while (bits != 0) {
             // index of the lowest set bit, shifted into the caller's range
             const uint32_t position = __builtin_ctzll(bits) + base;
             memcpy(dst + count, &position, sizeof(uint32_t));
             ++count;
             bits &= bits - 1;  // drop the lowest set bit
         }
     }
     return count;
 }

 /*
  * Writes the positions of the bits that are set in both "bitset1" and
  * "bitset2" (each "length" words long) to "out" as 16-bit integers offset by
  * "base". "out" must be large enough for every common bit.
  *
  * Returns the number of values written.
  */
 size_t bitset_extract_intersection_setbits_uint16(const uint64_t * __restrict__ bitset1,
                                                   const uint64_t * __restrict__ bitset2,
                                                   size_t length, uint16_t *out,
                                                   uint16_t base) {
     int count = 0;
     for (size_t word = 0; word < length; ++word, base += 64) {
         // each pass of the inner loop consumes the lowest common bit
         for (uint64_t common = bitset1[word] & bitset2[word]; common != 0;
              common &= common - 1) {
             out[count++] = __builtin_ctzll(common) + base;
         }
     }
     return count;
 }

 #ifdef IS_X64
 /*
  * Given a bitset containing "length" 64-bit words, write out the position
  * of all the set bits to "out" as 16-bit integers, values start at "base" (can
  * be set to zero).
  *
  * "outcapacity" is the number of 16-bit slots available in "out". A vectorized
  * loop handles whole words while at least 16 slots remain; a scalar loop then
  * finishes the remaining words and stops as soon as "out" reaches
  * out + outcapacity.
  *
  * NOTE(review): the vectorized loop checks the remaining room once per word
  * (16 slots), yet a single word can emit up to 64 values and every store
  * writes a full 8-value vector. Confirm that callers size "out" accordingly.
  *
  * Returns how many values were actually decoded.
  *
  * This function uses SSE decoding.
  */
 size_t bitset_extract_setbits_sse_uint16(const uint64_t *bitset, size_t length,
                                          uint16_t *out, size_t outcapacity,
                                          uint16_t base) {
     uint16_t *initout = out;
     // presumably the decode table entries are 1-based, hence "base - 1"
     __m128i baseVec = _mm_set1_epi16(base - 1);
     __m128i incVec = _mm_set1_epi16(64);
     __m128i add8 = _mm_set1_epi16(8);
     uint16_t *safeout = out + outcapacity;
     const int numberofbytes = 2;  // process two bytes at a time
     size_t i = 0;
     // Vectorized phase: one 64-bit word per iteration.
     for (; (i < length) && (out + numberofbytes * 8 <= safeout); ++i) {
         uint64_t w = bitset[i];
         if (w == 0) {
             // empty word: just advance the running base by 64 positions
             baseVec = _mm_add_epi16(baseVec, incVec);
         } else {
             // 4 rounds x 2 bytes = the 8 bytes of the word
             for (int k = 0; k < 4; ++k) {
                 uint8_t byteA = (uint8_t)w;
                 uint8_t byteB = (uint8_t)(w >> 8);
                 w >>= 16;
                 // per-byte lookup of the vector to add to the running base
                 __m128i vecA = _mm_load_si128(
                     (const __m128i *)vecDecodeTable_uint16[byteA]);
                 __m128i vecB = _mm_load_si128(
                     (const __m128i *)vecDecodeTable_uint16[byteB]);
                 // how far "out" moves after each store
                 uint8_t advanceA = lengthTable[byteA];
                 uint8_t advanceB = lengthTable[byteB];
                 vecA = _mm_add_epi16(baseVec, vecA);
                 baseVec = _mm_add_epi16(baseVec, add8);
                 vecB = _mm_add_epi16(baseVec, vecB);
                 baseVec = _mm_add_epi16(baseVec, add8);
                 // A full vector is stored each time; only the first
                 // "advance" lanes are kept, the rest is overwritten by the
                 // next store.
                 _mm_storeu_si128((__m128i *)out, vecA);
                 out += advanceA;
                 _mm_storeu_si128((__m128i *)out, vecB);
                 out += advanceB;
             }
         }
     }
     // Scalar phase: resume at word i with the matching scalar base.
     base += (uint16_t)(i * 64);
     for (; (i < length) && (out < safeout); ++i) {
         uint64_t w = bitset[i];
         while ((w != 0) && (out < safeout)) {
             uint64_t t = w & (~w + 1);  // isolate the lowest set bit
             int r = __builtin_ctzll(w);
             *out = r + base;
             out++;
             w ^= t;  // clear that bit
         }
         base += 64;
     }
     return out - initout;
 }
 #endif

 /*
  * Scans the first "length" 64-bit words of "bitset" and stores the index of
  * each set bit, plus "base", into "out" as a 16-bit integer.
  *
  * "out" must be able to hold one value per set bit.
  *
  * Returns the number of values stored.
  */
 size_t bitset_extract_setbits_uint16(const uint64_t *bitset, size_t length,
                                      uint16_t *out, uint16_t base) {
     int count = 0;
     for (size_t word = 0; word < length; ++word, base += 64) {
         // each pass of the inner loop consumes the lowest remaining bit
         for (uint64_t bits = bitset[word]; bits != 0; bits &= bits - 1) {
             out[count++] = __builtin_ctzll(bits) + base;
         }
     }
     return count;
 }

 #if defined(ASMBITMANIPOPTIMIZATION)

 /*
  * Sets, in "bitset", every bit whose index appears in "list" ("length"
  * entries) and returns "card" increased by the number of bits that were not
  * already set.
  *
  * The asm statement reads and writes the bitset through a pointer the
  * compiler cannot see, so it declares a "memory" clobber (as
  * bitset_clear_list does); it also modifies the flags, hence "cc".
  */
 uint64_t bitset_set_list_withcard(void *bitset, uint64_t card,
                                   const uint16_t *list, uint64_t length) {
     uint64_t offset, load, pos;
     uint64_t shift = 6;
     const uint16_t *end = list + length;
     if (!length) return card;  // the asm loop below always runs at least once
     // TODO: could unroll for performance, see bitset_set_list
     // bts is not available as an intrinsic in GCC
     __asm volatile(
         "1:\n"
         "movzwq (%[list]), %[pos]\n"              // pos = *list
         "shrx %[shift], %[pos], %[offset]\n"      // offset = pos >> 6
         "mov (%[bitset],%[offset],8), %[load]\n"  // load the target word
         "bts %[pos], %[load]\n"                   // CF = old bit, then set it
         "mov %[load], (%[bitset],%[offset],8)\n"  // store the word back
         "sbb $-1, %[card]\n"                      // card += 1 - CF
         "add $2, %[list]\n"                       // next 16-bit entry
         "cmp %[list], %[end]\n"
         "jnz 1b"
         : [card] "+&r"(card), [list] "+&r"(list), [load] "=&r"(load),
           [pos] "=&r"(pos), [offset] "=&r"(offset)
         : [end] "r"(end), [bitset] "r"(bitset), [shift] "r"(shift)
         :
         /* clobbers */ "cc", "memory");
     return card;
 }

 /*
  * Sets, in "bitset", every bit whose index appears in "list" ("length"
  * entries). No cardinality is tracked.
  *
  * Entries are handled four at a time, then one at a time for the remainder.
  * Each asm statement modifies the bitset through a pointer, so each one
  * declares a "memory" clobber (and "cc", since bts changes the flags).
  */
 void bitset_set_list(void *bitset, const uint16_t *list, uint64_t length) {
     uint64_t pos;
     const uint16_t *end = list + length;

     uint64_t shift = 6;
     uint64_t offset;
     uint64_t load;
     // Compare the remaining distance rather than forming "list + 3", which
     // would point past the end of the array for short lists.
     for (; (end - list) >= 4; list += 4) {
         pos = list[0];
         __asm volatile(
             "shrx %[shift], %[pos], %[offset]\n"
             "mov (%[bitset],%[offset],8), %[load]\n"
             "bts %[pos], %[load]\n"
             "mov %[load], (%[bitset],%[offset],8)"
             : [load] "=&r"(load), [offset] "=&r"(offset)
             : [bitset] "r"(bitset), [shift] "r"(shift), [pos] "r"(pos)
             : "cc", "memory");
         pos = list[1];
         __asm volatile(
             "shrx %[shift], %[pos], %[offset]\n"
             "mov (%[bitset],%[offset],8), %[load]\n"
             "bts %[pos], %[load]\n"
             "mov %[load], (%[bitset],%[offset],8)"
             : [load] "=&r"(load), [offset] "=&r"(offset)
             : [bitset] "r"(bitset), [shift] "r"(shift), [pos] "r"(pos)
             : "cc", "memory");
         pos = list[2];
         __asm volatile(
             "shrx %[shift], %[pos], %[offset]\n"
             "mov (%[bitset],%[offset],8), %[load]\n"
             "bts %[pos], %[load]\n"
             "mov %[load], (%[bitset],%[offset],8)"
             : [load] "=&r"(load), [offset] "=&r"(offset)
             : [bitset] "r"(bitset), [shift] "r"(shift), [pos] "r"(pos)
             : "cc", "memory");
         pos = list[3];
         __asm volatile(
             "shrx %[shift], %[pos], %[offset]\n"
             "mov (%[bitset],%[offset],8), %[load]\n"
             "bts %[pos], %[load]\n"
             "mov %[load], (%[bitset],%[offset],8)"
             : [load] "=&r"(load), [offset] "=&r"(offset)
             : [bitset] "r"(bitset), [shift] "r"(shift), [pos] "r"(pos)
             : "cc", "memory");
     }

     // leftover entries (fewer than four)
     while (list != end) {
         pos = list[0];
         __asm volatile(
             "shrx %[shift], %[pos], %[offset]\n"
             "mov (%[bitset],%[offset],8), %[load]\n"
             "bts %[pos], %[load]\n"
             "mov %[load], (%[bitset],%[offset],8)"
             : [load] "=&r"(load), [offset] "=&r"(offset)
             : [bitset] "r"(bitset), [shift] "r"(shift), [pos] "r"(pos)
             : "cc", "memory");
         list++;
     }
 }

 /*
  * Clears, in "bitset", every bit whose index appears in "list" ("length"
  * entries) and returns "card" decreased by the number of bits that were
  * actually set beforehand.
  */
 uint64_t bitset_clear_list(void *bitset, uint64_t card, const uint16_t *list,
                            uint64_t length) {
     uint64_t offset, load, pos;
     uint64_t shift = 6;
     const uint16_t *end = list + length;
     // the asm loop tests its exit condition at the bottom, so an empty list
     // has to be filtered out here
     if (!length) return card;
     // btr is not available as an intrinsic in GCC
     __asm volatile(
         "1:\n"
         "movzwq (%[list]), %[pos]\n"              // pos = *list
         "shrx %[shift], %[pos], %[offset]\n"      // offset = pos >> 6
         "mov (%[bitset],%[offset],8), %[load]\n"  // load the target word
         "btr %[pos], %[load]\n"                   // CF = old bit, then clear it
         "mov %[load], (%[bitset],%[offset],8)\n"  // store the word back
         "sbb $0, %[card]\n"                       // card -= CF
         "add $2, %[list]\n"                       // next 16-bit entry
         "cmp %[list], %[end]\n"
         "jnz 1b"
         : [card] "+&r"(card), [list] "+&r"(list), [load] "=&r"(load),
           [pos] "=&r"(pos), [offset] "=&r"(offset)
         : [end] "r"(end), [bitset] "r"(bitset), [shift] "r"(shift)
         :
         /* clobbers */ "memory");
     return card;
 }

 #else
 /*
  * Portable version: clears every bit named in "list" ("length" entries) and
  * returns "card" lowered by the number of bits that were set beforehand.
  */
 uint64_t bitset_clear_list(void *bitset, uint64_t card, const uint16_t *list,
                            uint64_t length) {
     uint64_t *words = (uint64_t *)bitset;
     for (const uint16_t *stop = list + length; list != stop; ++list) {
         const uint64_t value = *list;
         const uint64_t bit = value % 64;
         uint64_t *slot = &words[value >> 6];
         const uint64_t before = *slot;
         const uint64_t after = before & ~(UINT64_C(1) << bit);
         // (before ^ after) is non-zero only when the bit was really set
         card -= (before ^ after) >> bit;
         *slot = after;
     }
     return card;
 }

 /*
  * Portable version: sets every bit named in "list" ("length" entries) and
  * returns "card" raised by the number of bits that were not set beforehand.
  */
 uint64_t bitset_set_list_withcard(void *bitset, uint64_t card,
                                   const uint16_t *list, uint64_t length) {
     uint64_t *words = (uint64_t *)bitset;
     for (const uint16_t *stop = list + length; list != stop; ++list) {
         const uint64_t value = *list;
         const uint64_t bit = value % 64;
         uint64_t *slot = &words[value >> 6];
         const uint64_t before = *slot;
         const uint64_t after = before | (UINT64_C(1) << bit);
         // (before ^ after) is non-zero only when the bit was newly set
         card += (before ^ after) >> bit;
         *slot = after;
     }
     return card;
 }

 /*
  * Portable version: sets every bit named in "list" ("length" entries)
  * without tracking the cardinality.
  */
 void bitset_set_list(void *bitset, const uint16_t *list, uint64_t length) {
     uint64_t *words = (uint64_t *)bitset;
     for (const uint16_t *stop = list + length; list != stop; ++list) {
         const uint64_t value = *list;
         words[value >> 6] |= UINT64_C(1) << (value % 64);
     }
 }

 #endif

 /* flip specified bits */
 /* TODO: consider whether worthwhile to make an asm version */

 /*
  * Toggles every bit named in "list" ("length" entries) and returns "card"
  * adjusted by +1 for each bit turned on and -1 for each bit turned off.
  */
 uint64_t bitset_flip_list_withcard(void *bitset, uint64_t card,
                                    const uint16_t *list, uint64_t length) {
     uint64_t *words = (uint64_t *)bitset;
     for (const uint16_t *stop = list + length; list != stop; ++list) {
         const uint64_t value = *list;
         const uint64_t bit = value % 64;
         uint64_t *slot = &words[value >> 6];
         const uint64_t before = *slot;
         const uint64_t was_set = (before >> bit) & UINT64_C(1);
         // todo: is a branch here all that bad?
         card += (1 - 2 * was_set);  // +1 or -1 (unsigned wrap-around)
         *slot = before ^ (UINT64_C(1) << bit);
     }
     return card;
 }

 /*
  * Toggles every bit named in "list" ("length" entries) without tracking the
  * cardinality.
  */
 void bitset_flip_list(void *bitset, const uint16_t *list, uint64_t length) {
     uint64_t *words = (uint64_t *)bitset;
     for (const uint16_t *stop = list + length; list != stop; ++list) {
         const uint64_t value = *list;
         words[value >> 6] ^= UINT64_C(1) << (value % 64);
     }
 }
 /* end file src/bitset_util.c */
 /* begin file src/containers/array.c */
 /*
  * array.c
  *
  */

 #include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>

 /*
  * Create a new, empty array container with room for `size` values.
  *
  * A `size` of zero or less yields a container whose `array` is NULL, so that
  * malloc(0) is never relied upon.
  *
  * Allocation failures trip an assertion in debug builds. When assertions are
  * compiled out (NDEBUG) the function returns NULL instead of dereferencing a
  * null pointer, and releases whatever had already been allocated.
  */
 array_container_t *array_container_create_given_capacity(int32_t size) {
     array_container_t *container;

     container = (array_container_t *)malloc(sizeof(array_container_t));
     assert (container);
     if (container == NULL) {
         return NULL;
     }

     if( size <= 0 ) { // we don't want to rely on malloc(0)
         container->array = NULL;
     } else {
         container->array = (uint16_t *)malloc(sizeof(uint16_t) * size);
         assert (container->array);
         if (container->array == NULL) {
             free(container);  // do not leak the header
             return NULL;
         }
     }

     container->capacity = size;
     container->cardinality = 0;

     return container;
 }

 /*
  * Create a new array container with the default initial capacity
  * (ARRAY_DEFAULT_INIT_SIZE). The result is whatever
  * array_container_create_given_capacity() returns for that capacity.
  */
 array_container_t *array_container_create(void) {
     const int32_t initial_capacity = ARRAY_DEFAULT_INIT_SIZE;
     return array_container_create_given_capacity(initial_capacity);
 }

 /*
  * Create a new array container holding every value in [min,max).
  * Returns NULL if the container could not be created.
  */
 array_container_t * array_container_create_range(uint32_t min, uint32_t max) {
     array_container_t *range =
         array_container_create_given_capacity(max - min + 1);
     if (range == NULL) {
         return NULL;
     }
     range->cardinality = 0;
     uint32_t value = min;
     while (value < max) {
         range->array[range->cardinality] = value;
         range->cardinality++;
         value++;
     }
     return range;
 }

 /*
  * Duplicate a container: the copy has the same capacity and the same values
  * as `src`. Returns NULL if the copy could not be created.
  */
 array_container_t *array_container_clone(const array_container_t *src) {
     array_container_t *newcontainer =
         array_container_create_given_capacity(src->capacity);
     if (newcontainer == NULL) return NULL;

     newcontainer->cardinality = src->cardinality;

     // A container with zero capacity has a NULL array, and memcpy must not
     // be given a null pointer even when the byte count is zero.
     if (src->cardinality > 0) {
         memcpy(newcontainer->array, src->array,
                src->cardinality * sizeof(uint16_t));
     }

     return newcontainer;
 }

 /*
  * Release the unused tail of the value array so that capacity equals
  * cardinality.
  *
  * Returns the number of array slots given back. If the reallocation fails,
  * the container is left exactly as it was (the old buffer is still valid) and
  * 0 is returned.
  */
 int array_container_shrink_to_fit(array_container_t *src) {
     if (src->cardinality == src->capacity) return 0;  // nothing to do

     if (src->cardinality == 0) {
         // we do not want to rely on realloc for zero allocs
         int savings = src->capacity;
         free(src->array);
         src->array = NULL;
         src->capacity = 0;
         return savings;
     }

     uint16_t *shrunk =
         (uint16_t *)realloc(src->array, src->cardinality * sizeof(uint16_t));
     if (shrunk == NULL) {
         // realloc leaves the original block untouched on failure: keep it
         // rather than throwing the container's contents away.
         return 0;
     }

     int savings = src->capacity - src->cardinality;
     src->array = shrunk;
     src->capacity = src->cardinality;
     return savings;
 }

 /* Release the value array (if any) and then the container itself. */
 void array_container_free(array_container_t *arr) {
     uint16_t *values = arr->array;
     // Jon Strabala reports that some tools complain without the NULL test
     if (values != NULL) {
         free(values);
         arr->array = NULL;  // pedantic
     }
     free(arr);
 }

 /*
  * Suggest the next capacity after `capacity`: the default initial size for
  * non-positive inputs, then x2 below 64, x1.5 below 1024 and x1.25 beyond.
  */
 static inline int32_t grow_capacity(int32_t capacity) {
     if (capacity <= 0) {
         return ARRAY_DEFAULT_INIT_SIZE;
     }
     if (capacity < 64) {
         return capacity * 2;
     }
     if (capacity < 1024) {
         return capacity * 3 / 2;
     }
     return capacity * 5 / 4;
 }

 /*
  * Restrict `val` to [min, max]. The lower bound is tested first, so it wins
  * if the bounds are inverted.
  */
 static inline int32_t clamp(int32_t val, int32_t min, int32_t max) {
     if (val < min) {
         return min;
     }
     if (val > max) {
         return max;
     }
     return val;
 }

 /*
  * Enlarge the value array of `container`.
  *
  * The new capacity is the one suggested by grow_capacity(), clamped to lie
  * between `min` and an upper bound (DEFAULT_MAX_SIZE, or 65536 when `min`
  * itself exceeds DEFAULT_MAX_SIZE). When `preserve` is true the existing
  * values are kept (realloc); otherwise the old buffer is discarded and a
  * fresh one is allocated.
  *
  * An allocation failure is reported on stderr and then trips an assertion.
  */
 void array_container_grow(array_container_t *container, int32_t min,
                           bool preserve) {
     const int32_t upper = (min > DEFAULT_MAX_SIZE ? 65536 : DEFAULT_MAX_SIZE);
     const int32_t wanted = clamp(grow_capacity(container->capacity), min, upper);
     uint16_t *previous = container->array;

     container->capacity = wanted;

     if (!preserve) {
         // Jon Strabala reports that some tools complain otherwise
         if (previous != NULL) {
             free(previous);
         }
         container->array = (uint16_t *)malloc(wanted * sizeof(uint16_t));
     } else {
         container->array =
             (uint16_t *)realloc(previous, wanted * sizeof(uint16_t));
         if (container->array == NULL) {
             free(previous);
         }
     }

     //  handle the case where the allocation fails
     if (container->array == NULL) {
         fprintf(stderr, "could not allocate memory\n");
     }
     assert(container->array != NULL);
 }

 /*
  * Copy the values of `src` into `dst`, growing `dst` first when it is too
  * small. We assume that the two containers are distinct.
  */
 void array_container_copy(const array_container_t *src,
                           array_container_t *dst) {
     const int32_t cardinality = src->cardinality;
     if (cardinality > dst->capacity) {
         array_container_grow(dst, cardinality, false);
     }

     dst->cardinality = cardinality;
     // Empty containers may have a NULL array, and memcpy must not be given a
     // null pointer even when the byte count is zero.
     if (cardinality > 0) {
         memcpy(dst->array, src->array, cardinality * sizeof(uint16_t));
     }
 }

 /*
  * Append min, min + step, min + 2 * step, ... to `arr` for as long as the
  * value stays below `max`.
  *
  * A `step` of zero would never reach `max`; as in
  * bitset_container_add_from_range, it is rejected up front.
  */
 void array_container_add_from_range(array_container_t *arr, uint32_t min,
                                     uint32_t max, uint16_t step) {
     if (step == 0) return;  // refuse to loop forever
     for (uint32_t value = min; value < max; value += step) {
         array_container_append(arr, value);
     }
 }

 /*
  * Store the union of array_1 and array_2 in out. The output container must
  * be distinct from both inputs; it is grown (discarding its content) when it
  * cannot hold card_1 + card_2 values.
  */
 void array_container_union(const array_container_t *array_1,
                            const array_container_t *array_2,
                            array_container_t *out) {
     const int32_t len_1 = array_1->cardinality;
     const int32_t len_2 = array_2->cardinality;
     const int32_t worst_case = len_1 + len_2;

     if (worst_case > out->capacity) {
         array_container_grow(out, worst_case, false);
     }

     out->cardinality = (int32_t)fast_union_uint16(
         array_1->array, len_1, array_2->array, len_2, out->array);
 }

 /*
  * Store in out the values of array_1 that are absent from array_2.
  * out does not need to be distinct from array_1.
  */
 void array_container_andnot(const array_container_t *array_1,
                             const array_container_t *array_2,
                             array_container_t *out) {
     const int32_t needed = array_1->cardinality;
     if (needed > out->capacity) {
         array_container_grow(out, needed, false);
     }
 #ifdef ROARING_VECTOR_OPERATIONS_ENABLED
     // the vectorized kernel is only used when out aliases neither input
     if ((out != array_1) && (out != array_2)) {
         out->cardinality =
             difference_vector16(array_1->array, array_1->cardinality,
                                 array_2->array, array_2->cardinality,
                                 out->array);
         return;
     }
 #endif
     out->cardinality =
         difference_uint16(array_1->array, array_1->cardinality,
                           array_2->array, array_2->cardinality, out->array);
 }

 /*
  * Store the symmetric difference of array_1 and array_2 in out. The output
  * container must be distinct from both inputs; it is grown (discarding its
  * content) when it cannot hold card_1 + card_2 values.
  */
 void array_container_xor(const array_container_t *array_1,
                          const array_container_t *array_2,
                          array_container_t *out) {
     const int32_t len_1 = array_1->cardinality;
     const int32_t len_2 = array_2->cardinality;
     const int32_t worst_case = len_1 + len_2;

     if (worst_case > out->capacity) {
         array_container_grow(out, worst_case, false);
     }

 #ifdef ROARING_VECTOR_OPERATIONS_ENABLED
     out->cardinality = xor_vector16(array_1->array, len_1, array_2->array,
                                     len_2, out->array);
 #else
     out->cardinality = xor_uint16(array_1->array, len_1, array_2->array,
                                   len_2, out->array);
 #endif
 }

 /* Return the smaller of the two arguments. */
 static inline int32_t minimum_int32(int32_t a, int32_t b) {
     if (a < b) {
         return a;
     }
     return b;
 }

 /*
  * Store the intersection of array1 and array2 in out, which must be distinct
  * from both inputs.
  *
  * When one input is more than 64 times larger than the other, the skewed
  * algorithm is used (smaller array first); otherwise the regular
  * (vectorized under USEAVX) intersection is used.
  */
 void array_container_intersection(const array_container_t *array1,
                                   const array_container_t *array2,
                                   array_container_t *out) {
     const int32_t card_1 = array1->cardinality;
     const int32_t card_2 = array2->cardinality;
     const int32_t smallest = minimum_int32(card_1, card_2);
     const int skew = 64;  // subject to tuning

     if (smallest > out->capacity) {
 #ifdef USEAVX
         // ask for one extra 128-bit vector worth of slots; presumably slack
         // for the vectorized kernel's stores
         array_container_grow(out, smallest + sizeof(__m128i) / sizeof(uint16_t),
           false);
 #else
         array_container_grow(out, smallest, false);
 #endif
     }

     if (card_1 * skew < card_2) {
         out->cardinality = intersect_skewed_uint16(
             array1->array, card_1, array2->array, card_2, out->array);
         return;
     }
     if (card_2 * skew < card_1) {
         out->cardinality = intersect_skewed_uint16(
             array2->array, card_2, array1->array, card_1, out->array);
         return;
     }
 #ifdef USEAVX
     out->cardinality = intersect_vector16(
         array1->array, card_1, array2->array, card_2, out->array);
 #else
     out->cardinality = intersect_uint16(array1->array, card_1,
                                         array2->array, card_2, out->array);
 #endif
 }

 /*
  * Return the number of values common to array1 and array2 without
  * materializing the intersection. The algorithm is picked exactly as in
  * array_container_intersection (skewed when the sizes differ by more than a
  * factor of 64).
  */
 int array_container_intersection_cardinality(const array_container_t *array1,
                                              const array_container_t *array2) {
     const int32_t card_1 = array1->cardinality;
     const int32_t card_2 = array2->cardinality;
     const int skew = 64;  // subject to tuning

     if (card_1 * skew < card_2) {
         return intersect_skewed_uint16_cardinality(array1->array, card_1,
                                                    array2->array, card_2);
     }
     if (card_2 * skew < card_1) {
         return intersect_skewed_uint16_cardinality(array2->array, card_2,
                                                    array1->array, card_1);
     }
 #ifdef USEAVX
     return intersect_vector16_cardinality(array1->array, card_1,
                                           array2->array, card_2);
 #else
     return intersect_uint16_cardinality(array1->array, card_1,
                                         array2->array, card_2);
 #endif
 }

 /*
  * Return true when array1 and array2 share at least one value. The skewed
  * algorithm is used when the sizes differ by more than a factor of 64.
  */
 bool array_container_intersect(const array_container_t *array1,
                                   const array_container_t *array2) {
     const int32_t card_1 = array1->cardinality;
     const int32_t card_2 = array2->cardinality;
     const int skew = 64;  // subject to tuning

     if (card_1 * skew < card_2) {
         return intersect_skewed_uint16_nonempty(
             array1->array, card_1, array2->array, card_2);
     }
     if (card_2 * skew < card_1) {
         return intersect_skewed_uint16_nonempty(
             array2->array, card_2, array1->array, card_1);
     }
     // we do not bother vectorizing
     return intersect_uint16_nonempty(array1->array, card_1,
                                      array2->array, card_2);
 }

 /*
  * Replace the content of src_1 with the intersection of src_1 and src_2.
  * The result is written over src_1's own array.
  */
 void array_container_intersection_inplace(array_container_t *src_1,
                                           const array_container_t *src_2) {
     // todo: can any of this be vectorized?
     const int32_t card_1 = src_1->cardinality;
     const int32_t card_2 = src_2->cardinality;
     const int skew = 64;  // subject to tuning
     uint16_t *target = src_1->array;

     if (card_1 * skew < card_2) {
         src_1->cardinality = intersect_skewed_uint16(
             target, card_1, src_2->array, card_2, target);
         return;
     }
     if (card_2 * skew < card_1) {
         src_1->cardinality = intersect_skewed_uint16(
             src_2->array, card_2, target, card_1, target);
         return;
     }
     src_1->cardinality = intersect_uint16(
         target, card_1, src_2->array, card_2, target);
 }

 /*
  * Write every value of cont, plus base, to vout as a 32-bit integer.
  * Values are stored with memcpy, so vout does not need to be aligned.
  * Returns the number of values written.
  */
 int array_container_to_uint32_array(void *vout, const array_container_t *cont,
                                     uint32_t base) {
     uint32_t *dst = (uint32_t *)vout;
     int written = 0;
     while (written < cont->cardinality) {
         const uint32_t value = base + cont->array[written];
         // should be compiled as a MOV on x64
         memcpy(dst + written, &value, sizeof(uint32_t));
         ++written;
     }
     return written;
 }

 /* Print the values of v to stdout as a comma-separated list in braces. */
 void array_container_printf(const array_container_t *v) {
     if (v->cardinality == 0) {
         printf("{}");
         return;
     }
     printf("{");
     printf("%d", v->array[0]);
     int next = 1;
     while (next < v->cardinality) {
         printf(",%d", v->array[next]);
         ++next;
     }
     printf("}");
 }

 /*
  * Print the values of v, each increased by base, to stdout as a
  * comma-separated list (no braces). Prints nothing for an empty container.
  */
 void array_container_printf_as_uint32_array(const array_container_t *v,
                                             uint32_t base) {
     if (v->cardinality == 0) {
         return;
     }
     printf("%u", v->array[0] + base);
     int next = 1;
     while (next < v->cardinality) {
         printf(",%u", v->array[next] + base);
         ++next;
     }
 }

 /*
  * Count the runs of consecutive values in the container: a new run starts
  * whenever a value is not exactly one more than the value before it.
  */
 int32_t array_container_number_of_runs(const array_container_t *a) {
     // Can SIMD work here?
     int32_t runs = 0;
     int32_t last = -2;  // cannot be the predecessor of any uint16_t value
     for (int32_t i = 0; i != a->cardinality; ++i) {
         const int32_t current = a->array[i];
         if (current != last + 1) {
             ++runs;
         }
         last = current;
     }
     return runs;
 }

 /*
  * Serialize the container into buf: the cardinality (truncated to 16 bits)
  * followed by the values. Returns the number of bytes written.
  */
 int32_t array_container_serialize(const array_container_t *container, char *buf) {
     const uint16_t count = (uint16_t)container->cardinality;
     const int32_t header_bytes = sizeof(count);
     const int32_t payload_bytes = sizeof(uint16_t) * container->cardinality;

     memcpy(buf, &count, header_bytes);
     if (payload_bytes) {
         memcpy(buf + header_bytes, container->array, payload_bytes);
     }

     return (header_bytes + payload_bytes);
 }

 /**
  * Copies the container's values (cardinality * 2 bytes, no header) to buf and
  * returns array_container_size_in_bytes(container), which should be the
  * number of bytes written.
  */
 int32_t array_container_write(const array_container_t *container, char *buf) {
     const size_t payload_bytes = container->cardinality * sizeof(uint16_t);
     memcpy(buf, container->array, payload_bytes);
     return array_container_size_in_bytes(container);
 }

 /*
  * Return true when every value of container1 is also in container2.
  * Both arrays are walked in step, which relies on them being sorted.
  */
 bool array_container_is_subset(const array_container_t *container1,
                                const array_container_t *container2) {
     const int32_t len_1 = container1->cardinality;
     const int32_t len_2 = container2->cardinality;
     if (len_1 > len_2) {
         return false;
     }
     int pos_1 = 0, pos_2 = 0;
     while (pos_1 < len_1 && pos_2 < len_2) {
         const uint16_t wanted = container1->array[pos_1];
         const uint16_t candidate = container2->array[pos_2];
         if (wanted < candidate) {
             return false;  // container2 has already moved past this value
         }
         if (wanted == candidate) {
             pos_1++;
         }
         pos_2++;
     }
     // success only if every value of container1 was matched
     return pos_1 == len_1;
 }

 /*
  * Load `cardinality` 16-bit values from buf into the container, growing it
  * first when needed (previous content is discarded). Returns
  * array_container_size_in_bytes(container) after the load.
  */
 int32_t array_container_read(int32_t cardinality, array_container_t *container,
                              const char *buf) {
     if (cardinality > container->capacity) {
         array_container_grow(container, cardinality, false);
     }
     container->cardinality = cardinality;

     const size_t payload_bytes = container->cardinality * sizeof(uint16_t);
     memcpy(container->array, buf, payload_bytes);

     return array_container_size_in_bytes(container);
 }

 /*
  * Number of bytes array_container_serialize() produces: a 16-bit cardinality
  * header followed by the 16-bit values.
  */
 uint32_t array_container_serialization_len(const array_container_t *container) {
     const uint32_t header_bytes = sizeof(uint16_t);
     const uint32_t payload_bytes = sizeof(uint16_t) * container->cardinality;
     return header_bytes + payload_bytes;
 }

 /*
  * Rebuild an array container from the format written by
  * array_container_serialize(): a 16-bit cardinality followed by that many
  * 16-bit values. buf_len must match exactly.
  *
  * Returns a newly allocated array_container_t (as void *), or NULL when the
  * buffer is malformed, the values are out of order, or memory runs out.
  *
  * An empty container gets a NULL array instead of going through malloc(0),
  * whose result is implementation-defined and could be mistaken for a failure.
  * This matches array_container_create_given_capacity().
  */
 void *array_container_deserialize(const char *buf, size_t buf_len) {
     uint16_t cardinality;

     if (buf_len < sizeof(cardinality)) /* cardinality stored as 16 bit */
         return (NULL);
     buf_len -= sizeof(cardinality);

     memcpy(&cardinality, buf, sizeof(cardinality));

     /* validate the payload size before allocating anything */
     const size_t len = sizeof(uint16_t) * (size_t)cardinality;
     if (len != buf_len) return (NULL);

     array_container_t *ptr =
         (array_container_t *)malloc(sizeof(array_container_t));
     if (ptr == NULL) return (NULL);

     ptr->capacity = ptr->cardinality = (uint32_t)cardinality;

     if (cardinality == 0) {
         ptr->array = NULL;  // we don't want to rely on malloc(0)
         return (ptr);
     }

     if ((ptr->array = (uint16_t *)malloc(len)) == NULL) {
         free(ptr);
         return (NULL);
     }

     memcpy(ptr->array, &buf[sizeof(cardinality)], len);

     /* Check that the values never decrease (equal neighbours pass) */
     for (int32_t i = 0, j = 0; i < ptr->cardinality; i++) {
         if (ptr->array[i] < j) {
             free(ptr->array);
             free(ptr);
             return (NULL);
         } else
             j = ptr->array[i];
     }

     return (ptr);
 }

 /*
  * Call iterator(value + base, ptr) for each value of cont, in array order.
  * Stops and returns false as soon as the callback returns false; returns
  * true when every value was visited.
  */
 bool array_container_iterate(const array_container_t *cont, uint32_t base,
                              roaring_iterator iterator, void *ptr) {
     int i = 0;
     while (i < cont->cardinality) {
         if (!iterator(cont->array[i] + base, ptr)) {
             return false;
         }
         i++;
     }
     return true;
 }

 /*
  * 64-bit flavour of array_container_iterate: the callback receives
  * high_bits | (value + base). Stops and returns false as soon as the
  * callback returns false; returns true when every value was visited.
  */
 bool array_container_iterate64(const array_container_t *cont, uint32_t base,
                                roaring_iterator64 iterator, uint64_t high_bits,
                                void *ptr) {
     int i = 0;
     while (i < cont->cardinality) {
         const uint64_t low_bits = (uint64_t)(cont->array[i] + base);
         if (!iterator(high_bits | low_bits, ptr)) {
             return false;
         }
         i++;
     }
     return true;
 }
 /* end file src/containers/array.c */
 /* begin file src/containers/bitset.c */
 /*
  * bitset.c
  *
  */
 #ifndef _POSIX_C_SOURCE
 #define _POSIX_C_SOURCE 200809L
 #endif
 #include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>


 /* Reset every bit of the container to zero and its cardinality to 0. */
 void bitset_container_clear(bitset_container_t *bitset) {
     const size_t total_bytes = sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS;
     bitset->cardinality = 0;
     memset(bitset->array, 0, total_bytes);
 }

 /* Set every bit of the container and record the full cardinality (65536). */
 void bitset_container_set_all(bitset_container_t *bitset) {
     const size_t total_bytes = sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS;
     // memset only uses the low byte of its fill value: 0xFF sets all 8 bits
     memset(bitset->array, 0xFF, total_bytes);
     bitset->cardinality = (1 << 16);
 }


 /*
  * Create a new, empty bitset container whose word array is 32-byte aligned.
  * Returns NULL if either allocation fails.
  */
 bitset_container_t *bitset_container_create(void) {
     bitset_container_t *created =
         (bitset_container_t *)malloc(sizeof(bitset_container_t));
     if (created == NULL) {
         return NULL;
     }

     // sizeof(__m256i) == 32
     created->array = (uint64_t *)roaring_bitmap_aligned_malloc(
         32, sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS);
     if (created->array != NULL) {
         bitset_container_clear(created);
         return created;
     }

     free(created);
     return NULL;
 }

 /*
  * Copy the words and the cardinality of source into dest. We assume that the
  * two containers are distinct.
  */
 void bitset_container_copy(const bitset_container_t *source,
                            bitset_container_t *dest) {
     const size_t total_bytes = sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS;
     memcpy(dest->array, source->array, total_bytes);
     dest->cardinality = source->cardinality;
 }

 /*
  * Add min, min + step, min + 2 * step, ... (while below max) to the bitset.
  *
  * When step divides 64 the same bit pattern repeats in every word, so whole
  * words are written at once; otherwise values are added one by one.
  *
  * NOTE(review): the fast path assigns the cardinality (it does not add to it)
  * and overwrites whole words when the range spans several of them, so it
  * presumably expects an empty container - confirm against callers.
  * NOTE(review): min < max looks assumed; with max == 0, (max - 1) wraps
  * around.
  */
 void bitset_container_add_from_range(bitset_container_t *bitset, uint32_t min,
                                      uint32_t max, uint16_t step) {
     if (step == 0) return;   // refuse to crash
     if ((64 % step) == 0) {  // step divides 64
         uint64_t mask = 0;   // construct the repeated mask
         // bits at the in-word positions congruent to min modulo step
         for (uint32_t value = (min % step); value < 64; value += step) {
             mask |= ((uint64_t)1 << value);
         }
         uint32_t firstword = min / 64;
         uint32_t endword = (max - 1) / 64;  // word holding the last candidate
         // number of values in the progression: ceil((max - min) / step)
         bitset->cardinality = (max - min + step - 1) / step;
         if (firstword == endword) {
             // range within one word: drop bits below min and bits >= max
             bitset->array[firstword] |=
                 mask & (((~UINT64_C(0)) << (min % 64)) &
                         ((~UINT64_C(0)) >> ((~max + 1) % 64)));
             return;
         }
         // first word: keep only the bits at or above min
         bitset->array[firstword] = mask & ((~UINT64_C(0)) << (min % 64));
         // middle words receive the full pattern
         for (uint32_t i = firstword + 1; i < endword; i++)
             bitset->array[i] = mask;
         // last word: (~max + 1) % 64 is (-max) mod 64, the number of top bits
         // to discard so that nothing at or above max is set
         bitset->array[endword] = mask & ((~UINT64_C(0)) >> ((~max + 1) % 64));
     } else {
         // general case: one insertion per value
         for (uint32_t value = min; value < max; value += step) {
             bitset_container_add(bitset, value);
         }
     }
 }

 /* Release the aligned word array (if any) and then the container itself. */
 void bitset_container_free(bitset_container_t *bitset) {
     uint64_t *words = bitset->array;
     // Jon Strabala reports that some tools complain without the NULL test
     if (words != NULL) {
         roaring_bitmap_aligned_free(words);
         bitset->array = NULL;  // pedantic
     }
     free(bitset);
 }

 /*
  * Duplicate a container.
  *
  * Allocates a new container together with a 32-byte aligned word array and
  * copies the cardinality and every word of `src` into it.
  *
  * Returns the new container, or NULL when either allocation fails (nothing is
  * leaked in that case). Allocation failure used to be checked with assert()
  * only, which disappears under NDEBUG and left a NULL dereference behind.
  */
 bitset_container_t *bitset_container_clone(const bitset_container_t *src) {
     bitset_container_t *bitset =
         (bitset_container_t *)malloc(sizeof(bitset_container_t));

     if (!bitset) {
         return NULL;
     }
     // sizeof(__m256i) == 32
     bitset->array = (uint64_t *)roaring_bitmap_aligned_malloc(
         32, sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS);
     if (!bitset->array) {
         free(bitset);
         return NULL;
     }
     bitset->cardinality = src->cardinality;
     memcpy(bitset->array, src->array,
            sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS);
     return bitset;
 }

 /* Apply bitset_set_range() to the container's words for (begin, end) and then
  * refresh the cached cardinality with a full recount. */
 void bitset_container_set_range(bitset_container_t *bitset, uint32_t begin,
                                 uint32_t end) {
     uint64_t *words = bitset->array;

     bitset_set_range(words, begin, end);
     // could be smarter than recounting every word
     bitset->cardinality = bitset_container_compute_cardinality(bitset);
 }


 /* Return true as soon as the two containers are found to share a set bit,
  * false when no word pair overlaps. */
 bool bitset_container_intersect(const bitset_container_t *src_1,
                                 const bitset_container_t *src_2) {
     // could vectorize, but this is probably already quite fast in practice
     const uint64_t * __restrict__ lhs = src_1->array;
     const uint64_t * __restrict__ rhs = src_2->array;
     int word = 0;

     while (word < BITSET_CONTAINER_SIZE_IN_WORDS) {
         if ((lhs[word] & rhs[word]) != 0) {
             return true;
         }
         word++;
     }
     return false;
 }


 #ifdef USEAVX
 /* Number of 64-bit words held by one 256-bit register. The expansion is not
  * parenthesized, so every use below wraps it in parentheses. */
 #ifndef WORDS_IN_AVX2_REG
 #define WORDS_IN_AVX2_REG sizeof(__m256i) / sizeof(uint64_t)
 #endif
 /* Get the number of bits set (force computation).
  * AVX2 variant: hands the word array, viewed as 256-bit registers, to
  * avx2_harley_seal_popcount256() together with the register count, and
  * returns the result as an int. The cached cardinality is not modified. */
 int bitset_container_compute_cardinality(const bitset_container_t *bitset) {
     return (int) avx2_harley_seal_popcount256(
         (const __m256i *)bitset->array,
         BITSET_CONTAINER_SIZE_IN_WORDS / (WORDS_IN_AVX2_REG));
 }

 #elif defined(USENEON)
 /* Get the number of bits set (force computation).
  * NEON variant: processes eight 64-bit words per iteration using four
  * independent 16-bit-lane accumulators, then widens and sums them at the end.
  * The cached cardinality is not modified. */
 int bitset_container_compute_cardinality(const bitset_container_t *bitset) {
     // Four separate accumulators, one per 128-bit load in the loop body.
     uint16x8_t n0 = vdupq_n_u16(0);
     uint16x8_t n1 = vdupq_n_u16(0);
     uint16x8_t n2 = vdupq_n_u16(0);
     uint16x8_t n3 = vdupq_n_u16(0);
     for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 8) {
         // Per load: per-byte popcount, pairwise widen to 16-bit lanes, add.
         uint64x2_t c0 = vld1q_u64(&bitset->array[i + 0]);
         n0 = vaddq_u16(n0, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c0))));
         uint64x2_t c1 = vld1q_u64(&bitset->array[i + 2]);
         n1 = vaddq_u16(n1, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c1))));
         uint64x2_t c2 = vld1q_u64(&bitset->array[i + 4]);
         n2 = vaddq_u16(n2, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c2))));
         uint64x2_t c3 = vld1q_u64(&bitset->array[i + 6]);
         n3 = vaddq_u16(n3, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c3))));
     }
     // Widen each accumulator 16 -> 32 -> 64 bits and fold into two lanes.
     uint64x2_t n = vdupq_n_u64(0);
     n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n0)));
     n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n1)));
     n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n2)));
     n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n3)));
     // Sum of the two 64-bit lanes, implicitly narrowed to int.
     return vgetq_lane_u64(n, 0) + vgetq_lane_u64(n, 1);
 }

 #else

 /* Get the number of bits set (force computation).
  * Portable variant: sums hamming() over every word, four words per pass.
  * The cached cardinality is not modified. */
 int bitset_container_compute_cardinality(const bitset_container_t *bitset) {
     const uint64_t *words = bitset->array;
     int32_t total = 0;

     for (int w = 0; w < BITSET_CONTAINER_SIZE_IN_WORDS; w += 4) {
         const uint64_t *quad = words + w;

         total += hamming(quad[0]);
         total += hamming(quad[1]);
         total += hamming(quad[2]);
         total += hamming(quad[3]);
     }
     return total;
 }

 #endif

 #ifdef USEAVX

 /* NOTE(review): BITSET_CONTAINER_FN_REPEAT and LOOP_SIZE are not referenced
  * by the macro body below; the loop uses its own `innerloop` constant. */
 #define BITSET_CONTAINER_FN_REPEAT 8
 #ifndef WORDS_IN_AVX2_REG
 #define WORDS_IN_AVX2_REG sizeof(__m256i) / sizeof(uint64_t)
 #endif
 #define LOOP_SIZE                    \
     BITSET_CONTAINER_SIZE_IN_WORDS / \
         ((WORDS_IN_AVX2_REG)*BITSET_CONTAINER_FN_REPEAT)

 /* Computes a binary operation (eg union) on bitset1 and bitset2 and write the
    result to bitsetout */
 // clang-format off
 /*
  * AVX2 flavour of BITSET_CONTAINER_FN. For a given `opname` it defines:
  *   bitset_container_<opname>_nocard   - writes the result words to dst, sets
  *                                        dst->cardinality to
  *                                        BITSET_UNKNOWN_CARDINALITY and
  *                                        returns that value;
  *   bitset_container_<opname>          - writes the result words to dst and
  *                                        stores/returns the cardinality from
  *                                        avx2_harley_seal_popcount256andstore_<opname>;
  *   bitset_container_<opname>_justcard - returns the result's cardinality
  *                                        from avx2_harley_seal_popcount256_<opname>
  *                                        without writing any words.
  * Only `opname` and `avx_intrinsic` are used here; `opsymbol` and
  * `neon_intrinsic` are ignored by this variant.
  * Operands are always passed as (src_2, src_1). That order matters for the
  * non-commutative andnot (_mm256_andnot_si256 negates its first argument),
  * so do not swap them.
  * Each pass of the _nocard loop handles eight 32-byte registers (256 bytes)
  * with unaligned loads and stores.
  */
 #define BITSET_CONTAINER_FN(opname, opsymbol, avx_intrinsic, neon_intrinsic)  \
 int bitset_container_##opname##_nocard(const bitset_container_t *src_1, \
                                        const bitset_container_t *src_2, \
                                        bitset_container_t *dst) {       \
     const uint8_t * __restrict__ array_1 = (const uint8_t *)src_1->array; \
     const uint8_t * __restrict__ array_2 = (const uint8_t *)src_2->array; \
     /* not using the blocking optimization for some reason*/            \
     uint8_t *out = (uint8_t*)dst->array;                                \
     const int innerloop = 8;                                            \
     for (size_t i = 0;                                                  \
         i < BITSET_CONTAINER_SIZE_IN_WORDS / (WORDS_IN_AVX2_REG);       \
                                                          i+=innerloop) {\
         __m256i A1, A2, AO;                                             \
         A1 = _mm256_lddqu_si256((const __m256i *)(array_1));                  \
         A2 = _mm256_lddqu_si256((const __m256i *)(array_2));                  \
         AO = avx_intrinsic(A2, A1);                                     \
         _mm256_storeu_si256((__m256i *)out, AO);                        \
         A1 = _mm256_lddqu_si256((const __m256i *)(array_1 + 32));             \
         A2 = _mm256_lddqu_si256((const __m256i *)(array_2 + 32));             \
         AO = avx_intrinsic(A2, A1);                                     \
         _mm256_storeu_si256((__m256i *)(out+32), AO);                   \
         A1 = _mm256_lddqu_si256((const __m256i *)(array_1 + 64));             \
         A2 = _mm256_lddqu_si256((const __m256i *)(array_2 + 64));             \
         AO = avx_intrinsic(A2, A1);                                     \
         _mm256_storeu_si256((__m256i *)(out+64), AO);                   \
         A1 = _mm256_lddqu_si256((const __m256i *)(array_1 + 96));             \
         A2 = _mm256_lddqu_si256((const __m256i *)(array_2 + 96));             \
         AO = avx_intrinsic(A2, A1);                                     \
         _mm256_storeu_si256((__m256i *)(out+96), AO);                   \
         A1 = _mm256_lddqu_si256((const __m256i *)(array_1 + 128));            \
         A2 = _mm256_lddqu_si256((const __m256i *)(array_2 + 128));            \
         AO = avx_intrinsic(A2, A1);                                     \
         _mm256_storeu_si256((__m256i *)(out+128), AO);                  \
         A1 = _mm256_lddqu_si256((const __m256i *)(array_1 + 160));            \
         A2 = _mm256_lddqu_si256((const __m256i *)(array_2 + 160));            \
         AO = avx_intrinsic(A2, A1);                                     \
         _mm256_storeu_si256((__m256i *)(out+160), AO);                  \
         A1 = _mm256_lddqu_si256((const __m256i *)(array_1 + 192));            \
         A2 = _mm256_lddqu_si256((const __m256i *)(array_2 + 192));            \
         AO = avx_intrinsic(A2, A1);                                     \
         _mm256_storeu_si256((__m256i *)(out+192), AO);                  \
         A1 = _mm256_lddqu_si256((const __m256i *)(array_1 + 224));            \
         A2 = _mm256_lddqu_si256((const __m256i *)(array_2 + 224));            \
         AO = avx_intrinsic(A2, A1);                                     \
         _mm256_storeu_si256((__m256i *)(out+224), AO);                  \
         out+=256;                                                       \
         array_1 += 256;                                                 \
         array_2 += 256;                                                 \
     }                                                                   \
     dst->cardinality = BITSET_UNKNOWN_CARDINALITY;                      \
     return dst->cardinality;                                            \
 }                                                                       \
 /* next, a version that updates cardinality*/                           \
 int bitset_container_##opname(const bitset_container_t *src_1,          \
                               const bitset_container_t *src_2,          \
                               bitset_container_t *dst) {                \
     const __m256i * __restrict__ array_1 = (const __m256i *) src_1->array; \
     const __m256i * __restrict__ array_2 = (const __m256i *) src_2->array; \
     __m256i *out = (__m256i *) dst->array;                              \
     dst->cardinality = (int32_t)avx2_harley_seal_popcount256andstore_##opname(array_2,\
     		array_1, out,BITSET_CONTAINER_SIZE_IN_WORDS / (WORDS_IN_AVX2_REG));\
     return dst->cardinality;                                            \
 }                                                                       \
 /* next, a version that just computes the cardinality*/                 \
 int bitset_container_##opname##_justcard(const bitset_container_t *src_1, \
                               const bitset_container_t *src_2) {        \
     const __m256i * __restrict__ data1 = (const __m256i *) src_1->array; \
     const __m256i * __restrict__ data2 = (const __m256i *) src_2->array; \
     return (int)avx2_harley_seal_popcount256_##opname(data2,                \
     		data1, BITSET_CONTAINER_SIZE_IN_WORDS / (WORDS_IN_AVX2_REG));\
 }

 #elif defined(USENEON)

 /*
  * NEON flavour of BITSET_CONTAINER_FN. For a given `opname` it defines:
  *   bitset_container_<opname>          - writes the result words to dst and
  *                                        stores/returns their popcount;
  *   bitset_container_<opname>_nocard   - writes the result words to dst, sets
  *                                        dst->cardinality to
  *                                        BITSET_UNKNOWN_CARDINALITY and
  *                                        returns that value;
  *   bitset_container_<opname>_justcard - returns the popcount of the result
  *                                        without writing any words.
  * Only `opname` and `neon_intrinsic` are used here; `opsymbol` and
  * `avx_intrinsic` are ignored by this variant. Operands are passed to the
  * intrinsic as (src_1 words, src_2 words).
  * Each loop pass handles eight 64-bit words as four 128-bit vectors; the
  * counting versions keep four 16-bit-lane accumulators that are widened and
  * summed after the loop.
  */
 #define BITSET_CONTAINER_FN(opname, opsymbol, avx_intrinsic, neon_intrinsic)  \
 int bitset_container_##opname(const bitset_container_t *src_1,                \
                               const bitset_container_t *src_2,                \
                               bitset_container_t *dst) {                      \
     const uint64_t * __restrict__ array_1 = src_1->array;                     \
     const uint64_t * __restrict__ array_2 = src_2->array;                     \
     uint64_t *out = dst->array;                                               \
     uint16x8_t n0 = vdupq_n_u16(0);                                           \
     uint16x8_t n1 = vdupq_n_u16(0);                                           \
     uint16x8_t n2 = vdupq_n_u16(0);                                           \
     uint16x8_t n3 = vdupq_n_u16(0);                                           \
     for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 8) {          \
         uint64x2_t c0 = neon_intrinsic(vld1q_u64(&array_1[i + 0]),            \
                                        vld1q_u64(&array_2[i + 0]));           \
         n0 = vaddq_u16(n0, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c0))));   \
         vst1q_u64(&out[i + 0], c0);                                           \
         uint64x2_t c1 = neon_intrinsic(vld1q_u64(&array_1[i + 2]),            \
                                        vld1q_u64(&array_2[i + 2]));           \
         n1 = vaddq_u16(n1, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c1))));   \
         vst1q_u64(&out[i + 2], c1);                                           \
         uint64x2_t c2 = neon_intrinsic(vld1q_u64(&array_1[i + 4]),            \
                                        vld1q_u64(&array_2[i + 4]));           \
         n2 = vaddq_u16(n2, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c2))));   \
         vst1q_u64(&out[i + 4], c2);                                           \
         uint64x2_t c3 = neon_intrinsic(vld1q_u64(&array_1[i + 6]),            \
                                        vld1q_u64(&array_2[i + 6]));           \
         n3 = vaddq_u16(n3, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c3))));   \
         vst1q_u64(&out[i + 6], c3);                                           \
     }                                                                         \
     uint64x2_t n = vdupq_n_u64(0);                                            \
     n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n0)));                           \
     n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n1)));                           \
     n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n2)));                           \
     n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n3)));                           \
     dst->cardinality = vgetq_lane_u64(n, 0) + vgetq_lane_u64(n, 1);           \
     return dst->cardinality;                                                  \
 }                                                                             \
 int bitset_container_##opname##_nocard(const bitset_container_t *src_1,       \
                                        const bitset_container_t *src_2,       \
                                              bitset_container_t *dst) {       \
     const uint64_t * __restrict__ array_1 = src_1->array;                     \
     const uint64_t * __restrict__ array_2 = src_2->array;                     \
     uint64_t *out = dst->array;                                               \
     for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 8) {          \
         vst1q_u64(&out[i + 0], neon_intrinsic(vld1q_u64(&array_1[i + 0]),     \
                                               vld1q_u64(&array_2[i + 0])));   \
         vst1q_u64(&out[i + 2], neon_intrinsic(vld1q_u64(&array_1[i + 2]),     \
                                               vld1q_u64(&array_2[i + 2])));   \
         vst1q_u64(&out[i + 4], neon_intrinsic(vld1q_u64(&array_1[i + 4]),     \
                                               vld1q_u64(&array_2[i + 4])));   \
         vst1q_u64(&out[i + 6], neon_intrinsic(vld1q_u64(&array_1[i + 6]),     \
                                               vld1q_u64(&array_2[i + 6])));   \
     }                                                                         \
     dst->cardinality = BITSET_UNKNOWN_CARDINALITY;                            \
     return dst->cardinality;                                                  \
 }                                                                             \
 int bitset_container_##opname##_justcard(const bitset_container_t *src_1,     \
                                          const bitset_container_t *src_2) {   \
     const uint64_t * __restrict__ array_1 = src_1->array;                     \
     const uint64_t * __restrict__ array_2 = src_2->array;                     \
     uint16x8_t n0 = vdupq_n_u16(0);                                           \
     uint16x8_t n1 = vdupq_n_u16(0);                                           \
     uint16x8_t n2 = vdupq_n_u16(0);                                           \
     uint16x8_t n3 = vdupq_n_u16(0);                                           \
     for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 8) {          \
         uint64x2_t c0 = neon_intrinsic(vld1q_u64(&array_1[i + 0]),            \
                                        vld1q_u64(&array_2[i + 0]));           \
         n0 = vaddq_u16(n0, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c0))));   \
         uint64x2_t c1 = neon_intrinsic(vld1q_u64(&array_1[i + 2]),            \
                                        vld1q_u64(&array_2[i + 2]));           \
         n1 = vaddq_u16(n1, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c1))));   \
         uint64x2_t c2 = neon_intrinsic(vld1q_u64(&array_1[i + 4]),            \
                                        vld1q_u64(&array_2[i + 4]));           \
         n2 = vaddq_u16(n2, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c2))));   \
         uint64x2_t c3 = neon_intrinsic(vld1q_u64(&array_1[i + 6]),            \
                                        vld1q_u64(&array_2[i + 6]));           \
         n3 = vaddq_u16(n3, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c3))));   \
     }                                                                         \
     uint64x2_t n = vdupq_n_u64(0);                                            \
     n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n0)));                           \
     n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n1)));                           \
     n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n2)));                           \
     n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n3)));                           \
     return vgetq_lane_u64(n, 0) + vgetq_lane_u64(n, 1);                       \
 }

 #else /* neither USEAVX nor USENEON */

 /*
  * Portable flavour of BITSET_CONTAINER_FN. For a given `opname` it defines:
  *   bitset_container_<opname>          - writes the result words to dst and
  *                                        stores/returns the sum of hamming()
  *                                        over them;
  *   bitset_container_<opname>_nocard   - writes the result words to dst, sets
  *                                        dst->cardinality to
  *                                        BITSET_UNKNOWN_CARDINALITY and
  *                                        returns that value;
  *   bitset_container_<opname>_justcard - returns the sum of hamming() over
  *                                        the result without writing words.
  * Only `opname` and `opsymbol` are used here; the two intrinsic arguments are
  * ignored. `opsymbol` is pasted between two parenthesized operands, so a
  * two-token operator such as `&~` expands to (a) &~ (b).
  * The counting versions process two words per loop pass.
  */
 #define BITSET_CONTAINER_FN(opname, opsymbol, avx_intrinsic, neon_intrinsic)  \
 int bitset_container_##opname(const bitset_container_t *src_1,            \
                               const bitset_container_t *src_2,            \
                               bitset_container_t *dst) {                  \
     const uint64_t * __restrict__ array_1 = src_1->array;                 \
     const uint64_t * __restrict__ array_2 = src_2->array;                 \
     uint64_t *out = dst->array;                                           \
     int32_t sum = 0;                                                      \
     for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 2) {      \
         const uint64_t word_1 = (array_1[i])opsymbol(array_2[i]),         \
                        word_2 = (array_1[i + 1])opsymbol(array_2[i + 1]); \
         out[i] = word_1;                                                  \
         out[i + 1] = word_2;                                              \
         sum += hamming(word_1);                                    \
         sum += hamming(word_2);                                    \
     }                                                                     \
     dst->cardinality = sum;                                               \
     return dst->cardinality;                                              \
 }                                                                         \
 int bitset_container_##opname##_nocard(const bitset_container_t *src_1,   \
                                        const bitset_container_t *src_2,   \
                                        bitset_container_t *dst) {         \
     const uint64_t * __restrict__ array_1 = src_1->array;                 \
     const uint64_t * __restrict__ array_2 = src_2->array;                 \
     uint64_t *out = dst->array;                                           \
     for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i++) {         \
         out[i] = (array_1[i])opsymbol(array_2[i]);                        \
     }                                                                     \
     dst->cardinality = BITSET_UNKNOWN_CARDINALITY;                        \
     return dst->cardinality;                                              \
 }                                                                         \
 int bitset_container_##opname##_justcard(const bitset_container_t *src_1, \
                               const bitset_container_t *src_2) {          \
     const uint64_t * __restrict__ array_1 = src_1->array;                 \
     const uint64_t * __restrict__ array_2 = src_2->array;                 \
     int32_t sum = 0;                                                      \
     for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 2) {      \
         const uint64_t word_1 = (array_1[i])opsymbol(array_2[i]),         \
                        word_2 = (array_1[i + 1])opsymbol(array_2[i + 1]); \
         sum += hamming(word_1);                                    \
         sum += hamming(word_2);                                    \
     }                                                                     \
     return sum;                                                           \
 }

 #endif

 // Instantiate the three functions (plain, _nocard, _justcard) for every
 // supported binary operation. Arguments: name suffix, scalar operator,
 // AVX2 intrinsic, NEON intrinsic.
 // we duplicate the function because other containers use the "or" term, makes API more consistent
 BITSET_CONTAINER_FN(or,    |, _mm256_or_si256, vorrq_u64)
 BITSET_CONTAINER_FN(union, |, _mm256_or_si256, vorrq_u64)

 // we duplicate the function because other containers use the "intersection" term, makes API more consistent
 BITSET_CONTAINER_FN(and,          &, _mm256_and_si256, vandq_u64)
 BITSET_CONTAINER_FN(intersection, &, _mm256_and_si256, vandq_u64)

 BITSET_CONTAINER_FN(xor,    ^,  _mm256_xor_si256,    veorq_u64)
 // andnot: the scalar operator is the two-token sequence `&~` (src_1 & ~src_2)
 BITSET_CONTAINER_FN(andnot, &~, _mm256_andnot_si256, vbicq_u64)
 // clang-format on


 /* Write the container's set bits to `vout` through the bitset_extract_setbits
  * helpers, passing `base` along, and return the helper's result as an int.
  * With USEAVX2FORDECODING the AVX2 helper is chosen for containers holding at
  * least 8192 values. */
 int bitset_container_to_uint32_array( void *vout, const bitset_container_t *cont, uint32_t base) {
 #ifdef USEAVX2FORDECODING
     if (cont->cardinality >= 8192) {  // heuristic
         return (int) bitset_extract_setbits_avx2(cont->array,
                                                  BITSET_CONTAINER_SIZE_IN_WORDS,
                                                  vout, cont->cardinality, base);
     }
 #endif
     return (int) bitset_extract_setbits(cont->array,
                                         BITSET_CONTAINER_SIZE_IN_WORDS,
                                         vout, base);
 }

 /*
  * Debug helper: print the positions of all set bits to stdout as a
  * brace-enclosed, comma-separated list.
  */
 void bitset_container_printf(const bitset_container_t * v) {
     bool need_separator = false;

     printf("{");
     for (int i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; ++i) {
         const uint32_t word_base = (uint32_t)i * 64;
         uint64_t remaining = v->array[i];

         while (remaining != 0) {
             const int bit = __builtin_ctzll(remaining);

             if (need_separator) {
                 printf(",%u",word_base + bit);
             } else {
                 printf("%u",word_base + bit);
                 need_separator = true;
             }
             remaining &= remaining - 1;  // clear the lowest set bit
         }
     }
     printf("}");
 }


 /*
  * Print the container to stdout as a comma-separated list of unsigned
  * integers, each being `base` plus the position of a set bit. No surrounding
  * braces are printed.
  */
 void bitset_container_printf_as_uint32_array(const bitset_container_t * v, uint32_t base) {
     bool need_separator = false;

     for (int i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; ++i, base += 64) {
         uint64_t remaining = v->array[i];

         while (remaining != 0) {
             const int bit = __builtin_ctzll(remaining);

             if (need_separator) {
                 printf(",%u",bit + base);
             } else {
                 printf("%u", bit + base);
                 need_separator = true;
             }
             remaining &= remaining - 1;  // clear the lowest set bit
         }
     }
 }


 // TODO: use the fast lower bound, also
 /* Count the runs of consecutive set bits by counting where runs end:
  *  - inside a word, (~w) & (w << 1) has a bit wherever a set bit is followed
  *    by a clear one;
  *  - across a word boundary, a run ends when the top bit is set and the
  *    lowest bit of the next word is clear;
  *  - a set top bit in the final word closes the last run. */
 int bitset_container_number_of_runs(bitset_container_t *b) {
   const uint64_t *words = b->array;
   const int last = BITSET_CONTAINER_SIZE_IN_WORDS - 1;
   int num_runs = 0;

   for (int i = 0; i < last; ++i) {
     const uint64_t word = words[i];
     const uint64_t next_word = words[i + 1];
     num_runs += hamming((~word) & (word << 1)) + ( (word >> 63) & ~next_word);
   }

   const uint64_t tail = words[last];
   num_runs += hamming((~tail) & (tail << 1));
   if ((tail >> 63) != 0) {
     num_runs++;
   }
   return num_runs;
 }

 /* Copy the raw word array into `buf` and return the number of bytes written.
  * `buf` must be large enough to hold the whole array. */
 int32_t bitset_container_serialize(const bitset_container_t *container, char *buf) {
   const int32_t nbytes = sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS;

   memcpy(buf, container->array, (size_t)nbytes);
   return nbytes;
 }


 /* Copy the raw word array into `buf` and return
  * bitset_container_size_in_bytes() for the container. */
 int32_t bitset_container_write(const bitset_container_t *container,
                                char *buf) {
     const size_t nbytes = BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t);

     memcpy(buf, container->array, nbytes);
     return bitset_container_size_in_bytes(container);
 }


 /* Fill the container's word array from `buf`, record the caller-supplied
  * cardinality as is (it is not recomputed), and return
  * bitset_container_size_in_bytes() for the container. */
 int32_t bitset_container_read(int32_t cardinality, bitset_container_t *container,
                               const char *buf) {
     const size_t nbytes = BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t);

     memcpy(container->array, buf, nbytes);
     container->cardinality = cardinality;
     return bitset_container_size_in_bytes(container);
 }

 /* Size in bytes of the word array, i.e. of a serialized bitset container. */
 uint32_t bitset_container_serialization_len(void) {
   const size_t nbytes = sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS;
   return nbytes;
 }

 /*
  * Build a new bitset container from a serialized word array.
  *
  * `buf_len` must equal the size of the word array exactly; otherwise NULL is
  * returned. NULL is also returned when an allocation fails. On success the
  * words are copied from `buf` into a freshly allocated 32-byte aligned array
  * and the cardinality is recomputed from them. The caller owns the result.
  *
  * The previous version first copied sizeof(bitset_container_t) bytes of `buf`
  * over the structure itself; both fields were overwritten immediately
  * afterwards, so that copy only put payload bytes into the pointer field for
  * a moment and has been dropped.
  */
 void* bitset_container_deserialize(const char *buf, size_t buf_len) {
   const size_t l = sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS;

   if(l != buf_len)
     return(NULL);

   bitset_container_t *ptr =
       (bitset_container_t *)malloc(sizeof(bitset_container_t));
   if (ptr == NULL)
     return(NULL);

   // sizeof(__m256i) == 32
   ptr->array = (uint64_t *) roaring_bitmap_aligned_malloc(32, l);
   if (! ptr->array) {
     free(ptr);
     return(NULL);
   }
   memcpy(ptr->array, buf, l);
   ptr->cardinality = bitset_container_compute_cardinality(ptr);

   return((void*)ptr);
 }

 /* Call `iterator(base + position, ptr)` for every set bit in increasing
  * order. Stops and returns false as soon as the callback returns false;
  * returns true after all set bits have been visited. */
 bool bitset_container_iterate(const bitset_container_t *cont, uint32_t base, roaring_iterator iterator, void *ptr) {
   for (int32_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; ++i, base += 64) {
     uint64_t remaining = cont->array[i];

     while (remaining != 0) {
       const int bit = __builtin_ctzll(remaining);

       if (!iterator(bit + base, ptr)) {
         return false;
       }
       remaining &= remaining - 1;  // clear the lowest set bit
     }
   }
   return true;
 }

 /* Same traversal as the 32-bit iterate, but each value passed to the callback
  * is `high_bits | (base + position)` widened to 64 bits. Stops and returns
  * false as soon as the callback returns false. */
 bool bitset_container_iterate64(const bitset_container_t *cont, uint32_t base, roaring_iterator64 iterator, uint64_t high_bits, void *ptr) {
   for (int32_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; ++i, base += 64) {
     uint64_t remaining = cont->array[i];

     while (remaining != 0) {
       const int bit = __builtin_ctzll(remaining);
       const uint64_t value = high_bits | (uint64_t)(bit + base);

       if (!iterator(value, ptr)) {
         return false;
       }
       remaining &= remaining - 1;  // clear the lowest set bit
     }
   }
   return true;
 }


 /* Return true when both containers hold exactly the same bits.
  * When both cardinalities are known they are used as a shortcut: different
  * counts mean "not equal", and a shared count of 0x10000 is treated as equal
  * without looking at the words. Otherwise the word arrays are compared. */
 bool bitset_container_equals(const bitset_container_t *container1, const bitset_container_t *container2) {
   const bool both_known =
       (container1->cardinality != BITSET_UNKNOWN_CARDINALITY) &&
       (container2->cardinality != BITSET_UNKNOWN_CARDINALITY);

   if (both_known) {
     if (container1->cardinality != container2->cardinality) {
       return false;
     }
     if (container1->cardinality == INT32_C(0x10000)) {
       return true;
     }
   }
 #ifdef USEAVX
   // Compare 32 bytes at a time; the aligned loads rely on the word arrays
   // being 32-byte aligned.
   const __m256i *lhs = (const __m256i*)container1->array;
   const __m256i *rhs = (const __m256i*)container2->array;
   const size_t nregs = BITSET_CONTAINER_SIZE_IN_WORDS*sizeof(uint64_t)/32;

   for (size_t i = 0; i < nregs; i++) {
     const __m256i a = _mm256_load_si256(lhs+i);
     const __m256i b = _mm256_load_si256(rhs+i);
     const int mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b));

     if ((uint32_t)mask != UINT32_MAX) {
       return false;
     }
   }
   return true;
 #else
   return memcmp(container1->array,
                 container2->array,
                 BITSET_CONTAINER_SIZE_IN_WORDS*sizeof(uint64_t)) == 0;
 #endif
 }

 /* Return true when every bit set in container1 is also set in container2.
  * If both cardinalities are known and container1 holds more values, the
  * answer is false without scanning the words. */
 bool bitset_container_is_subset(const bitset_container_t *container1,
                           const bitset_container_t *container2) {
     const bool both_known =
         (container1->cardinality != BITSET_UNKNOWN_CARDINALITY) &&
         (container2->cardinality != BITSET_UNKNOWN_CARDINALITY);

     if (both_known && container1->cardinality > container2->cardinality) {
         return false;
     }
     for (int32_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; ++i) {
         // any bit present in container1 but missing from container2?
         if ((container1->array[i] & ~container2->array[i]) != 0) {
             return false;
         }
     }
     return true;
 }

 /*
  * Look for the element of rank `rank` in this container.
  *
  * `*start_rank` is a running rank counter (presumably the number of values in
  * the containers already skipped -- confirm against callers). If `rank` is
  * not reached within this container, the container's cardinality is added to
  * `*start_rank` and false is returned. Otherwise `*element` receives the bit
  * position of the matching value and true is returned; `*start_rank` has then
  * been advanced up to `rank`.
  */
 bool bitset_container_select(const bitset_container_t *container, uint32_t *start_rank, uint32_t rank, uint32_t *element) {
     int card = bitset_container_cardinality(container);
     if(rank >= *start_rank + card) {
         // Target lies beyond this container: skip all of it.
         *start_rank += card;
         return false;
     }
     const uint64_t *array = container->array;
     int32_t size;
     for (int i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 1) {
         size = hamming(array[i]);
         // Note the `<=`: when rank == *start_rank + size the target is not in
         // this word; the scan below then merely advances *start_rank by
         // `size`, which has the same net effect as the else branch.
         if(rank <= *start_rank + size) {
             uint64_t w = container->array[i];
             uint16_t base = i*64;
             // Visit the set bits of this word from lowest to highest.
             while (w != 0) {
                 uint64_t t = w & (~w + 1);   // isolate the lowest set bit
                 int r = __builtin_ctzll(w);  // its position within the word
                 if(*start_rank == rank) {
                     *element = r+base;
                     return true;
                 }
                 w ^= t;
                 *start_rank += 1;
             }
         }
         else
             *start_rank += size;  // whole word precedes the target
     }
     // Not reachable if the cardinality check above matched the word contents.
     assert(false);
     __builtin_unreachable();
 }


 /* Returns the smallest value (assumes not empty); an all-zero container
  * yields UINT16_MAX. */
 uint16_t bitset_container_minimum(const bitset_container_t *container) {
   const uint64_t *words = container->array;
   int32_t i = 0;

   // skip the leading all-zero words
   while (i < BITSET_CONTAINER_SIZE_IN_WORDS && words[i] == 0) {
     ++i;
   }
   if (i == BITSET_CONTAINER_SIZE_IN_WORDS) {
     return UINT16_MAX;
   }
   return __builtin_ctzll(words[i]) + i * 64;
 }

 /* Returns the largest value (assumes not empty); an all-zero container
  * yields 0.
  *
  * The scan runs from the last word down to and including word 0. The former
  * loop condition `i > 0` never looked at word 0, so a container whose set
  * bits were all below 64 reported 0 instead of its real maximum. */
 uint16_t bitset_container_maximum(const bitset_container_t *container) {
   for (int32_t i = BITSET_CONTAINER_SIZE_IN_WORDS - 1; i >= 0; --i ) {
     uint64_t w = container->array[i];
     if (w != 0) {
       // highest set bit of the word = 63 - number of leading zeros
       int r = __builtin_clzll(w);
       return i * 64 + 63  - r;
     }
   }
   return 0;
 }

 /* Returns the number of values equal or smaller than x */
 int bitset_container_rank(const bitset_container_t *container, uint16_t x) {
   // credit: aqrit
   const uint64_t *words = container->array;
   const int last = x / 64;
   int count = 0;

   // whole words strictly below the one containing x
   for (int i = 0; i < last; i++) {
     count += hamming(words[i]);
   }
   // Mask covering bits 0..(x % 64): bit + bit - 1 smears the bit to the
   // right. For bit 63 the unsigned sum wraps to 0, giving an all-ones mask.
   const uint64_t bit = UINT64_C(1) << (x % 64);
   const uint64_t upto = bit + bit - 1;
   count += hamming(words[last] & upto);
   return count;
 }

 /* Returns the index of the first value equal or larger than x, or -1.
  * The index returned is the position of that bit within the container. */
 int bitset_container_index_equalorlarger(const bitset_container_t *container, uint16_t x) {
   const uint32_t x32 = x;
   uint32_t k = x32 / 64;
   const int diff = x32 - k * 64; // in [0,64)
   // discard the bits below x in the first word
   uint64_t word = container->array[k] & ((~UINT64_C(0)) << diff);

   for (;;) {
     if (word != 0) {
       return k * 64 + __builtin_ctzll(word);
     }
     k++;
     if (k == BITSET_CONTAINER_SIZE_IN_WORDS) {
       return -1;
     }
     word = container->array[k];
   }
 }
 /* end file src/containers/bitset.c */
 /* begin file src/containers/containers.c */


 /* Release `container` with the free routine matching its type code. An
  * unknown type code trips an assertion. */
 void container_free(void *container, uint8_t typecode) {
     if (typecode == BITSET_CONTAINER_TYPE_CODE) {
         bitset_container_free((bitset_container_t *)container);
     } else if (typecode == ARRAY_CONTAINER_TYPE_CODE) {
         array_container_free((array_container_t *)container);
     } else if (typecode == RUN_CONTAINER_TYPE_CODE) {
         run_container_free((run_container_t *)container);
     } else if (typecode == SHARED_CONTAINER_TYPE_CODE) {
         shared_container_free((shared_container_t *)container);
     } else {
         assert(false);
         __builtin_unreachable();
     }
 }

 /* Print a container with the printf routine of its concrete type. The
  * container is first passed through container_unwrap_shared(), which may
  * replace both the pointer and the type code. */
 void container_printf(const void *container, uint8_t typecode) {
     container = container_unwrap_shared(container, &typecode);
     if (typecode == BITSET_CONTAINER_TYPE_CODE) {
         bitset_container_printf((const bitset_container_t *)container);
     } else if (typecode == ARRAY_CONTAINER_TYPE_CODE) {
         array_container_printf((const array_container_t *)container);
     } else if (typecode == RUN_CONTAINER_TYPE_CODE) {
         run_container_printf((const run_container_t *)container);
     } else {
         __builtin_unreachable();
     }
 }

 /* Print a container as a list of 32-bit integers offset by `base`, using the
  * routine of its concrete type. The container is first passed through
  * container_unwrap_shared(), which may replace both the pointer and the type
  * code. (A duplicated, unreachable `return;` after the run-container case
  * has been removed.) */
 void container_printf_as_uint32_array(const void *container, uint8_t typecode,
                                       uint32_t base) {
     container = container_unwrap_shared(container, &typecode);
     switch (typecode) {
         case BITSET_CONTAINER_TYPE_CODE:
             bitset_container_printf_as_uint32_array(
                 (const bitset_container_t *)container, base);
             return;
         case ARRAY_CONTAINER_TYPE_CODE:
             array_container_printf_as_uint32_array(
                 (const array_container_t *)container, base);
             return;
         case RUN_CONTAINER_TYPE_CODE:
             run_container_printf_as_uint32_array(
                 (const run_container_t *)container, base);
             return;
         default:
             __builtin_unreachable();
     }
 }

 /* Serialize a container into `buf` with the routine of its concrete type and
  * return that routine's result. The container is first passed through
  * container_unwrap_shared(). An unknown type code trips an assertion. */
 int32_t container_serialize(const void *container, uint8_t typecode,
                             char *buf) {
     container = container_unwrap_shared(container, &typecode);
     if (typecode == BITSET_CONTAINER_TYPE_CODE) {
         return bitset_container_serialize(
             (const bitset_container_t *)container, buf);
     }
     if (typecode == ARRAY_CONTAINER_TYPE_CODE) {
         return array_container_serialize(
             (const array_container_t *)container, buf);
     }
     if (typecode == RUN_CONTAINER_TYPE_CODE) {
         return run_container_serialize(
             (const run_container_t *)container, buf);
     }
     assert(0);
     __builtin_unreachable();
     return (-1);
 }

 /* Returns the serialization length reported by the *_serialization_len
  * routine matching the container's concrete type. The bitset variant takes
  * no argument; the array and run variants receive the container.
  * An unknown type code trips assert(0) in debug builds. */
 uint32_t container_serialization_len(const void *container, uint8_t typecode) {
     const void *inner = container_unwrap_shared(container, &typecode);
     if (typecode == BITSET_CONTAINER_TYPE_CODE) {
         return bitset_container_serialization_len();
     }
     if (typecode == ARRAY_CONTAINER_TYPE_CODE) {
         return array_container_serialization_len(
             (const array_container_t *)inner);
     }
     if (typecode == RUN_CONTAINER_TYPE_CODE) {
         return run_container_serialization_len(
             (const run_container_t *)inner);
     }
     assert(0);
     __builtin_unreachable();
     return 0;
 }

 /* Rebuilds a container of the given type from buf (buf_len bytes) by
  * forwarding to the matching *_deserialize routine and returns its result.
  * A shared type code is not expected here: a message is printed and the
  * call is then handled like any other invalid type code (assert(0)). */
 void *container_deserialize(uint8_t typecode, const char *buf, size_t buf_len) {
     if (typecode == BITSET_CONTAINER_TYPE_CODE) {
         return bitset_container_deserialize(buf, buf_len);
     }
     if (typecode == ARRAY_CONTAINER_TYPE_CODE) {
         return array_container_deserialize(buf, buf_len);
     }
     if (typecode == RUN_CONTAINER_TYPE_CODE) {
         return run_container_deserialize(buf, buf_len);
     }
     if (typecode == SHARED_CONTAINER_TYPE_CODE) {
         printf("this should never happen.\n");
     }
     assert(0);
     __builtin_unreachable();
     return NULL;
 }

 void *get_copy_of_container(void *container, uint8_t *typecode,
                             bool copy_on_write) {
     if (copy_on_write) {
         shared_container_t *shared_container;
         if (*typecode == SHARED_CONTAINER_TYPE_CODE) {
             shared_container = (shared_container_t *)container;
             shared_container->counter += 1;
             return shared_container;
         }
         assert(*typecode != SHARED_CONTAINER_TYPE_CODE);

         if ((shared_container = (shared_container_t *)malloc(
                  sizeof(shared_container_t))) == NULL) {
             return NULL;
         }

         shared_container->container = container;
         shared_container->typecode = *typecode;

         shared_container->counter = 2;
         *typecode = SHARED_CONTAINER_TYPE_CODE;

         return shared_container;
     }  // copy_on_write
     // otherwise, no copy on write...
     const void *actualcontainer =
         container_unwrap_shared((const void *)container, typecode);
     assert(*typecode != SHARED_CONTAINER_TYPE_CODE);
     return container_clone(actualcontainer, *typecode);
 }
 /**
  * Clones a container of the given type. The container is first passed
  * through container_unwrap_shared(); the clone is produced by the
  * *_clone routine of the resulting concrete type. The returned container
  * is newly allocated and must be released by the caller.
  * If the type is still shared after unwrapping, a message is printed,
  * assert(false) fires in debug builds and NULL is returned.
  */
 void *container_clone(const void *container, uint8_t typecode) {
     const void *inner = container_unwrap_shared(container, &typecode);
     if (typecode == BITSET_CONTAINER_TYPE_CODE) {
         return bitset_container_clone((const bitset_container_t *)inner);
     }
     if (typecode == ARRAY_CONTAINER_TYPE_CODE) {
         return array_container_clone((const array_container_t *)inner);
     }
     if (typecode == RUN_CONTAINER_TYPE_CODE) {
         return run_container_clone((const run_container_t *)inner);
     }
     if (typecode == SHARED_CONTAINER_TYPE_CODE) {
         printf("shared containers are not clonable\n");
         assert(false);
         return NULL;
     }
     assert(false);
     __builtin_unreachable();
     return NULL;
 }

 /* Gives up one reference on a shared container and returns a container
  * for the caller, writing its concrete type to *typecode.
  * If this was the last reference, the wrapped container itself is handed
  * over and the shared wrapper is freed; otherwise a clone of the wrapped
  * container is returned and the wrapper stays alive. */
 void *shared_container_extract_copy(shared_container_t *container,
                                     uint8_t *typecode) {
     assert(container->counter > 0);
     assert(container->typecode != SHARED_CONTAINER_TYPE_CODE);
     container->counter--;
     *typecode = container->typecode;

     if (container->counter != 0) {
         // other holders remain: hand out an independent clone
         void *copy = container_clone(container->container, *typecode);
         assert(*typecode != SHARED_CONTAINER_TYPE_CODE);
         return copy;
     }

     // last holder: steal the payload and drop the wrapper
     void *payload = container->container;
     container->container = NULL;  // paranoid
     free(container);
     assert(*typecode != SHARED_CONTAINER_TYPE_CODE);
     return payload;
 }

 /* Drops one reference on a shared container. When the counter reaches
  * zero, the wrapped container is released with container_free() and the
  * wrapper itself is freed; otherwise nothing else happens. */
 void shared_container_free(shared_container_t *container) {
     assert(container->counter > 0);
     if (--container->counter != 0) {
         return;  // still referenced elsewhere
     }
     assert(container->typecode != SHARED_CONTAINER_TYPE_CODE);
     container_free(container->container, container->typecode);
     container->container = NULL;  // paranoid
     free(container);
 }

 /* end file src/containers/containers.c */
 /* begin file src/containers/convert.c */
 #include <stdio.h>


 // file contains grubby stuff that must know impl. details of all container
 // types.
 /* Builds a new bitset container and sets in it, one by one, every value
  * stored in the array container `a`. The caller owns the result. */
 bitset_container_t *bitset_container_from_array(const array_container_t *a) {
     bitset_container_t *bits = bitset_container_create();
     const int count = array_container_cardinality(a);
     int idx = 0;
     while (idx < count) {
         bitset_container_set(bits, a->array[idx]);
         ++idx;
     }
     return bits;
 }

 /* Builds a new bitset container holding the values of the run container
  * `arr`: each run (value, length) is written with bitset_set_lenrange()
  * and the cardinality is copied from run_container_cardinality().
  * The caller owns the result. */
 bitset_container_t *bitset_container_from_run(const run_container_t *arr) {
     const int total = run_container_cardinality(arr);
     bitset_container_t *bits = bitset_container_create();
     int idx = 0;
     while (idx < arr->n_runs) {
         const rle16_t run = arr->runs[idx];
         bitset_set_lenrange(bits->array, run.value, run.length);
         ++idx;
     }
     bits->cardinality = total;
     return bits;
 }

 /* Builds a new array container, sized for the cardinality of `arr`, and
  * fills it with every value of every run in order. Each run covers
  * value .. value + length inclusive. The caller owns the result. */
 array_container_t *array_container_from_run(const run_container_t *arr) {
     array_container_t *out =
         array_container_create_given_capacity(run_container_cardinality(arr));
     out->cardinality = 0;
     int idx = 0;
     while (idx < arr->n_runs) {
         const int first = arr->runs[idx].value;
         const int last = first + arr->runs[idx].length;
         // int counter: a run may end at 0xFFFF
         int v = first;
         while (v <= last) {
             out->array[out->cardinality++] = (uint16_t)v;
             ++v;
         }
         ++idx;
     }
     return out;
 }

 /* Builds a new array container whose capacity and cardinality equal the
  * cardinality recorded in `bits`, then extracts the set bits of the bitset
  * into its array. The caller owns the result. */
 array_container_t *array_container_from_bitset(const bitset_container_t *bits) {
     array_container_t *out =
         array_container_create_given_capacity(bits->cardinality);
     out->cardinality = bits->cardinality;
     // The scalar extractor is used on purpose: the SSE variant
     // (bitset_extract_setbits_sse_uint16) ends up being slower here
     // because of the sparsity of the data.
     bitset_extract_setbits_uint16(bits->array, BITSET_CONTAINER_SIZE_IN_WORDS,
                                   out->array, 0);
     return out;
 }

 /* Appends the run [s,e] (both inclusive) to r, stored as start value s and
  * length e - s. No capacity check is made: r must already have room. */
 static void add_run(run_container_t *r, int s, int e) {
     rle16_t *slot = &r->runs[r->n_runs];
     slot->value = (uint16_t)s;
     slot->length = (uint16_t)(e - s);
     r->n_runs += 1;
 }

 /* Builds a new run container from the array container `c` by grouping
  * consecutive values (each one exactly one more than its predecessor) into
  * runs. The result is sized from array_container_number_of_runs(c); an
  * empty array yields an empty run container. The caller owns the result. */
 run_container_t *run_container_from_array(const array_container_t *c) {
     const int32_t n_runs = array_container_number_of_runs(c);
     run_container_t *answer = run_container_create_given_capacity(n_runs);
     const int32_t card = c->cardinality;
     if (card == 0) {
         return answer;
     }

     // the first value opens the first run
     int run_first = c->array[0];
     int last_seen = run_first;
     for (int i = 1; i < card; ++i) {
         const int value = c->array[i];
         if (value != last_seen + 1) {
             // gap: close the current run and open a new one
             add_run(answer, run_first, last_seen);
             run_first = value;
         }
         last_seen = value;
     }
     // close the run that was still open at the end of the array
     add_run(answer, run_first, last_seen);
     return answer;
 }

 /**
  * Convert the run container to either an array container or a bitset
  * container, depending on the cardinality: an array container when
  * card <= DEFAULT_MAX_SIZE, a bitset container otherwise.
  *
  * `card` is expected to be the cardinality of `r`; it sizes the array
  * container and is stored as the cardinality of the bitset container.
  * The type code of the result is written to *resulttype.
  *
  * Allocates and returns a new container, which the caller is responsible
  * for freeing. The run container itself is NOT freed here.
  */

 void *convert_to_bitset_or_array_container(run_container_t *r, int32_t card,
                                            uint8_t *resulttype) {
     if (card <= DEFAULT_MAX_SIZE) {
         array_container_t *answer = array_container_create_given_capacity(card);
         answer->cardinality = 0;
         for (int rlepos = 0; rlepos < r->n_runs; ++rlepos) {
             // The bounds are kept as int on purpose: with a uint16_t counter
             // the condition "run_value <= run_end" can never become false
             // for a run ending at 0xFFFF (the counter wraps to 0), which
             // would loop forever and write past the end of the array.
             const int run_start = r->runs[rlepos].value;
             const int run_end = run_start + r->runs[rlepos].length;
             for (int run_value = run_start; run_value <= run_end;
                  ++run_value) {
                 answer->array[answer->cardinality++] = (uint16_t)run_value;
             }
         }
         assert(card == answer->cardinality);
         *resulttype = ARRAY_CONTAINER_TYPE_CODE;
         return answer;
     }
     bitset_container_t *answer = bitset_container_create();
     for (int rlepos = 0; rlepos < r->n_runs; ++rlepos) {
         uint16_t run_start = r->runs[rlepos].value;
         bitset_set_lenrange(answer->array, run_start, r->runs[rlepos].length);
     }
     answer->cardinality = card;
     *resulttype = BITSET_CONTAINER_TYPE_CODE;
     return answer;
 }

 /* Converts a run container to an array or a bitset container, but only if
  * that saves space according to the serialized sizes.
  *
  * If the run form is not larger than the smaller of the array and bitset
  * forms, `c` itself is returned and *typecode_after is the run type code.
  * Otherwise a new container is allocated (array when the cardinality is at
  * most DEFAULT_MAX_SIZE, bitset otherwise) and returned. `c` is not freed
  * here: after a conversion the caller must free the original container and
  * becomes responsible for the new one. */
 void *convert_run_to_efficient_container(run_container_t *c,
                                          uint8_t *typecode_after) {
     const int32_t run_bytes =
         run_container_serialized_size_in_bytes(c->n_runs);
     const int32_t bitset_bytes = bitset_container_serialized_size_in_bytes();
     const int32_t card = run_container_cardinality(c);
     const int32_t array_bytes = array_container_serialized_size_in_bytes(card);

     int32_t best_non_run = array_bytes;
     if (bitset_bytes < array_bytes) {
         best_non_run = bitset_bytes;
     }
     if (run_bytes <= best_non_run) {
         // the run form is already the most compact: keep it
         *typecode_after = RUN_CONTAINER_TYPE_CODE;
         return c;
     }

     if (card > DEFAULT_MAX_SIZE) {
         // too many values for an array: expand into a bitset
         bitset_container_t *bits = bitset_container_create();
         int idx = 0;
         while (idx < c->n_runs) {
             const int first = c->runs[idx].value;
             const int last = first + c->runs[idx].length;
             bitset_set_range(bits->array, first, last + 1);
             ++idx;
         }
         bits->cardinality = card;
         *typecode_after = BITSET_CONTAINER_TYPE_CODE;
         return bits;
     }

     // expand every run into individual values of an array container
     array_container_t *arr = array_container_create_given_capacity(card);
     arr->cardinality = 0;
     int idx = 0;
     while (idx < c->n_runs) {
         const int first = c->runs[idx].value;
         const int last = first + c->runs[idx].length;
         int v = first;
         while (v <= last) {
             arr->array[arr->cardinality++] = (uint16_t)v;
             ++v;
         }
         ++idx;
     }
     *typecode_after = ARRAY_CONTAINER_TYPE_CODE;
     return arr;
 }

 // Same as convert_run_to_efficient_container, except that the input run
 // container is freed whenever a different container is returned.
 void *convert_run_to_efficient_container_and_free(run_container_t *c,
                                                   uint8_t *typecode_after) {
     void *converted = convert_run_to_efficient_container(c, typecode_after);
     if (converted == c) {
         return converted;  // no conversion happened, c is still in use
     }
     run_container_free(c);
     return converted;
 }

 /* Re-encodes container c (of type typecode_original) in whichever form the
  * serialized-size comparison below prefers, and writes the resulting type
  * to *typecode_after.
  *
  *  - run input:    delegated to convert_run_to_efficient_container().
  *  - array input:  converted to a run container when the run form is
  *                  strictly smaller than the array form.
  *  - bitset input: converted to a run container when the run form is
  *                  strictly smaller than the bitset form.
  *
  * When no conversion happens, c itself is returned. Once converted, the
  * original container is disposed here, rather than in roaring_array, and
  * the newly allocated container is returned.
  * Any other type code trips assert(false).
  */

 // TODO: split into run-  array-  and bitset-  subfunctions for sanity;
 // a few function calls won't really matter.

 void *convert_run_optimize(void *c, uint8_t typecode_original,
                            uint8_t *typecode_after) {
     if (typecode_original == RUN_CONTAINER_TYPE_CODE) {
         void *newc = convert_run_to_efficient_container((run_container_t *)c,
                                                         typecode_after);
         // a different pointer means a conversion took place: drop the input
         if (newc != c) {
             container_free(c, typecode_original);
         }
         return newc;
     } else if (typecode_original == ARRAY_CONTAINER_TYPE_CODE) {
         // it might need to be converted to a run container.
         array_container_t *c_qua_array = (array_container_t *)c;
         int32_t n_runs = array_container_number_of_runs(c_qua_array);
         int32_t size_as_run_container =
             run_container_serialized_size_in_bytes(n_runs);
         int32_t card = array_container_cardinality(c_qua_array);
         int32_t size_as_array_container =
             array_container_serialized_size_in_bytes(card);

         // ties keep the array form
         if (size_as_run_container >= size_as_array_container) {
             *typecode_after = ARRAY_CONTAINER_TYPE_CODE;
             return c;
         }
         // else convert array to run container
         run_container_t *answer = run_container_create_given_capacity(n_runs);
         // prev = -2 guarantees the first value never looks like a
         // continuation; run_start = -1 marks "no run open yet"
         int prev = -2;
         int run_start = -1;

         assert(card > 0);
         for (int i = 0; i < card; ++i) {
             uint16_t cur_val = c_qua_array->array[i];
             if (cur_val != prev + 1) {
                 // new run starts; flush old one, if any
                 if (run_start != -1) add_run(answer, run_start, prev);
                 run_start = cur_val;
             }
             prev = c_qua_array->array[i];
         }
         assert(run_start >= 0);
         // now prev is the last seen value
         add_run(answer, run_start, prev);
         *typecode_after = RUN_CONTAINER_TYPE_CODE;
         array_container_free(c_qua_array);
         return answer;
     } else if (typecode_original ==
                BITSET_CONTAINER_TYPE_CODE) {  // run conversions on bitset
         // does bitset need conversion to run?
         bitset_container_t *c_qua_bitset = (bitset_container_t *)c;
         int32_t n_runs = bitset_container_number_of_runs(c_qua_bitset);
         int32_t size_as_run_container =
             run_container_serialized_size_in_bytes(n_runs);
         int32_t size_as_bitset_container =
             bitset_container_serialized_size_in_bytes();

         // ties keep the bitset form
         if (size_as_bitset_container <= size_as_run_container) {
             // no conversion needed.
             *typecode_after = BITSET_CONTAINER_TYPE_CODE;
             return c;
         }
         // bitset to runcontainer (ported from Java  RunContainer(
         // BitmapContainer bc, int nbrRuns))
         assert(n_runs > 0);  // no empty bitmaps
         run_container_t *answer = run_container_create_given_capacity(n_runs);

         // long_ctr indexes the 64-bit word currently being scanned;
         // cur_word holds that word with already-emitted runs cleared
         int long_ctr = 0;
         uint64_t cur_word = c_qua_bitset->array[0];
         // run_count is incremented below but never read in this function
         int run_count = 0;
         while (true) {
             // skip empty words, stopping at the last word
             while (cur_word == UINT64_C(0) &&
                    long_ctr < BITSET_CONTAINER_SIZE_IN_WORDS - 1)
                 cur_word = c_qua_bitset->array[++long_ctr];

             // still zero on the last word: no bits left, conversion done
             if (cur_word == UINT64_C(0)) {
                 bitset_container_free(c_qua_bitset);
                 *typecode_after = RUN_CONTAINER_TYPE_CODE;
                 return answer;
             }

             // the lowest set bit is where the next run starts
             int local_run_start = __builtin_ctzll(cur_word);
             int run_start = local_run_start + 64 * long_ctr;
             // set every bit below the lowest set bit, so the lowest zero
             // bit of the result marks the end of this run
             uint64_t cur_word_with_1s = cur_word | (cur_word - 1);

             int run_end = 0;
             // all ones: the run continues into the following word(s)
             while (cur_word_with_1s == UINT64_C(0xFFFFFFFFFFFFFFFF) &&
                    long_ctr < BITSET_CONTAINER_SIZE_IN_WORDS - 1)
                 cur_word_with_1s = c_qua_bitset->array[++long_ctr];

             // still all ones on the last word: the run reaches the end of
             // the bitset, so emit it and finish
             if (cur_word_with_1s == UINT64_C(0xFFFFFFFFFFFFFFFF)) {
                 run_end = 64 + long_ctr * 64;  // exclusive, I guess
                 add_run(answer, run_start, run_end - 1);
                 bitset_container_free(c_qua_bitset);
                 *typecode_after = RUN_CONTAINER_TYPE_CODE;
                 return answer;
             }
             // the lowest zero bit is one past the end of the run
             int local_run_end = __builtin_ctzll(~cur_word_with_1s);
             run_end = local_run_end + long_ctr * 64;
             add_run(answer, run_start, run_end - 1);
             run_count++;
             // clear the trailing block of ones, i.e. the run just emitted
             cur_word = cur_word_with_1s & (cur_word_with_1s + 1);
         }
         // not reached: the loop above only exits through its returns
         return answer;
     } else {
         assert(false);
         __builtin_unreachable();
         return NULL;
     }
 }

 /* Builds a new bitset container holding the union of the run container
  * `run` and the range min..max (both inclusive, as implied by the
  * "max - min + 1" count below). The cardinality is computed as the sum of
  * the run sizes plus the size of the range, minus the number of bits of
  * the range that the runs had already set. The caller owns the result. */
 bitset_container_t *bitset_container_from_run_range(const run_container_t *run,
                                                     uint32_t min, uint32_t max) {
     bitset_container_t *bitset = bitset_container_create();
     int32_t union_cardinality = 0;

     int32_t idx = 0;
     while (idx < run->n_runs) {
         const uint32_t first = run->runs[idx].value;
         const uint32_t len_minus_one = run->runs[idx].length;
         bitset_set_lenrange(bitset->array, first, len_minus_one);
         union_cardinality += run->runs[idx].length + 1;
         ++idx;
     }

     // add the range, discounting the part already covered by the runs
     union_cardinality += max - min + 1;
     union_cardinality -=
         bitset_lenrange_cardinality(bitset->array, min, max - min);
     bitset_set_lenrange(bitset->array, min, max - min);

     bitset->cardinality = union_cardinality;
     return bitset;
 }
 /* end file src/containers/convert.c */
 /* begin file src/containers/mixed_andnot.c */
 /*
  * mixed_andnot.c.  More methods since operation is not symmetric,
  * except no "wide" andnot , so no lazy options motivated.
  */

 #include <assert.h>
 #include <string.h>


 /* Compute the andnot of src_1 and src_2 (the values of src_1 that are not
  * in src_2) and write the result to dst, a valid array container that
  * could be the same as src_1. dst is grown first if its capacity is
  * smaller than the cardinality of src_1. */
 void array_bitset_container_andnot(const array_container_t *src_1,
                                    const bitset_container_t *src_2,
                                    array_container_t *dst) {
     // follows Java implementation as of June 2016
     if (src_1->cardinality > dst->capacity) {
         array_container_grow(dst, src_1->cardinality, false);
     }
     const int32_t total = src_1->cardinality;
     int32_t kept = 0;
     for (int idx = 0; idx < total; ++idx) {
         const uint16_t value = src_1->array[idx];
         // always store the candidate, but only keep the slot when the
         // value is absent from src_2
         dst->array[kept] = value;
         if (!bitset_container_contains(src_2, value)) {
             kept += 1;
         }
     }
     dst->cardinality = kept;
 }

 /* Compute the andnot of src_1 and src_2 and write the result to
  * src_1.
  *
  * Implemented by calling array_bitset_container_andnot() with src_1 used
  * both as the first input and as the destination. */

 void array_bitset_container_iandnot(array_container_t *src_1,
                                     const bitset_container_t *src_2) {
     array_bitset_container_andnot(src_1, src_2, src_1);
 }

 /* Compute the andnot of src_1 and src_2 and write the result to dst,
  * which does not initially hold a valid container.
  *
  * A copy of src_1 is made and the values listed in src_2 are cleared from
  * it. If the resulting cardinality is at most DEFAULT_MAX_SIZE the copy is
  * converted to an array container (and the temporary bitset freed).
  * Returns true for a bitset result; false for an array result. */

 bool bitset_array_container_andnot(const bitset_container_t *src_1,
                                    const array_container_t *src_2, void **dst) {
     // Java did this directly, but we have option of asm or avx
     bitset_container_t *work = bitset_container_create();
     bitset_container_copy(src_1, work);
     work->cardinality =
         (int32_t)bitset_clear_list(work->array, (uint64_t)work->cardinality,
                                    src_2->array, (uint64_t)src_2->cardinality);

     if (work->cardinality > DEFAULT_MAX_SIZE) {
         // dense enough to stay a bitset
         *dst = work;
         return true;
     }
     // do the required type conversion
     *dst = array_container_from_bitset(work);
     bitset_container_free(work);
     return false;
 }

 /* Compute the andnot of src_1 and src_2 in place and report the result
  * through dst (which holds no container initially).
  *
  * The values of src_2 are cleared directly from src_1. If the result is
  * still large enough to be a bitset, dst is src_1 itself. Otherwise src_1
  * is freed and dst is a new array container. In both cases the caller is
  * responsible for deallocating dst.
  * Returns true iff dst is a bitset. */

 bool bitset_array_container_iandnot(bitset_container_t *src_1,
                                     const array_container_t *src_2,
                                     void **dst) {
     *dst = src_1;
     src_1->cardinality =
         (int32_t)bitset_clear_list(src_1->array, (uint64_t)src_1->cardinality,
                                    src_2->array, (uint64_t)src_2->cardinality);

     if (src_1->cardinality > DEFAULT_MAX_SIZE) {
         return true;  // src_1 remains the result
     }
     *dst = array_container_from_bitset(src_1);
     bitset_container_free(src_1);
     return false;  // not bitset
 }

 /* Compute the andnot of src_1 (run) and src_2 (bitset) and write the
  * result to dst, which does not initially hold any container.
  *
  * If src_1 has at most DEFAULT_MAX_SIZE values, the result is built
  * directly as an array container. Otherwise a bitset is built and then
  * converted to an array container if its final cardinality turns out to
  * be at most DEFAULT_MAX_SIZE.
  * Returns true iff dst is a bitset container. */

 bool run_bitset_container_andnot(const run_container_t *src_1,
                                  const bitset_container_t *src_2, void **dst) {
     // follows the Java implementation as of June 2016
     const int card = run_container_cardinality(src_1);

     if (card > DEFAULT_MAX_SIZE) {
         // we guess it will be a bitset, though have to check the guess
         // when done
         bitset_container_t *answer = bitset_container_clone(src_2);

         // Starting from a copy of src_2: clear everything outside the runs
         // and invert everything inside them, which leaves exactly the run
         // values that src_2 does not contain.
         uint32_t cursor = 0;
         for (int32_t k = 0; k < src_1->n_runs; ++k) {
             const uint32_t start = src_1->runs[k].value;
             const uint32_t end = start + src_1->runs[k].length + 1;
             bitset_reset_range(answer->array, cursor, start);
             bitset_flip_range(answer->array, start, end);
             cursor = end;
         }
         bitset_reset_range(answer->array, cursor, (uint32_t)(1 << 16));

         answer->cardinality = bitset_container_compute_cardinality(answer);

         if (answer->cardinality > DEFAULT_MAX_SIZE) {
             *dst = answer;
             return true;  // bitset
         }
         *dst = array_container_from_bitset(answer);
         bitset_container_free(answer);
         return false;  // not bitset
     }

     // must be an array: test every value of every run against src_2
     array_container_t *answer = array_container_create_given_capacity(card);
     answer->cardinality = 0;
     for (int32_t k = 0; k < src_1->n_runs; ++k) {
         const rle16_t rle = src_1->runs[k];
         const int last = rle.value + rle.length;
         for (int v = rle.value; v <= last; ++v) {
             if (bitset_container_get(src_2, (uint16_t)v)) {
                 continue;
             }
             answer->array[answer->cardinality++] = (uint16_t)v;
         }
     }
     *dst = answer;
     return false;
 }

 /* Compute the andnot of src_1 and src_2 and write the result to dst,
  * which does not initially hold any container. The result is computed
  * with run_bitset_container_andnot() and src_1 is then freed.
  * Returns true iff dst is a bitset container (otherwise it is an array
  * container). */

 bool run_bitset_container_iandnot(run_container_t *src_1,
                                   const bitset_container_t *src_2, void **dst) {
     // dummy implementation: no true in-place algorithm, the result goes
     // into a fresh container and the input is released afterwards
     const bool result_is_bitset = run_bitset_container_andnot(src_1, src_2, dst);
     run_container_free(src_1);
     return result_is_bitset;
 }

 /* Compute the andnot of src_1 (bitset) and src_2 (run) and write the
  * result to dst, which does not initially hold any container.
  *
  * A copy of src_1 is made, the range of every run of src_2 is cleared from
  * it and the cardinality is recomputed. If that cardinality is at most
  * DEFAULT_MAX_SIZE the copy is converted to an array container.
  * Returns true iff dst is a bitset container. */

 bool bitset_run_container_andnot(const bitset_container_t *src_1,
                                  const run_container_t *src_2, void **dst) {
     // follows Java implementation
     bitset_container_t *work = bitset_container_create();
     bitset_container_copy(src_1, work);

     int32_t k = 0;
     while (k < src_2->n_runs) {
         const rle16_t rle = src_2->runs[k];
         // the end bound is one past the last value of the run
         bitset_reset_range(work->array, rle.value,
                            rle.value + rle.length + UINT32_C(1));
         ++k;
     }
     work->cardinality = bitset_container_compute_cardinality(work);

     if (work->cardinality > DEFAULT_MAX_SIZE) {
         *dst = work;
         return true;  // bitset
     }
     *dst = array_container_from_bitset(work);
     bitset_container_free(work);
     return false;  // not bitset
 }

 /* Compute the andnot of src_1 and src_2 in place and report the result
  * through dst (which holds no container initially).
  *
  * The range of every run of src_2 is cleared directly from src_1. If the
  * result is still large enough to be a bitset, dst is src_1 itself.
  * Otherwise src_1 is freed and dst is a new array container. In both
  * cases the caller is responsible for deallocating dst.
  * Returns true iff dst is a bitset. */

 bool bitset_run_container_iandnot(bitset_container_t *src_1,
                                   const run_container_t *src_2, void **dst) {
     *dst = src_1;

     int32_t k = 0;
     while (k < src_2->n_runs) {
         const rle16_t rle = src_2->runs[k];
         // the end bound is one past the last value of the run
         bitset_reset_range(src_1->array, rle.value,
                            rle.value + rle.length + UINT32_C(1));
         ++k;
     }
     src_1->cardinality = bitset_container_compute_cardinality(src_1);

     if (src_1->cardinality > DEFAULT_MAX_SIZE) {
         return true;  // src_1 remains the result
     }
     *dst = array_container_from_bitset(src_1);
     bitset_container_free(src_1);
     return false;  // not bitset
 }

 /* helper. Writes into a_out->array, in order, every value covered by the
  * runs of r except those found in a_in->array.
  *
  * a_out must be a valid array container with adequate capacity; only its
  * array is written here, the caller stores the returned count.
  * Returns the cardinality of the output container. Partly Based on Java
  * implementation Util.unsignedDifference.
  *
  * TODO: Util.unsignedDifference does not use advanceUntil.  Is it cheaper
  * to avoid advanceUntil?
  */

 static int run_array_array_subtract(const run_container_t *r,
                                     const array_container_t *a_in,
                                     array_container_t *a_out) {
     int out_card = 0;
     int32_t in_array_pos =
         -1;  // since advanceUntil always assumes we start the search AFTER this

     for (int rlepos = 0; rlepos < r->n_runs; rlepos++) {
         // [start, end) is the half-open range covered by the current run
         int32_t start = r->runs[rlepos].value;
         int32_t end = start + r->runs[rlepos].length + 1;

         // presumably yields the first index after in_array_pos whose value
         // is >= start, or the array length if none — confirm against
         // advanceUntil
         in_array_pos = advanceUntil(a_in->array, in_array_pos,
                                     a_in->cardinality, (uint16_t)start);

         if (in_array_pos >= a_in->cardinality) {  // run has no items subtracted
             for (int32_t i = start; i < end; ++i)
                 a_out->array[out_card++] = (uint16_t)i;
         } else {
             uint16_t next_nonincluded = a_in->array[in_array_pos];
             if (next_nonincluded >= end) {
                 // another case when run goes unaltered
                 for (int32_t i = start; i < end; ++i)
                     a_out->array[out_card++] = (uint16_t)i;
                 in_array_pos--;  // ensure we see this item again if necessary
             } else {
                 // copy the run while skipping each value that equals the
                 // next excluded item, advancing through a_in as matches
                 // are consumed
                 for (int32_t i = start; i < end; ++i)
                     if (i != next_nonincluded)
                         a_out->array[out_card++] = (uint16_t)i;
                     else  // 0 should ensure  we don't match
                         next_nonincluded =
                             (in_array_pos + 1 >= a_in->cardinality)
                                 ? 0
                                 : a_in->array[++in_array_pos];
                 in_array_pos--;  // see again
             }
         }
     }
     return out_card;
 }

 /* Compute the andnot of src_1 (run) and src_2 (array) and write the result
  * to dst.
  *
  * dst does not indicate a valid container initially.  Eventually it
  * can become any type of container; the return value is the type code of
  * the container stored in *dst. Three strategies are used, chosen by the
  * cardinality of src_1:
  *  - at most 32 values: the result is built as a run container and then
  *    passed to convert_run_to_efficient_container();
  *  - at most DEFAULT_MAX_SIZE values: the result is an array container
  *    filled by run_array_array_subtract();
  *  - otherwise: src_1 is expanded to a bitset from which src_2 is removed
  *    with bitset_array_container_iandnot().
  */

 int run_array_container_andnot(const run_container_t *src_1,
                                const array_container_t *src_2, void **dst) {
     // follows the Java impl as of June 2016

     int card = run_container_cardinality(src_1);
     const int arbitrary_threshold = 32;

     if (card <= arbitrary_threshold) {
         // nothing to remove: the result is a plain clone of src_1
         if (src_2->cardinality == 0) {
             *dst = run_container_clone(src_1);
             return RUN_CONTAINER_TYPE_CODE;
         }
         // Java's "lazyandNot.toEfficientContainer" thing
         run_container_t *answer = run_container_create_given_capacity(
             card + array_container_cardinality(src_2));

         // NOTE(review): runs[0] is read before the loop checks rlepos
         // against n_runs; src_1 having at least one run appears to be
         // assumed here — confirm against callers.
         int rlepos = 0;
         int xrlepos = 0;  // "x" is src_2
         rle16_t rle = src_1->runs[rlepos];
         // [start, end) is the not-yet-emitted part of the current run;
         // xstart is the current value of src_2, treated below as the
         // one-element interval [xstart, xstart + 1)
         int32_t start = rle.value;
         int32_t end = start + rle.length + 1;
         int32_t xstart = src_2->array[xrlepos];

         while ((rlepos < src_1->n_runs) && (xrlepos < src_2->cardinality)) {
             if (end <= xstart) {
                 // output the first run
                 answer->runs[answer->n_runs++] =
                     (rle16_t){.value = (uint16_t)start,
                               .length = (uint16_t)(end - start - 1)};
                 rlepos++;
                 if (rlepos < src_1->n_runs) {
                     start = src_1->runs[rlepos].value;
                     end = start + src_1->runs[rlepos].length + 1;
                 }
             } else if (xstart + 1 <= start) {
                 // exit the second run
                 xrlepos++;
                 if (xrlepos < src_2->cardinality) {
                     xstart = src_2->array[xrlepos];
                 }
             } else {
                 // xstart lies inside [start, end): emit the part of the
                 // run that precedes it, if any
                 if (start < xstart) {
                     answer->runs[answer->n_runs++] =
                         (rle16_t){.value = (uint16_t)start,
                                   .length = (uint16_t)(xstart - start - 1)};
                 }
                 // then continue after xstart, or move to the next run if
                 // xstart was the last value of this one
                 if (xstart + 1 < end) {
                     start = xstart + 1;
                 } else {
                     rlepos++;
                     if (rlepos < src_1->n_runs) {
                         start = src_1->runs[rlepos].value;
                         end = start + src_1->runs[rlepos].length + 1;
                     }
                 }
             }
         }
         // src_2 is exhausted: emit what is left of the current run, then
         // copy all remaining runs unchanged
         if (rlepos < src_1->n_runs) {
             answer->runs[answer->n_runs++] =
                 (rle16_t){.value = (uint16_t)start,
                           .length = (uint16_t)(end - start - 1)};
             rlepos++;
             if (rlepos < src_1->n_runs) {
                 memcpy(answer->runs + answer->n_runs, src_1->runs + rlepos,
                        (src_1->n_runs - rlepos) * sizeof(rle16_t));
                 answer->n_runs += (src_1->n_runs - rlepos);
             }
         }
         uint8_t return_type;
         *dst = convert_run_to_efficient_container(answer, &return_type);
         // a different pointer means a conversion took place: drop the
         // temporary run container
         if (answer != *dst) run_container_free(answer);
         return return_type;
     }
     // else it's a bitmap or array

     if (card <= DEFAULT_MAX_SIZE) {
         array_container_t *ac = array_container_create_given_capacity(card);
         // nb Java code used a generic iterator-based merge to compute
         // difference
         ac->cardinality = run_array_array_subtract(src_1, src_2, ac);
         *dst = ac;
         return ARRAY_CONTAINER_TYPE_CODE;
     }
     // the temporary bitset is either returned through dst or freed by
     // bitset_array_container_iandnot()
     bitset_container_t *ans = bitset_container_from_run(src_1);
     bool result_is_bitset = bitset_array_container_iandnot(ans, src_2, dst);
     return (result_is_bitset ? BITSET_CONTAINER_TYPE_CODE
                              : ARRAY_CONTAINER_TYPE_CODE);
 }

 /* Compute the andnot of src_1 and src_2 and write the result to dst
  * (which has no container initially). The result is computed with
  * run_array_container_andnot() into a separate container and src_1 is
  * always freed afterwards. The caller is responsible for deallocating
  * dst.
  * Returns the value returned by run_array_container_andnot(), i.e. the
  * type code of the container stored in dst. */

 int run_array_container_iandnot(run_container_t *src_1,
                                 const array_container_t *src_2, void **dst) {
     // dummy implementation same as June 2016 Java
     const int result_type = run_array_container_andnot(src_1, src_2, dst);
     run_container_free(src_1);
     return result_type;
 }

 /* Writes into dst every value of the array src_1 that is not covered by a
  * run of src_2.  dst must already be a valid array container and is allowed
  * to be src_1 itself. */

 void array_run_container_andnot(const array_container_t *src_1,
                                 const run_container_t *src_2,
                                 array_container_t *dst) {
     // the result can never hold more values than src_1 does
     if (dst->capacity < src_1->cardinality) {
         array_container_grow(dst, src_1->cardinality, false);
     }

     // no runs to subtract: the answer is a plain copy of src_1
     // (memmove because dst may alias src_1)
     if (src_2->n_runs == 0) {
         memmove(dst->array, src_1->array,
                 sizeof(uint16_t) * src_1->cardinality);
         dst->cardinality = src_1->cardinality;
         return;
     }

     int run_idx = 0;
     int32_t lo = src_2->runs[0].value;
     int32_t hi = lo + src_2->runs[0].length;  // inclusive end of current run
     int kept = 0;

     for (int pos = 0; pos < src_1->cardinality; ++pos) {
         const uint16_t candidate = src_1->array[pos];

         // skip every run that ends before the candidate
         while (candidate > hi) {
             if (run_idx + 1 < src_2->n_runs) {
                 ++run_idx;
                 lo = src_2->runs[run_idx].value;
                 hi = lo + src_2->runs[run_idx].length;
             } else {
                 // past the last run: sentinel larger than any 16-bit value
                 lo = hi = (1 << 16) + 1;
             }
         }

         // here candidate <= hi; it survives only if it precedes the run
         if (candidate < lo) {
             dst->array[kept++] = candidate;
         }
     }
     dst->cardinality = kept;
 }

 /* In-place array-minus-run difference: removes from src_1 every value that
  * is covered by a run of src_2.  The result is stored back into src_1;
  * there is no separate destination container.
  */

 void array_run_container_iandnot(array_container_t *src_1,
                                  const run_container_t *src_2) {
     // src_1 is passed as both the input and the destination
     array_run_container_andnot(src_1, src_2, src_1);
 }

 /* Computes src_1 AND NOT src_2 for two run containers.  *dst need not hold
  * a valid container on entry; on return it holds the result, whose kind is
  * chosen by convert_run_to_efficient_container_and_free.
  * Returns the type code written by that conversion.
  */

 int run_run_container_andnot(const run_container_t *src_1,
                              const run_container_t *src_2, void **dst) {
     uint8_t result_typecode;
     run_container_t *difference = run_container_create();

     run_container_andnot(src_1, src_2, difference);
     // the temporary run container is handed over to the conversion helper
     *dst = convert_run_to_efficient_container_and_free(difference,
                                                        &result_typecode);
     return result_typecode;
 }

 /* "In-place" flavour of run_run_container_andnot: the difference
  * src_1 AND NOT src_2 is computed out-of-place into *dst and src_1 is then
  * released.  The caller is responsible for deallocating *dst.
  * Returns the type code reported by run_run_container_andnot. */

 int run_run_container_iandnot(run_container_t *src_1,
                               const run_container_t *src_2, void **dst) {
     // placeholder strategy: delegate, then free the consumed input
     const int result_typecode = run_run_container_andnot(src_1, src_2, dst);
     run_container_free(src_1);
     return result_typecode;
 }

 /*
  * Array-minus-array difference: forwards to array_container_andnot to
  * compute src_1 AND NOT src_2 into dst.
  * dst is a valid array container and may be the same as src_1.
  */

 void array_array_container_andnot(const array_container_t *src_1,
                                   const array_container_t *src_2,
                                   array_container_t *dst) {
     // pure forwarding wrapper; exists for naming symmetry with the other
     // mixed-container andnot routines
     array_container_andnot(src_1, src_2, dst);
 }

 /* In-place array-minus-array difference: src_1 becomes src_1 AND NOT src_2.
  * The in-place version can always reuse the space of src_1, so no separate
  * destination is needed. */
 void array_array_container_iandnot(array_container_t *src_1,
                                    const array_container_t *src_2) {
     // src_1 is both the left operand and the destination
     array_container_andnot(src_1, src_2, src_1);
 }

 /* Computes src_1 AND NOT src_2 for two bitset containers and stores the
  * result in *dst (which holds no container on entry).
  * When the result has at most DEFAULT_MAX_SIZE values it is converted to an
  * array container and false is returned; otherwise *dst is a bitset
  * container and true is returned.
  */

 bool bitset_bitset_container_andnot(const bitset_container_t *src_1,
                                     const bitset_container_t *src_2,
                                     void **dst) {
     bitset_container_t *difference = bitset_container_create();
     const int n_values = bitset_container_andnot(src_1, src_2, difference);

     if (n_values > DEFAULT_MAX_SIZE) {
         *dst = difference;
         return true;  // dense enough to stay a bitset
     }

     // sparse result: convert to an array and drop the temporary bitset
     *dst = array_container_from_bitset(difference);
     bitset_container_free(difference);
     return false;
 }

 /* In-place bitset difference: src_1 is overwritten with src_1 AND NOT src_2.
  * If the result has more than DEFAULT_MAX_SIZE values, *dst is set to src_1
  * and true is returned.  Otherwise a new array container is produced in
  * *dst, src_1 is freed, and false is returned.  In both cases the caller is
  * responsible for deallocating *dst. */

 bool bitset_bitset_container_iandnot(bitset_container_t *src_1,
                                      const bitset_container_t *src_2,
                                      void **dst) {
     const int n_values = bitset_container_andnot(src_1, src_2, src_1);

     if (n_values > DEFAULT_MAX_SIZE) {
         *dst = src_1;
         return true;  // still a bitset, reused as-is
     }

     // too sparse for a bitset: convert and release the original
     *dst = array_container_from_bitset(src_1);
     bitset_container_free(src_1);
     return false;
 }
 /* end file src/containers/mixed_andnot.c */
 /* begin file src/containers/mixed_equal.c */

 /* Returns true iff the array container1 and the bitset container2 hold the
  * same values.  The set bits of the bitset are enumerated in increasing
  * order and matched one by one against the array entries. */
 bool array_container_equal_bitset(const array_container_t* container1,
                                   const bitset_container_t* container2) {
     // cheap rejection when the bitset's cardinality is known
     if (container2->cardinality != BITSET_UNKNOWN_CARDINALITY &&
         container2->cardinality != container1->cardinality) {
         return false;
     }

     int32_t matched = 0;
     for (int32_t word_idx = 0; word_idx < BITSET_CONTAINER_SIZE_IN_WORDS;
          ++word_idx) {
         // walk the set bits of this word, clearing the lowest one each time
         for (uint64_t word = container2->array[word_idx]; word != 0;
              word &= word - 1) {
             const uint16_t bit_value =
                 (uint16_t)(word_idx * 64 + __builtin_ctzll(word));
             if (matched >= container1->cardinality ||
                 container1->array[matched] != bit_value) {
                 return false;
             }
             ++matched;
         }
     }
     // every array entry must have been consumed
     return matched == container1->cardinality;
 }

 /* Returns true iff the run container1 and the array container2 hold the
  * same values.  After the cardinalities are found equal, only the first and
  * last array entry of each run-sized slice are compared with the run's
  * bounds.
  * NOTE(review): this is sufficient only if array containers store distinct
  * values in increasing order -- confirm against the array container
  * invariants. */
 bool run_container_equals_array(const run_container_t* container1,
                                 const array_container_t* container2) {
     if (run_container_cardinality(container1) != container2->cardinality)
         return false;

     int32_t offset = 0;  // array index where the current run should begin
     for (int run_idx = 0; run_idx < container1->n_runs; ++run_idx) {
         const uint32_t first = container1->runs[run_idx].value;
         const uint32_t span = container1->runs[run_idx].length;

         if (container2->array[offset] != first ||
             container2->array[offset + span] != first + span) {
             return false;
         }
         offset += span + 1;
     }
     return true;
 }

 bool run_container_equals_bitset(const run_container_t* container1,
                                  const bitset_container_t* container2) {

     int run_card = run_container_cardinality(container1);
     int bitset_card = (container2->cardinality != BITSET_UNKNOWN_CARDINALITY) ?
                       container2->cardinality :
                       bitset_container_compute_cardinality(container2);
     if (bitset_card != run_card) {
         return false;
     }

     for (int32_t i = 0; i < container1->n_runs; i++) {
         uint32_t begin = container1->runs[i].value;
         if (container1->runs[i].length) {
             uint32_t end = begin + container1->runs[i].length + 1;
             if (!bitset_container_contains_range(container2, begin, end)) {
                 return false;
             }
         } else {
             if (!bitset_container_contains(container2, begin)) {
                 return false;
             }
         }
     }

     return true;
 }
 /* end file src/containers/mixed_equal.c */
 /* begin file src/containers/mixed_intersection.c */
 /*
  * mixed_intersection.c
  *
  */


 /* Stores in dst the values of the array src_1 that are also set in the
  * bitset src_2.  dst may be src_1 itself. */
 void array_bitset_container_intersection(const array_container_t *src_1,
                                          const bitset_container_t *src_2,
                                          array_container_t *dst) {
     if (src_1->cardinality > dst->capacity) {
         array_container_grow(dst, src_1->cardinality, false);
     }
     // snapshot the input size: dst could be src_1
     const int32_t n_candidates = src_1->cardinality;
     int32_t kept = 0;
     for (int32_t idx = 0; idx < n_candidates; ++idx) {
         const uint16_t candidate = src_1->array[idx];
         /* Branchless on purpose: always store the candidate, then advance
          * the output cursor only when the bitset contains it.  The obvious
          * "if (contains) store" form causes many mispredicted branches when
          * membership is unpredictable (roughly 3 cycles when predictable
          * versus up to 16 when not).  See
          * https://github.com/lemire/Code-used-on-Daniel-Lemire-s-blog/blob/master/extra/bitset/c/arraybitsetintersection.c
          */
         dst->array[kept] = candidate;
         kept += bitset_container_contains(src_2, candidate);
     }
     dst->cardinality = kept;
 }

 /* Counts how many values of the array src_1 are also set in the bitset
  * src_2, without materialising the intersection. */
 int array_bitset_container_intersection_cardinality(
     const array_container_t *src_1, const bitset_container_t *src_2) {
     const int32_t n_candidates = src_1->cardinality;
     int32_t hits = 0;
     int32_t idx = 0;
     while (idx < n_candidates) {
         // the membership result is added directly to the running count
         hits += bitset_container_contains(src_2, src_1->array[idx]);
         ++idx;
     }
     return hits;
 }


 /* Returns true as soon as one value of the array src_1 is found in the
  * bitset src_2, false if they share no value. */
 bool array_bitset_container_intersect(const array_container_t *src_1,
                                       const bitset_container_t *src_2) {
     const int32_t n_candidates = src_1->cardinality;
     int idx = 0;
     while (idx < n_candidates) {
         const uint16_t candidate = src_1->array[idx];
         if (bitset_container_contains(src_2, candidate)) {
             return true;
         }
         ++idx;
     }
     return false;
 }

 /* Compute the intersection of the array src_1 and the run container src_2
  * and write the result to dst.  dst must be a valid array container and is
  * allowed to be src_1.
  *
  * On return dst->cardinality always reflects the intersection, including
  * when src_2 has no runs (the result is then empty). */
 void array_run_container_intersection(const array_container_t *src_1,
                                       const run_container_t *src_2,
                                       array_container_t *dst) {
     // a full run container keeps every value of src_1
     if (run_container_is_full(src_2)) {
         if (dst != src_1) array_container_copy(src_1, dst);
         return;
     }
     // the result can never be larger than src_1
     if (dst->capacity < src_1->cardinality) {
         array_container_grow(dst, src_1->cardinality, false);
     }
     if (src_2->n_runs == 0) {
         // Intersection with the empty set is empty.  The cardinality must be
         // reset explicitly: dst may be src_1 or a reused container that
         // still reports its previous contents.
         dst->cardinality = 0;
         return;
     }
     int32_t rlepos = 0;
     int32_t arraypos = 0;
     rle16_t rle = src_2->runs[rlepos];
     int32_t newcard = 0;
     while (arraypos < src_1->cardinality) {
         const uint16_t arrayval = src_1->array[arraypos];
         // move to the first run whose last value is >= arrayval
         while (rle.value + rle.length <
                arrayval) {  // this will frequently be false
             ++rlepos;
             if (rlepos == src_2->n_runs) {
                 dst->cardinality = newcard;
                 return;  // no runs left: we are done
             }
             rle = src_2->runs[rlepos];
         }
         if (rle.value > arrayval) {
             // arrayval lies in the gap before the run: jump ahead in the
             // array towards the start of the run
             arraypos = advanceUntil(src_1->array, arraypos, src_1->cardinality,
                                     rle.value);
         } else {
             // arrayval is inside the run; writing is safe when dst == src_1
             // because newcard <= arraypos
             dst->array[newcard] = arrayval;
             newcard++;
             arraypos++;
         }
     }
     dst->cardinality = newcard;
 }

 /* Compute the intersection of the run container src_1 and the bitset src_2
  * and write the result to *dst.  Returns true when *dst is a
  * bitset_container_t and false when it is an array_container_t.
  *
  * If *dst == src_2 on entry, in-place processing is attempted: the words of
  * src_2 are masked directly.  When that in-place path ends with an array
  * result, a new array container is stored in *dst and src_2 itself is not
  * released here.
  *
  * On allocation failure *dst is set to NULL. */
 bool run_bitset_container_intersection(const run_container_t *src_1,
                                        const bitset_container_t *src_2,
                                        void **dst) {
     // a full run container keeps every bit of src_2
     if (run_container_is_full(src_1)) {
         if (*dst != src_2) *dst = bitset_container_clone(src_2);
         return true;
     }
     const int32_t card = run_container_cardinality(src_1);
     if (card <= DEFAULT_MAX_SIZE) {
         // result can only be an array (assuming that we never make a
         // RunContainer)
         //
         // The array is sized for every value of src_1.  It must not be
         // shrunk to src_2->cardinality: the branchless loop below stores
         // each candidate at array[cardinality] before knowing whether it is
         // kept, so a tighter buffer could be written one slot past its end,
         // and a bitset whose cardinality is BITSET_UNKNOWN_CARDINALITY
         // would yield a meaningless capacity.
         array_container_t *answer = array_container_create_given_capacity(card);
         *dst = answer;
         if (*dst == NULL) {
             return false;
         }
         for (int32_t rlepos = 0; rlepos < src_1->n_runs; ++rlepos) {
             rle16_t rle = src_1->runs[rlepos];
             uint32_t endofrun = (uint32_t)rle.value + rle.length;
             for (uint32_t runValue = rle.value; runValue <= endofrun;
                  ++runValue) {
                 // store unconditionally, advance only on a hit
                 answer->array[answer->cardinality] = (uint16_t)runValue;
                 answer->cardinality +=
                     bitset_container_contains(src_2, runValue);
             }
         }
         return false;
     }
     if (*dst == src_2) {  // we attempt in-place
         bitset_container_t *answer = (bitset_container_t *)*dst;
         // clear every gap between consecutive runs, then the tail
         uint32_t start = 0;
         for (int32_t rlepos = 0; rlepos < src_1->n_runs; ++rlepos) {
             const rle16_t rle = src_1->runs[rlepos];
             uint32_t end = rle.value;
             bitset_reset_range(src_2->array, start, end);

             start = end + rle.length + 1;
         }
         bitset_reset_range(src_2->array, start, UINT32_C(1) << 16);
         answer->cardinality = bitset_container_compute_cardinality(answer);
         if (src_2->cardinality > DEFAULT_MAX_SIZE) {
             return true;
         } else {
             // too sparse for a bitset: hand back an array instead
             array_container_t *newanswer = array_container_from_bitset(src_2);
             if (newanswer == NULL) {
                 *dst = NULL;
                 return false;
             }
             *dst = newanswer;
             return false;
         }
     } else {  // no inplace
         // we expect the answer to be a bitmap (if we are lucky)
         bitset_container_t *answer = bitset_container_clone(src_2);

         *dst = answer;
         if (answer == NULL) {
             return true;
         }
         // clear every gap between consecutive runs, then the tail
         uint32_t start = 0;
         for (int32_t rlepos = 0; rlepos < src_1->n_runs; ++rlepos) {
             const rle16_t rle = src_1->runs[rlepos];
             uint32_t end = rle.value;
             bitset_reset_range(answer->array, start, end);
             start = end + rle.length + 1;
         }
         bitset_reset_range(answer->array, start, UINT32_C(1) << 16);
         answer->cardinality = bitset_container_compute_cardinality(answer);

         if (answer->cardinality > DEFAULT_MAX_SIZE) {
             return true;
         } else {
             // too sparse for a bitset: convert and free the temporary clone
             array_container_t *newanswer = array_container_from_bitset(answer);
             bitset_container_free((bitset_container_t *)*dst);
             if (newanswer == NULL) {
                 *dst = NULL;
                 return false;
             }
             *dst = newanswer;
             return false;
         }
     }
 }

 /* Counts the values shared by the array src_1 and the run container src_2
  * without building the intersection. */
 int array_run_container_intersection_cardinality(const array_container_t *src_1,
                                                  const run_container_t *src_2) {
     // a full run container matches every array value
     if (run_container_is_full(src_2)) {
         return src_1->cardinality;
     }
     if (src_2->n_runs == 0) {
         return 0;
     }

     int32_t run_idx = 0;
     int32_t array_idx = 0;
     int32_t matches = 0;
     rle16_t current = src_2->runs[run_idx];

     while (array_idx < src_1->cardinality) {
         const uint16_t candidate = src_1->array[array_idx];

         // move to the first run whose last value is >= candidate
         while (current.value + current.length < candidate) {
             if (++run_idx == src_2->n_runs) {
                 return matches;  // ran out of runs
             }
             current = src_2->runs[run_idx];
         }

         if (candidate < current.value) {
             // candidate lies in a gap: jump ahead towards the run's start
             array_idx = advanceUntil(src_1->array, array_idx,
                                      src_1->cardinality, current.value);
         } else {
             ++matches;
             ++array_idx;
         }
     }
     return matches;
 }

 /* Counts the values shared by the run container src_1 and the bitset src_2
  * by summing, for each run, the number of bits reported by
  * bitset_lenrange_cardinality over that run.
  **/
 int run_bitset_container_intersection_cardinality(
     const run_container_t *src_1, const bitset_container_t *src_2) {
     // a full run container matches every bit of src_2
     if (run_container_is_full(src_1)) {
         return bitset_container_cardinality(src_2);
     }

     int total = 0;
     int32_t run_idx = 0;
     while (run_idx < src_1->n_runs) {
         const rle16_t run = src_1->runs[run_idx];
         total +=
             bitset_lenrange_cardinality(src_2->array, run.value, run.length);
         ++run_idx;
     }
     return total;
 }


 /* Returns true iff the array src_1 and the run container src_2 share at
  * least one value.  Stops at the first common value found. */
 bool array_run_container_intersect(const array_container_t *src_1,
                                    const run_container_t *src_2) {
     // a full run container intersects any non-empty array
     if (run_container_is_full(src_2)) {
         return !array_container_empty(src_1);
     }
     if (src_2->n_runs == 0) {
         return false;
     }

     int32_t run_idx = 0;
     int32_t array_idx = 0;
     rle16_t current = src_2->runs[run_idx];

     while (array_idx < src_1->cardinality) {
         const uint16_t candidate = src_1->array[array_idx];

         // move to the first run whose last value is >= candidate
         while (current.value + current.length < candidate) {
             if (++run_idx == src_2->n_runs) {
                 return false;  // ran out of runs
             }
             current = src_2->runs[run_idx];
         }

         if (candidate >= current.value) {
             return true;  // candidate falls inside the current run
         }
         // candidate lies in a gap: jump ahead towards the run's start
         array_idx = advanceUntil(src_1->array, array_idx, src_1->cardinality,
                                  current.value);
     }
     return false;
 }

 /* Returns true iff the run container src_1 and the bitset src_2 share at
  * least one value.
  **/
 bool run_bitset_container_intersect(const run_container_t *src_1,
                                     const bitset_container_t *src_2) {
     // a full run container intersects any non-empty bitset
     if (run_container_is_full(src_1)) {
         return !bitset_container_empty(src_2);
     }

     int32_t run_idx = 0;
     while (run_idx < src_1->n_runs) {
         const rle16_t run = src_1->runs[run_idx];
         // one run with any bit set in the bitset is enough
         if (!bitset_lenrange_empty(src_2->array, run.value, run.length)) {
             return true;
         }
         ++run_idx;
     }
     return false;
 }

 /*
  * Compute the intersection between src_1 and src_2 and write the result
  * to *dst. If the return function is true, the result is a bitset_container_t
  * otherwise is a array_container_t.
  */
 bool bitset_bitset_container_intersection(const bitset_container_t *src_1,
                                           const bitset_container_t *src_2,
                                           void **dst) {
     const int newCardinality = bitset_container_and_justcard(src_1, src_2);
     if (newCardinality > DEFAULT_MAX_SIZE) {
         *dst = bitset_container_create();
         if (*dst != NULL) {
             bitset_container_and_nocard(src_1, src_2,
                                         (bitset_container_t *)*dst);
             ((bitset_container_t *)*dst)->cardinality = newCardinality;
         }
         return true;  // it is a bitset
     }
     *dst = array_container_create_given_capacity(newCardinality);
     if (*dst != NULL) {
         ((array_container_t *)*dst)->cardinality = newCardinality;
         bitset_extract_intersection_setbits_uint16(
             ((const bitset_container_t *)src_1)->array,
             ((const bitset_container_t *)src_2)->array,
             BITSET_CONTAINER_SIZE_IN_WORDS, ((array_container_t *)*dst)->array,
             0);
     }
     return false;  // not a bitset
 }

 /* In-place variant of the bitset/bitset intersection.  When the result has
  * more than DEFAULT_MAX_SIZE values, src_1 is overwritten with the
  * intersection, *dst is set to src_1 and true is returned.  Otherwise a new
  * array container is written to *dst (NULL on allocation failure), src_1 is
  * left unmodified and is not freed here, and false is returned. */
 bool bitset_bitset_container_intersection_inplace(
     bitset_container_t *src_1, const bitset_container_t *src_2, void **dst) {
     const int newCardinality = bitset_container_and_justcard(src_1, src_2);

     if (newCardinality > DEFAULT_MAX_SIZE) {
         // dense result: reuse src_1 as the output bitset
         bitset_container_and_nocard(src_1, src_2, src_1);
         src_1->cardinality = newCardinality;
         *dst = src_1;
         return true;  // it is a bitset
     }

     // sparse result: extract the common bits straight into an array
     array_container_t *values =
         array_container_create_given_capacity(newCardinality);
     *dst = values;
     if (values != NULL) {
         values->cardinality = newCardinality;
         bitset_extract_intersection_setbits_uint16(
             src_1->array, src_2->array, BITSET_CONTAINER_SIZE_IN_WORDS,
             values->array, 0);
     }
     return false;  // not a bitset
 }
 /* end file src/containers/mixed_intersection.c */
 /* begin file src/containers/mixed_negation.c */
 /*
  * mixed_negation.c
  *
  */

 #include <assert.h>
 #include <string.h>


 // TODO: make simplified and optimized negation code across
 // the full range.

 /* Negation across the entire range of the container.
  * Computes the negation of the array src and writes the result to dst.
  * The complement of a sufficiently sparse set is always dense, hence the
  * result is a bitset.
  * We assume that dst is pre-allocated and a valid bitset container.
  * There can be no in-place version.
  */
 void array_container_negation(const array_container_t *src,
                               bitset_container_t *dst) {
     // start from the full universe of 1 << 16 values ...
     const uint64_t universe_size = UINT64_C(1) << 16;
     bitset_container_set_all(dst);

     // ... then clear each value listed in src; the helper's return value
     // becomes the cardinality of the complement
     dst->cardinality = (int32_t)bitset_clear_list(
         dst->array, universe_size, src->array, (uint64_t)src->cardinality);
 }

 /* Negation across the entire range of the container.
  * Computes the negation of the bitset src and writes the result to *dst,
  * which is assumed not to be pre-allocated.
  * Returns true when the result is a bitset container and false when it is
  * an array container.
  * Implemented by forwarding to bitset_container_negation_range.
  */
 bool bitset_container_negation(const bitset_container_t *src, void **dst) {
     // range_end is 1 << 16, i.e. one past the largest 16-bit value
     return bitset_container_negation_range(src, 0, (1 << 16), dst);
 }

 /* In-place version of bitset_container_negation.
  *
  * If the output is a bitset_container_t, src is modified and no allocation
  * is made.  If the output is an array_container_t, a new container is
  * produced and the caller is responsible for freeing it.
  * In all cases the result is in *dst.  Returns true for a bitset result,
  * false for an array result.
  * Implemented by forwarding to bitset_container_negation_range_inplace.
  */
 bool bitset_container_negation_inplace(bitset_container_t *src, void **dst) {
     // range_end is 1 << 16, i.e. one past the largest 16-bit value
     return bitset_container_negation_range_inplace(src, 0, (1 << 16), dst);
 }

 /* Negation across the entire range of the container.
  * Computes the negation of the run container src and writes the result to
  * *dst, which is assumed not to be pre-allocated.
  * Returns the container type code of the result, as produced by
  * run_container_negation_range.
  */
 int run_container_negation(const run_container_t *src, void **dst) {
     // range_end is 1 << 16, i.e. one past the largest 16-bit value
     return run_container_negation_range(src, 0, (1 << 16), dst);
 }

 /*
  * Same as run_container_negation, except that if the output is to be a
  * run_container_t and src has the capacity to hold the result, then src is
  * modified and no allocation is made.
  * In all cases the result is in *dst.  Returns the container type code
  * produced by run_container_negation_range_inplace.
  */
 int run_container_negation_inplace(run_container_t *src, void **dst) {
     // range_end is 1 << 16, i.e. one past the largest 16-bit value
     return run_container_negation_range_inplace(src, 0, (1 << 16), dst);
 }

 /* Negation across a range of the container.
  * Flips membership of every value in [range_start, range_end) of the array
  * src and writes the result to a newly created container in *dst (not
  * preallocated).  Returns true if the result is a bitset container and
  * false for an array container.  An empty range yields a clone of src.
  */
 bool array_container_negation_range(const array_container_t *src,
                                     const int range_start, const int range_end,
                                     void **dst) {
     /* close port of the Java implementation */
     if (range_end <= range_start) {
         *dst = array_container_clone(src);
         return false;
     }

     // Locate the slice of src that falls inside the range.  A negative
     // search result is decoded as a miss (presumably -(insertion point) - 1
     // -- TODO confirm against binarySearch).
     int32_t first_idx =
         binarySearch(src->array, src->cardinality, (uint16_t)range_start);
     if (first_idx < 0) first_idx = -first_idx - 1;

     int32_t final_idx =
         binarySearch(src->array, src->cardinality, (uint16_t)(range_end - 1));
     if (final_idx < 0) final_idx = -final_idx - 2;

     // values currently present in the range become absent and vice versa
     const int32_t present = final_idx - first_idx + 1;
     const int32_t span = range_end - range_start;
     const int32_t absent = span - present;
     const int32_t result_card = src->cardinality + (absent - present);

     if (result_card > DEFAULT_MAX_SIZE) {
         // too dense for an array: flip the range in a bitset copy
         bitset_container_t *bits = bitset_container_from_array(src);
         bitset_flip_range(bits->array, (uint32_t)range_start,
                           (uint32_t)range_end);
         bits->cardinality = result_card;
         *dst = bits;
         return true;
     }

     array_container_t *out = array_container_create_given_capacity(result_card);
     *dst = (void *)out;
     if (result_card == 0) {
         out->cardinality = result_card;
         return false;  // nothing to copy
     }

     // values before the range are kept unchanged
     memcpy(out->array, src->array, first_idx * sizeof(uint16_t));

     // inside the range: emit each value that src does NOT contain
     int32_t write_pos = first_idx;
     int32_t read_pos = first_idx;
     for (int32_t value = range_start; value < range_end; ++value) {
         if (read_pos <= final_idx &&
             (uint16_t)value == src->array[read_pos]) {
             ++read_pos;  // present in src, so dropped from the result
         } else {
             out->array[write_pos++] = (uint16_t)value;
         }
     }

     // values after the range are kept unchanged
     memcpy(out->array + write_pos, src->array + (final_idx + 1),
            (src->cardinality - (final_idx + 1)) * sizeof(uint16_t));
     out->cardinality = result_card;
     return false;
 }

 /* "In-place" range negation for array containers.  Even when the result
  * would fit, it is unclear how to make a true in-place version without
  * inefficient copying, so the out-of-place routine is used and src is then
  * freed.  The result is in *dst; returns true for a bitset result and false
  * for an array result.
  */

 bool array_container_negation_range_inplace(array_container_t *src,
                                             const int range_start,
                                             const int range_end, void **dst) {
     // TODO : try a real inplace version
     const bool is_bitset =
         array_container_negation_range(src, range_start, range_end, dst);
     array_container_free(src);
     return is_bitset;
 }

 /* Negation across a range of the container.
  * Flips the bits of src over the range given by range_start and range_end
  * (as interpreted by bitset_flip_range) in a clone, and writes the result
  * to *dst, which is assumed not to be pre-allocated.
  * Returns true for a bitset result, false when the result was converted to
  * an array container.
  * NOTE(review): the clone is used without a NULL check here.
  */
 bool bitset_container_negation_range(const bitset_container_t *src,
                                      const int range_start, const int range_end,
                                      void **dst) {
     // TODO maybe consider density-based estimate and sometimes build the
     // result directly as an array, with conversion back to bitset if wrong.
     // Or determine the actual result cardinality first, then go directly
     // for the known final container.

     // keep computation using bitsets as long as possible.
     bitset_container_t *flipped = bitset_container_clone(src);
     bitset_flip_range(flipped->array, (uint32_t)range_start,
                       (uint32_t)range_end);
     flipped->cardinality = bitset_container_compute_cardinality(flipped);

     if (flipped->cardinality <= DEFAULT_MAX_SIZE) {
         // sparse result: convert to an array and drop the temporary bitset
         *dst = array_container_from_bitset(flipped);
         bitset_container_free(flipped);
         return false;
     }
     *dst = flipped;
     return true;
 }

 /* In-place version of bitset_container_negation_range.
  *
  * The range is flipped directly in src.  If the output is a
  * bitset_container_t, *dst is src and no allocation is made.  If the output
  * is an array_container_t, a new container is produced, src is freed, and
  * the caller is responsible for freeing the new container.
  * In all cases the result is in *dst.  Returns true for a bitset result.
  */
 bool bitset_container_negation_range_inplace(bitset_container_t *src,
                                              const int range_start,
                                              const int range_end, void **dst) {
     bitset_flip_range(src->array, (uint32_t)range_start, (uint32_t)range_end);
     src->cardinality = bitset_container_compute_cardinality(src);

     if (src->cardinality <= DEFAULT_MAX_SIZE) {
         // sparse result: convert to an array and release the bitset
         *dst = array_container_from_bitset(src);
         bitset_container_free(src);
         return false;
     } else {
         *dst = src;
         return true;
     }
 }

 /* Negation across a range of the container.
  * Flips membership over [range_start, range_end) of the run container src
  * and writes the result to *dst, which is assumed not to be pre-allocated.
  * Returns the container type code of *dst.  An empty range yields a clone
  * of src with RUN_CONTAINER_TYPE_CODE.
  */
 int run_container_negation_range(const run_container_t *src,
                                  const int range_start, const int range_end,
                                  void **dst) {
     // follows the Java implementation
     if (range_end <= range_start) {
         *dst = run_container_clone(src);
         return RUN_CONTAINER_TYPE_CODE;
     }

     // the scratch container is given room for one more run than src has
     run_container_t *flipped =
         run_container_create_given_capacity(src->n_runs + 1);

     // runs that start before the range are copied verbatim
     int run_idx = 0;
     while (run_idx < src->n_runs && src->runs[run_idx].value < range_start) {
         flipped->runs[run_idx] = src->runs[run_idx];
         flipped->n_runs++;
         ++run_idx;
     }

     // the range itself is appended as one run through the "exclusive"
     // append helper ...
     run_container_smart_append_exclusive(
         flipped, (uint16_t)range_start,
         (uint16_t)(range_end - range_start - 1));

     // ... followed by every remaining source run, through the same helper
     while (run_idx < src->n_runs) {
         run_container_smart_append_exclusive(flipped, src->runs[run_idx].value,
                                              src->runs[run_idx].length);
         ++run_idx;
     }

     uint8_t result_typecode;
     *dst = convert_run_to_efficient_container(flipped, &result_typecode);
     // when the result was converted to another kind, drop the scratch runs
     if (result_typecode != RUN_CONTAINER_TYPE_CODE) {
         run_container_free(flipped);
     }

     return result_typecode;
 }

 /*
  * In-place variant of run_container_negation_range: flips membership over
  * the range given by range_start and range_end.
  *
  * If src has spare capacity, or the boundary checks below do not send the
  * call to the fallback, src's own run array is rewritten in place.
  * Otherwise the out-of-place routine is used and src is freed.
  * In all cases the result is in *dst and the return value is its container
  * type code.  When the in-place result is converted to a non-run
  * container, src is freed as well.
  */
 int run_container_negation_range_inplace(run_container_t *src,
                                          const int range_start,
                                          const int range_end, void **dst) {
     uint8_t return_typecode;

     // empty range: src is returned untouched
     if (range_end <= range_start) {
         *dst = src;
         return RUN_CONTAINER_TYPE_CODE;
     }

     // TODO: efficient special case when range is 0 to 65535 inclusive

     if (src->capacity == src->n_runs) {
         // no excess room.  More checking to see if result can fit
         bool last_val_before_range = false;
         bool first_val_in_range = false;
         bool last_val_in_range = false;
         bool first_val_past_range = false;

         // membership just before and at the start of the range
         if (range_start > 0)
             last_val_before_range =
                 run_container_contains(src, (uint16_t)(range_start - 1));
         first_val_in_range = run_container_contains(src, (uint16_t)range_start);

         if (last_val_before_range == first_val_in_range) {
             // membership at the end of the range and just past it
             last_val_in_range =
                 run_container_contains(src, (uint16_t)(range_end - 1));
             if (range_end != 0x10000)
                 first_val_past_range =
                     run_container_contains(src, (uint16_t)range_end);

             // membership is unchanged across both boundaries: fall back to
             // the allocating version and release src
             if (last_val_in_range ==
                 first_val_past_range) {  // no space for inplace
                 int ans = run_container_negation_range(src, range_start,
                                                        range_end, dst);
                 run_container_free(src);
                 return ans;
             }
         }
     }
     // all other cases: result will fit

     // ans aliases src: the run array is rebuilt over itself, so the
     // original run count is saved before n_runs is reset
     run_container_t *ans = src;
     int my_nbr_runs = src->n_runs;

     ans->n_runs = 0;
     int k = 0;
     // runs starting before the range stay where they are
     for (; (k < my_nbr_runs) && (src->runs[k].value < range_start); ++k) {
         // ans->runs[k] = src->runs[k]; (would be self-copy)
         ans->n_runs++;
     }

     // as with Java implementation, use locals to give self a buffer of depth 1
     // (the next source run is saved before the appends below are made to
     // the same array)
     rle16_t buffered = (rle16_t){.value = (uint16_t)0, .length = (uint16_t)0};
     rle16_t next = buffered;
     if (k < my_nbr_runs) buffered = src->runs[k];

     // append the range itself as one run through the "exclusive" helper
     run_container_smart_append_exclusive(
         ans, (uint16_t)range_start, (uint16_t)(range_end - range_start - 1));

     // append the remaining source runs, always reading one run ahead
     for (; k < my_nbr_runs; ++k) {
         if (k + 1 < my_nbr_runs) next = src->runs[k + 1];

         run_container_smart_append_exclusive(ans, buffered.value,
                                              buffered.length);
         buffered = next;
     }

     *dst = convert_run_to_efficient_container(ans, &return_typecode);
     // ans is src: it is freed when the result became another container kind
     if (return_typecode != RUN_CONTAINER_TYPE_CODE) run_container_free(ans);

     return return_typecode;
 }
 /* end file src/containers/mixed_negation.c */
 /* begin file src/containers/mixed_subset.c */

 /*
  * Tells whether every value of the array container container1 is also
  * present in the bitset container container2.
  */
 bool array_container_is_subset_bitset(const array_container_t* container1,
                                       const bitset_container_t* container2) {
     // Quick rejection: a bitset known to hold fewer values cannot contain
     // the whole array.
     const bool card_known =
         container2->cardinality != BITSET_UNKNOWN_CARDINALITY;
     if (card_known && container2->cardinality < container1->cardinality) {
         return false;
     }
     int pos = 0;
     while (pos < container1->cardinality) {
         if (!bitset_container_contains(container2, container1->array[pos])) {
             return false;
         }
         ++pos;
     }
     return true;
 }

 /*
  * Returns true iff every value covered by the runs of container1 is present
  * in the array container container2.
  *
  * For each run [start, stop] the array is searched (from the position
  * reached by the previous run) for start and for stop. The run is contained
  * only if both endpoints are found and the distance between their positions
  * equals the length of the run.
  */
 bool run_container_is_subset_array(const run_container_t* container1,
                                    const array_container_t* container2) {
     if (run_container_cardinality(container1) > container2->cardinality)
         return false;
     int32_t start_pos = -1, stop_pos = -1;
     for (int i = 0; i < container1->n_runs; ++i) {
         int32_t start = container1->runs[i].value;
         int32_t stop = start + container1->runs[i].length;
         start_pos = advanceUntil(container2->array, stop_pos,
                                  container2->cardinality, start);
         stop_pos = advanceUntil(container2->array, stop_pos,
                                 container2->cardinality, stop);
         // Both searches begin at the same position and stop >= start, so
         // stop_pos is never smaller than start_pos. Testing stop_pos against
         // the cardinality therefore also covers start_pos running off the
         // end, and guarantees that array[stop_pos] below is a valid element.
         if (stop_pos == container2->cardinality) {
             return false;
         }
         if (stop_pos - start_pos != stop - start ||
             container2->array[start_pos] != start ||
             container2->array[stop_pos] != stop) {
             return false;
         }
     }
     return true;
 }

 /*
  * Returns true iff every value of the array container container1 falls
  * inside one of the runs of container2. Both inputs are walked in lockstep.
  */
 bool array_container_is_subset_run(const array_container_t* container1,
                                    const run_container_t* container2) {
     if (container1->cardinality > run_container_cardinality(container2))
         return false;
     int apos = 0;
     int rpos = 0;
     while (apos < container1->cardinality && rpos < container2->n_runs) {
         const uint32_t run_first = container2->runs[rpos].value;
         const uint32_t run_last = run_first + container2->runs[rpos].length;
         const uint32_t val = container1->array[apos];
         if (val < run_first) {
             // value lies in a gap before the current run
             return false;
         }
         if (val > run_last) {
             ++rpos;  // value may belong to a later run
         } else {
             ++apos;  // value is covered by the current run
         }
     }
     // success only if every array value was matched to a run
     return apos == container1->cardinality;
 }

 /*
  * Returns true iff every value covered by the runs of container1 is set in
  * the bitset container container2.
  */
 bool run_container_is_subset_bitset(const run_container_t* container1,
                                     const bitset_container_t* container2) {
     // todo: this code could be much faster
     // Use the stored cardinality when available, otherwise count the bits
     // (container2 itself is not updated).
     int32_t bitset_card = container2->cardinality;
     if (bitset_card == BITSET_UNKNOWN_CARDINALITY) {
         bitset_card = bitset_container_compute_cardinality(container2);
     }
     if (bitset_card < run_container_cardinality(container1)) {
         return false;
     }
     for (int r = 0; r < container1->n_runs; ++r) {
         const uint32_t first = container1->runs[r].value;
         const uint32_t last = first + container1->runs[r].length;
         // the run covers first..last inclusive
         for (uint32_t v = first; v <= last; ++v) {
             if (!bitset_container_contains(container2, v)) {
                 return false;
             }
         }
     }
     return true;
 }

 /*
  * Returns true iff every bit set in the bitset container container1 falls
  * inside one of the runs of container2. Words of the bitset and runs are
  * consumed in lockstep, lowest set bit first.
  */
 bool bitset_container_is_subset_run(const bitset_container_t* container1,
                                     const run_container_t* container2) {
     // todo: this code could be much faster
     const bool card_known =
         container1->cardinality != BITSET_UNKNOWN_CARDINALITY;
     if (card_known &&
         container1->cardinality > run_container_cardinality(container2)) {
         return false;
     }
     int32_t word_idx = 0;
     int32_t run_idx = 0;
     while (word_idx < BITSET_CONTAINER_SIZE_IN_WORDS &&
            run_idx < container2->n_runs) {
         uint64_t pending = container1->array[word_idx];
         while (pending != 0 && run_idx < container2->n_runs) {
             const uint32_t run_first = container2->runs[run_idx].value;
             const uint32_t run_last =
                 run_first + container2->runs[run_idx].length;
             // value encoded by the lowest set bit of the current word
             const uint16_t bit = word_idx * 64 + __builtin_ctzll(pending);
             if (bit < run_first) {
                 return false;
             }
             if (bit > run_last) {
                 ++run_idx;  // try the next run against the same bit
             } else {
                 pending &= pending - 1;  // bit is covered: clear it
             }
         }
         if (pending != 0) {
             // runs exhausted while this word still has bits set
             return false;
         }
         ++word_idx;
     }
     // runs exhausted: any remaining word must be empty
     for (; word_idx < BITSET_CONTAINER_SIZE_IN_WORDS; ++word_idx) {
         if (container1->array[word_idx] != 0) {
             return false;
         }
     }
     return true;
 }
 /* end file src/containers/mixed_subset.c */
 /* begin file src/containers/mixed_union.c */
 /*
  * mixed_union.c
  *
  */

 #include <assert.h>
 #include <string.h>


 /* Union of the array container src_1 and the bitset container src_2, stored
  * in dst with its cardinality updated. When src_2 and dst are the same
  * container the initial copy is skipped. */
 void array_bitset_container_union(const array_container_t *src_1,
                                   const bitset_container_t *src_2,
                                   bitset_container_t *dst) {
     if (dst != src_2) {
         bitset_container_copy(src_2, dst);
     }
     // set the array's values in dst and store the cardinality returned
     dst->cardinality = (int32_t)bitset_set_list_withcard(
         dst->array, dst->cardinality, src_1->array, src_1->cardinality);
 }

 /* Lazy union of the array container src_1 and the bitset container src_2,
  * stored in dst (src_2 may be dst). The cardinality of dst is not computed:
  * it is left as BITSET_UNKNOWN_CARDINALITY. */
 void array_bitset_container_lazy_union(const array_container_t *src_1,
                                        const bitset_container_t *src_2,
                                        bitset_container_t *dst) {
     if (dst != src_2) {
         bitset_container_copy(src_2, dst);
     }
     bitset_set_list(dst->array, src_1->array, src_1->cardinality);
     // the caller is expected to recompute the cardinality when needed
     dst->cardinality = BITSET_UNKNOWN_CARDINALITY;
 }

 /* Union of the run container src_1 and the bitset container src_2, stored
  * in dst (src_2 may be dst). The cardinality of dst is recomputed at the
  * end. A full src_1 is not expected here (asserted). */
 void run_bitset_container_union(const run_container_t *src_1,
                                 const bitset_container_t *src_2,
                                 bitset_container_t *dst) {
     assert(!run_container_is_full(src_1));  // catch this case upstream
     if (dst != src_2) {
         bitset_container_copy(src_2, dst);
     }
     int32_t k = 0;
     while (k < src_1->n_runs) {
         const rle16_t current = src_1->runs[k];
         bitset_set_lenrange(dst->array, current.value, current.length);
         ++k;
     }
     dst->cardinality = bitset_container_compute_cardinality(dst);
 }

 /* Lazy union of the run container src_1 and the bitset container src_2,
  * stored in dst (src_2 may be dst). The cardinality of dst is left as
  * BITSET_UNKNOWN_CARDINALITY. A full src_1 is not expected here (asserted). */
 void run_bitset_container_lazy_union(const run_container_t *src_1,
                                      const bitset_container_t *src_2,
                                      bitset_container_t *dst) {
     assert(!run_container_is_full(src_1));  // catch this case upstream
     if (dst != src_2) {
         bitset_container_copy(src_2, dst);
     }
     int32_t k = 0;
     while (k < src_1->n_runs) {
         const rle16_t current = src_1->runs[k];
         bitset_set_lenrange(dst->array, current.value, current.length);
         ++k;
     }
     dst->cardinality = BITSET_UNKNOWN_CARDINALITY;
 }

 /* Union of the array container src_1 and the run container src_2, written
  * as runs into dst. A full src_2 is simply copied.
  * Open question kept from the original: why is the result left as a run
  * container?
  * NOTE(review): the first elements of both inputs are read unconditionally,
  * so both are presumably expected to be non-empty - confirm against callers. */
 void array_run_container_union(const array_container_t *src_1,
                                const run_container_t *src_2,
                                run_container_t *dst) {
     if (run_container_is_full(src_2)) {
         run_container_copy(src_2, dst);
         return;
     }
     // TODO: see whether the "2*" is spurious
     run_container_grow(dst, 2 * (src_1->cardinality + src_2->n_runs), false);
     int32_t run_idx = 0;
     int32_t val_idx = 0;
     rle16_t last;
     // seed the output with whichever input starts first
     if (src_2->runs[run_idx].value <= src_1->array[val_idx]) {
         last = run_container_append_first(dst, src_2->runs[run_idx]);
         ++run_idx;
     } else {
         last = run_container_append_value_first(dst, src_1->array[val_idx]);
         ++val_idx;
     }
     // merge while both inputs still have elements
     while ((run_idx < src_2->n_runs) && (val_idx < src_1->cardinality)) {
         if (src_2->runs[run_idx].value <= src_1->array[val_idx]) {
             run_container_append(dst, src_2->runs[run_idx], &last);
             ++run_idx;
         } else {
             run_container_append_value(dst, src_1->array[val_idx], &last);
             ++val_idx;
         }
     }
     // drain whichever input is left
     if (val_idx < src_1->cardinality) {
         for (; val_idx < src_1->cardinality; ++val_idx) {
             run_container_append_value(dst, src_1->array[val_idx], &last);
         }
     } else {
         for (; run_idx < src_2->n_runs; ++run_idx) {
             run_container_append(dst, src_2->runs[run_idx], &last);
         }
     }
 }

 /* In-place union: merges the values of the array container src_1 into the
  * run container src_2. Nothing is done when src_2 is already full.
  * The existing runs are first moved to the tail of the (possibly grown) run
  * array so that the merged output can be written from the front.
  * NOTE(review): the first elements of both inputs are read unconditionally,
  * so both are presumably expected to be non-empty - confirm against callers. */
 void array_run_container_inplace_union(const array_container_t *src_1,
                                        run_container_t *src_2) {
     if (run_container_is_full(src_2)) {
         return;
     }
     // room for the worst-case output plus the relocated input runs
     const int32_t maxoutput = src_1->cardinality + src_2->n_runs;
     const int32_t neededcapacity = maxoutput + src_2->n_runs;
     if (src_2->capacity < neededcapacity) {
         run_container_grow(src_2, neededcapacity, true);
     }
     rle16_t *saved_runs = src_2->runs + maxoutput;
     memmove(saved_runs, src_2->runs, src_2->n_runs * sizeof(rle16_t));
     const int saved_count = src_2->n_runs;
     src_2->n_runs = 0;

     int32_t run_idx = 0;
     int32_t val_idx = 0;
     rle16_t last;

     // seed the output with whichever input starts first
     if (saved_runs[run_idx].value <= src_1->array[val_idx]) {
         last = run_container_append_first(src_2, saved_runs[run_idx]);
         ++run_idx;
     } else {
         last = run_container_append_value_first(src_2, src_1->array[val_idx]);
         ++val_idx;
     }

     // merge while both inputs still have elements
     while ((run_idx < saved_count) && (val_idx < src_1->cardinality)) {
         if (saved_runs[run_idx].value <= src_1->array[val_idx]) {
             run_container_append(src_2, saved_runs[run_idx], &last);
             ++run_idx;
         } else {
             run_container_append_value(src_2, src_1->array[val_idx], &last);
             ++val_idx;
         }
     }
     // drain whichever input is left
     if (val_idx < src_1->cardinality) {
         for (; val_idx < src_1->cardinality; ++val_idx) {
             run_container_append_value(src_2, src_1->array[val_idx], &last);
         }
     } else {
         for (; run_idx < saved_count; ++run_idx) {
             run_container_append(src_2, saved_runs[run_idx], &last);
         }
     }
 }

 /* Union of two array containers, stored in a newly created container *dst.
  * Returns true when *dst is a bitset container and false when it is an array
  * container. If the array allocation fails, *dst is NULL and true is
  * returned so that the failure is not mistaken for an array result. */
 bool array_array_container_union(const array_container_t *src_1,
                                  const array_container_t *src_2, void **dst) {
     const int totalCardinality = src_1->cardinality + src_2->cardinality;
     if (totalCardinality <= DEFAULT_MAX_SIZE) {
         // the result is guaranteed to fit in an array container
         *dst = array_container_create_given_capacity(totalCardinality);
         if (*dst == NULL) {
             return true; // otherwise failure won't be caught
         }
         array_container_union(src_1, src_2, (array_container_t *)*dst);
         return false;  // not a bitset
     }
     // sum of cardinalities too large for an array: build a bitset first
     *dst = bitset_container_create();
     if (*dst == NULL) {
         return true;  // expect a bitset
     }
     bitset_container_t *ourbitset = (bitset_container_t *)*dst;
     bitset_set_list(ourbitset->array, src_1->array, src_1->cardinality);
     ourbitset->cardinality = (int32_t)bitset_set_list_withcard(
         ourbitset->array, src_1->cardinality, src_2->array,
         src_2->cardinality);
     if (ourbitset->cardinality > DEFAULT_MAX_SIZE) {
         return true;  // stays a bitset
     }
     // the actual cardinality is small after all: convert to an array
     *dst = array_container_from_bitset(ourbitset);
     bitset_container_free(ourbitset);
     return false;  // not going to be a bitset
 }

 /* Union of two array containers that reuses src_1 when possible.
  * Outcomes, as implemented below:
  *  - the sum of cardinalities fits an array and src_1 has the capacity: the
  *    union is written into src_1, *dst stays NULL, returns false;
  *  - it fits an array but src_1 is too small: a new array container (with
  *    twice the needed capacity) is stored in *dst, returns false; if that
  *    allocation fails *dst is NULL and true is returned;
  *  - otherwise a bitset is built in *dst and true is returned, unless its
  *    real cardinality is at most DEFAULT_MAX_SIZE, in which case the values
  *    are extracted back into src_1 (grown if needed), *dst is set to src_1
  *    and false is returned. */
 bool array_array_container_inplace_union(array_container_t *src_1,
                                  const array_container_t *src_2, void **dst) {
     const int totalCardinality = src_1->cardinality + src_2->cardinality;
     *dst = NULL;
     if (totalCardinality <= DEFAULT_MAX_SIZE) {
         if (src_1->capacity >= totalCardinality) {
             // move src_1's values out of the way, then merge from the front
             uint16_t *shifted = src_1->array + src_2->cardinality;
             memmove(shifted, src_1->array,
                     src_1->cardinality * sizeof(uint16_t));
             src_1->cardinality = (int32_t)union_uint16(
                 shifted, src_1->cardinality, src_2->array,
                 src_2->cardinality, src_1->array);
             return false; // not a bitset
         }
         // be purposefully generous
         *dst = array_container_create_given_capacity(2  * totalCardinality);
         if (*dst == NULL) {
             return true; // otherwise failure won't be caught
         }
         array_container_union(src_1, src_2, (array_container_t *)*dst);
         return false;  // not a bitset
     }
     *dst = bitset_container_create();
     if (*dst == NULL) {
         return true;  // expect a bitset
     }
     bitset_container_t *ourbitset = (bitset_container_t *)*dst;
     bitset_set_list(ourbitset->array, src_1->array, src_1->cardinality);
     ourbitset->cardinality = (int32_t)bitset_set_list_withcard(
         ourbitset->array, src_1->cardinality, src_2->array,
         src_2->cardinality);
     if (ourbitset->cardinality > DEFAULT_MAX_SIZE) {
         return true;  // stays a bitset
     }
     // need to convert! src_1's old content is already in the bitset, so it
     // does not have to be preserved when growing
     if (src_1->capacity < ourbitset->cardinality) {
         array_container_grow(src_1, ourbitset->cardinality, false);
     }
     bitset_extract_setbits_uint16(ourbitset->array,
                                   BITSET_CONTAINER_SIZE_IN_WORDS,
                                   src_1->array, 0);
     src_1->cardinality = ourbitset->cardinality;
     *dst = src_1;
     bitset_container_free(ourbitset);
     return false;  // not going to be a bitset
 }


 /* Lazy union of two array containers, stored in a newly created container
  * *dst. Below ARRAY_LAZY_LOWERBOUND the result is an array container
  * (returns false); above it a bitset with unknown cardinality is produced
  * (returns true). If the array allocation fails, *dst is NULL and true is
  * returned. */
 bool array_array_container_lazy_union(const array_container_t *src_1,
                                       const array_container_t *src_2,
                                       void **dst) {
     const int totalCardinality = src_1->cardinality + src_2->cardinality;
     if (totalCardinality <= ARRAY_LAZY_LOWERBOUND) {
         *dst = array_container_create_given_capacity(totalCardinality);
         if (*dst == NULL) {
             return true; // otherwise failure won't be caught
         }
         array_container_union(src_1, src_2, (array_container_t *)*dst);
         return false;  // not a bitset
     }
     *dst = bitset_container_create();
     if (*dst != NULL) {
         bitset_container_t *ourbitset = (bitset_container_t *)*dst;
         bitset_set_list(ourbitset->array, src_1->array, src_1->cardinality);
         bitset_set_list(ourbitset->array, src_2->array, src_2->cardinality);
         // lazy: the cardinality is not computed here
         ourbitset->cardinality = BITSET_UNKNOWN_CARDINALITY;
     }
     return true;  // expect a bitset
 }


 /* Lazy union of two array containers that reuses src_1 when possible.
  *  - sum of cardinalities <= ARRAY_LAZY_LOWERBOUND and src_1 has the
  *    capacity: the union is written into src_1, *dst stays NULL, returns
  *    false;
  *  - same bound but src_1 too small: a new array container (twice the needed
  *    capacity) is stored in *dst, returns false; if that allocation fails
  *    *dst is NULL and true is returned;
  *  - otherwise a bitset with unknown cardinality is created in *dst and true
  *    is returned. */
 bool array_array_container_lazy_inplace_union(array_container_t *src_1,
                                       const array_container_t *src_2,
                                       void **dst) {
     const int totalCardinality = src_1->cardinality + src_2->cardinality;
     *dst = NULL;
     if (totalCardinality <= ARRAY_LAZY_LOWERBOUND) {
         if (src_1->capacity >= totalCardinality) {
             // move src_1's values out of the way, then merge from the front
             uint16_t *shifted = src_1->array + src_2->cardinality;
             memmove(shifted, src_1->array,
                     src_1->cardinality * sizeof(uint16_t));
             src_1->cardinality = (int32_t)union_uint16(
                 shifted, src_1->cardinality, src_2->array,
                 src_2->cardinality, src_1->array);
             return false; // not a bitset
         }
         // be purposefully generous
         *dst = array_container_create_given_capacity(2  * totalCardinality);
         if (*dst == NULL) {
             return true; // otherwise failure won't be caught
         }
         array_container_union(src_1, src_2, (array_container_t *)*dst);
         return false;  // not a bitset
     }
     *dst = bitset_container_create();
     if (*dst != NULL) {
         bitset_container_t *ourbitset = (bitset_container_t *)*dst;
         bitset_set_list(ourbitset->array, src_1->array, src_1->cardinality);
         bitset_set_list(ourbitset->array, src_2->array, src_2->cardinality);
         // lazy: the cardinality is not computed here
         ourbitset->cardinality = BITSET_UNKNOWN_CARDINALITY;
     }
     return true;  // expect a bitset
 }
 /* end file src/containers/mixed_union.c */
 /* begin file src/containers/mixed_xor.c */
 /*
  * mixed_xor.c
  */

 #include <assert.h>
 #include <string.h>


 /* Symmetric difference of the array container src_1 and the bitset
  * container src_2. *dst (no container on entry) receives a new container.
  * Returns true iff that container is a bitset; results with at most
  * DEFAULT_MAX_SIZE values are converted to an array container. */
 bool array_bitset_container_xor(const array_container_t *src_1,
                                 const bitset_container_t *src_2, void **dst) {
     // work on a private copy of src_2 and flip the array's values in it
     bitset_container_t *flipped = bitset_container_create();
     bitset_container_copy(src_2, flipped);
     flipped->cardinality = (int32_t)bitset_flip_list_withcard(
         flipped->array, flipped->cardinality, src_1->array,
         src_1->cardinality);

     if (flipped->cardinality > DEFAULT_MAX_SIZE) {
         *dst = flipped;
         return true;  // bitset
     }
     // do required type conversions.
     *dst = array_container_from_bitset(flipped);
     bitset_container_free(flipped);
     return false;  // not bitset
 }

 /* Lazy symmetric difference of the array container src_1 and the bitset
  * container src_2, stored in dst (src_2 may be dst). The cardinality of dst
  * is not computed: it is left as BITSET_UNKNOWN_CARDINALITY.
  */

 void array_bitset_container_lazy_xor(const array_container_t *src_1,
                                      const bitset_container_t *src_2,
                                      bitset_container_t *dst) {
     if (dst != src_2) {
         bitset_container_copy(src_2, dst);
     }
     // toggle every value listed in the array
     bitset_flip_list(dst->array, src_1->array, src_1->cardinality);
     dst->cardinality = BITSET_UNKNOWN_CARDINALITY;
 }

 /* Symmetric difference of the run container src_1 and the bitset container
  * src_2. *dst holds no container on entry and receives either a bitset
  * container (return value true) or, when the result has at most
  * DEFAULT_MAX_SIZE values, an array container (return value false).
  */

 bool run_bitset_container_xor(const run_container_t *src_1,
                               const bitset_container_t *src_2, void **dst) {
     bitset_container_t *flipped = bitset_container_create();
     bitset_container_copy(src_2, flipped);

     int32_t k = 0;
     while (k < src_1->n_runs) {
         const rle16_t current = src_1->runs[k];
         // the end of the flipped range is one past the last value of the run
         bitset_flip_range(flipped->array, current.value,
                           current.value + current.length + UINT32_C(1));
         ++k;
     }
     flipped->cardinality = bitset_container_compute_cardinality(flipped);

     if (flipped->cardinality > DEFAULT_MAX_SIZE) {
         *dst = flipped;
         return true;  // bitset
     }
     *dst = array_container_from_bitset(flipped);
     bitset_container_free(flipped);
     return false;  // not bitset
 }

 /* Lazy symmetric difference of the run container src_1 and the bitset
  * container src_2. dst is an initialized bitset and may be src_2 itself.
  * The result always stays a bitset container, with its cardinality left as
  * BITSET_UNKNOWN_CARDINALITY, even if an array container would be smaller.
  */

 void run_bitset_container_lazy_xor(const run_container_t *src_1,
                                    const bitset_container_t *src_2,
                                    bitset_container_t *dst) {
     if (dst != src_2) {
         bitset_container_copy(src_2, dst);
     }
     int32_t k = 0;
     while (k < src_1->n_runs) {
         const rle16_t current = src_1->runs[k];
         // the end of the flipped range is one past the last value of the run
         bitset_flip_range(dst->array, current.value,
                           current.value + current.length + UINT32_C(1));
         ++k;
     }
     dst->cardinality = BITSET_UNKNOWN_CARDINALITY;
 }

 /* Symmetric difference of the array container src_1 and the run container
  * src_2. *dst holds no valid container on entry; on return it holds a new
  * container whose type code is the return value (any container type is
  * possible).
  */

 int array_run_container_xor(const array_container_t *src_1,
                             const run_container_t *src_2, void **dst) {
     // semi following Java XOR implementation as of May 2016
     // the C OR implementation works quite differently and can return a run
     // container
     // TODO could optimize for full run containers.

     // use of lazy following Java impl.
     const int arbitrary_threshold = 32;
     if (src_1->cardinality < arbitrary_threshold) {
         // small array: xor run-wise, then pick the most efficient container
         run_container_t *ans = run_container_create();
         array_run_container_lazy_xor(src_1, src_2, ans);  // keeps runs.
         uint8_t typecode_after;
         *dst =
             convert_run_to_efficient_container_and_free(ans, &typecode_after);
         return typecode_after;
     }

     bool is_bitset;
     if (run_container_cardinality(src_2) <= DEFAULT_MAX_SIZE) {
         // Java implementation works with the array, xoring the run elements
         // via iterator; here the run is expanded to a temporary array
         array_container_t *temp = array_container_from_run(src_2);
         is_bitset = array_array_container_xor(temp, src_1, dst);
         array_container_free(temp);
     } else {
         // guess that it will end up as a bitset; any necessary type
         // conversion is done by the ixor, which also takes over the bitset
         bitset_container_t *result = bitset_container_from_run(src_2);
         is_bitset = bitset_array_container_ixor(result, src_1, dst);
     }
     return is_bitset ? BITSET_CONTAINER_TYPE_CODE
                      : ARRAY_CONTAINER_TYPE_CODE;
 }

 /* Lazy symmetric difference of the array container src_1 and the run
  * container src_2, written into the valid run container dst. (Whether dst
  * may be src_2 is left open by the original authors: "Let's say not.")
  * The result stays a run container even if another type would be smaller.
  */

 void array_run_container_lazy_xor(const array_container_t *src_1,
                                   const run_container_t *src_2,
                                   run_container_t *dst) {
     run_container_grow(dst, src_1->cardinality + src_2->n_runs, false);
     dst->n_runs = 0;
     int32_t run_idx = 0;
     int32_t val_idx = 0;

     // merge both inputs in increasing order of start value
     while ((run_idx < src_2->n_runs) && (val_idx < src_1->cardinality)) {
         if (src_2->runs[run_idx].value <= src_1->array[val_idx]) {
             run_container_smart_append_exclusive(
                 dst, src_2->runs[run_idx].value, src_2->runs[run_idx].length);
             ++run_idx;
         } else {
             // a single array value is appended with length 0
             run_container_smart_append_exclusive(dst, src_1->array[val_idx],
                                                  0);
             ++val_idx;
         }
     }
     // leftovers of the array, if any
     for (; val_idx < src_1->cardinality; ++val_idx) {
         run_container_smart_append_exclusive(dst, src_1->array[val_idx], 0);
     }
     // leftovers of the runs, if any
     for (; run_idx < src_2->n_runs; ++run_idx) {
         run_container_smart_append_exclusive(dst, src_2->runs[run_idx].value,
                                              src_2->runs[run_idx].length);
     }
 }

 /* Symmetric difference of two run containers. *dst holds no valid container
  * on entry; on return it holds a new container whose type code is the return
  * value (any container type is possible).
  */

 int run_run_container_xor(const run_container_t *src_1,
                           const run_container_t *src_2, void **dst) {
     uint8_t typecode_after;
     // compute the xor as runs, then let the conversion pick the final type
     run_container_t *as_runs = run_container_create();
     run_container_xor(src_1, src_2, as_runs);
     *dst =
         convert_run_to_efficient_container_and_free(as_runs, &typecode_after);
     return typecode_after;
 }

 /*
  * Java implementation (as of May 2016) for array_run, run_run
  * and  bitset_run don't do anything different for inplace.
  * Could adopt the mixed_union.c approach instead (ie, using
  * smart_append_exclusive)
  *
  */

 /* Symmetric difference of two array containers, stored in a newly created
  * container *dst. Returns true iff *dst is a bitset container. */
 bool array_array_container_xor(const array_container_t *src_1,
                                const array_container_t *src_2, void **dst) {
     // upper bound on the size of the result
     const int totalCardinality = src_1->cardinality + src_2->cardinality;
     if (totalCardinality <= DEFAULT_MAX_SIZE) {
         *dst = array_container_create_given_capacity(totalCardinality);
         array_container_xor(src_1, src_2, (array_container_t *)*dst);
         return false;  // not a bitset
     }
     // start from src_1 as a bitset and flip the values of src_2 in it
     bitset_container_t *ourbitset = bitset_container_from_array(src_1);
     *dst = ourbitset;
     ourbitset->cardinality = (uint32_t)bitset_flip_list_withcard(
         ourbitset->array, src_1->cardinality, src_2->array, src_2->cardinality);
     if (ourbitset->cardinality > DEFAULT_MAX_SIZE) {
         return true;  // stays a bitset
     }
     // need to convert!
     *dst = array_container_from_bitset(ourbitset);
     bitset_container_free(ourbitset);
     return false;  // not going to be a bitset
 }

 /* Lazy symmetric difference of two array containers, stored in a newly
  * created container *dst. Up to ARRAY_LAZY_LOWERBOUND (on the sum of the
  * cardinalities) the result is an array container and false is returned;
  * beyond that a bitset with unknown cardinality is produced and true is
  * returned. *dst may be NULL if the creation call returned NULL. */
 bool array_array_container_lazy_xor(const array_container_t *src_1,
                                     const array_container_t *src_2,
                                     void **dst) {
     // upper bound, but probably poor estimate for xor
     const int totalCardinality = src_1->cardinality + src_2->cardinality;
     if (totalCardinality <= ARRAY_LAZY_LOWERBOUND) {
         *dst = array_container_create_given_capacity(totalCardinality);
         if (*dst != NULL) {
             array_container_xor(src_1, src_2, (array_container_t *)*dst);
         }
         return false;  // not a bitset
     }
     *dst = bitset_container_from_array(src_1);
     if (*dst != NULL) {
         bitset_container_t *ourbitset = (bitset_container_t *)*dst;
         bitset_flip_list(ourbitset->array, src_2->array, src_2->cardinality);
         // lazy: the cardinality is not computed here
         ourbitset->cardinality = BITSET_UNKNOWN_CARDINALITY;
     }
     return true;  // expect a bitset (maybe, for XOR??)
 }

 /* Symmetric difference of two bitset containers. *dst (no container on
  * entry) receives a new container; the return value tells whether it is a
  * bitset. Results with at most DEFAULT_MAX_SIZE values are converted to an
  * array container.
  */

 bool bitset_bitset_container_xor(const bitset_container_t *src_1,
                                  const bitset_container_t *src_2, void **dst) {
     bitset_container_t *xored = bitset_container_create();
     const int card = bitset_container_xor(src_1, src_2, xored);
     if (card > DEFAULT_MAX_SIZE) {
         *dst = xored;
         return true;
     }
     *dst = array_container_from_bitset(xored);
     bitset_container_free(xored);
     return false;  // not bitset
 }

 /* In-place symmetric difference of the bitset container src_1 with the array
  * container src_2. If the result is a bitset, src_1 is modified and becomes
  * *dst. Otherwise src_1 is freed and *dst is a new array container. Either
  * way the caller is responsible for deallocating *dst.
  * Returns true iff *dst is a bitset.  */

 bool bitset_array_container_ixor(bitset_container_t *src_1,
                                  const array_container_t *src_2, void **dst) {
     *dst = src_1;
     // flip the array's values directly in src_1
     src_1->cardinality = (uint32_t)bitset_flip_list_withcard(
         src_1->array, src_1->cardinality, src_2->array, src_2->cardinality);

     if (src_1->cardinality > DEFAULT_MAX_SIZE) {
         return true;
     }
     // small result: replace the bitset by an array container
     *dst = array_container_from_bitset(src_1);
     bitset_container_free(src_1);
     return false;  // not bitset
 }

 /* a bunch of in-place, some of which may not *really* be inplace.
  * TODO: write actual inplace routine if efficiency warrants it
  * Anything inplace with a bitset is a good candidate
  */

 /* "In-place" xor of two bitset containers: the result is computed into a
  * new container stored in *dst and src_1 is then freed.
  * Returns true iff *dst is a bitset. */
 bool bitset_bitset_container_ixor(bitset_container_t *src_1,
                                   const bitset_container_t *src_2, void **dst) {
     const bool is_bitset = bitset_bitset_container_xor(src_1, src_2, dst);
     bitset_container_free(src_1);
     return is_bitset;
 }

 /* "In-place" xor of the array container src_1 with the bitset container
  * src_2: the result is computed into a new container stored in *dst and
  * src_1 is then freed. Returns true iff *dst is a bitset. */
 bool array_bitset_container_ixor(array_container_t *src_1,
                                  const bitset_container_t *src_2, void **dst) {
     const bool is_bitset = array_bitset_container_xor(src_1, src_2, dst);
     array_container_free(src_1);
     return is_bitset;
 }

 /* "In-place" xor of the run container src_1 with the bitset container
  * src_2. *dst holds no container on entry and receives either a bitset
  * container (return value true) or an array container (return value false).
  * src_1 is freed before returning.
  */

 bool run_bitset_container_ixor(run_container_t *src_1,
                                const bitset_container_t *src_2, void **dst) {
     const bool is_bitset = run_bitset_container_xor(src_1, src_2, dst);
     run_container_free(src_1);
     return is_bitset;
 }

 /* "In-place" xor of the bitset container src_1 with the run container
  * src_2: delegates to run_bitset_container_xor with the operands swapped,
  * stores the new container in *dst and frees src_1.
  * Returns true iff *dst is a bitset. */
 bool bitset_run_container_ixor(bitset_container_t *src_1,
                                const run_container_t *src_2, void **dst) {
     const bool is_bitset = run_bitset_container_xor(src_2, src_1, dst);
     bitset_container_free(src_1);
     return is_bitset;
 }

 /* "In-place" xor of the array container src_1 with the run container src_2.
  * *dst holds no valid container on entry; on return it holds a new container
  * whose type code is the return value (any container type is possible).
  * src_1 is freed before returning.
  */

 int array_run_container_ixor(array_container_t *src_1,
                              const run_container_t *src_2, void **dst) {
     const int typecode = array_run_container_xor(src_1, src_2, dst);
     array_container_free(src_1);
     return typecode;
 }

 /* "In-place" xor of the run container src_1 with the array container src_2:
  * delegates to array_run_container_xor with the operands swapped, stores the
  * new container in *dst and frees src_1.
  * Returns the type code of *dst. */
 int run_array_container_ixor(run_container_t *src_1,
                              const array_container_t *src_2, void **dst) {
     const int typecode = array_run_container_xor(src_2, src_1, dst);
     run_container_free(src_1);
     return typecode;
 }

 /* "In-place" xor of two array containers: the result is computed into a new
  * container stored in *dst and src_1 is then freed.
  * Returns true iff *dst is a bitset. */
 bool array_array_container_ixor(array_container_t *src_1,
                                 const array_container_t *src_2, void **dst) {
     const bool is_bitset = array_array_container_xor(src_1, src_2, dst);
     array_container_free(src_1);
     return is_bitset;
 }

 /* "In-place" xor of two run containers: the result is computed into a new
  * container stored in *dst and src_1 is then freed.
  * Returns the type code of *dst. */
 int run_run_container_ixor(run_container_t *src_1, const run_container_t *src_2,
                            void **dst) {
     const int typecode = run_run_container_xor(src_1, src_2, dst);
     run_container_free(src_1);
     return typecode;
 }
 /* end file src/containers/mixed_xor.c */
 /* begin file src/containers/run.c */
 #include <stdio.h>
 #include <stdlib.h>


 /*
  * Adds the value pos to the run container.
  * Returns true if the container was modified, false if pos was already
  * present.
  *
  * Depending on where pos falls, an existing run is extended upwards or
  * downwards, two adjacent runs are fused into one, or a new single-value
  * run is inserted.
  */
 bool run_container_add(run_container_t *run, uint16_t pos) {
     int32_t index = interleavedBinarySearch(run->runs, run->n_runs, pos);
     if (index >= 0) return false;  // already there
     index = -index - 2;            // points to preceding value, possibly -1
     if (index >= 0) {              // possible match
         // distance from the start of the preceding run; a run covers
         // offsets 0..length inclusive
         int32_t offset = pos - run->runs[index].value;
         int32_t le = run->runs[index].length;
         if (offset <= le) return false;  // already there
         if (offset == le + 1) {
             // pos directly follows the preceding run
             // we may need to fuse
             if (index + 1 < run->n_runs) {
                 if (run->runs[index + 1].value == pos + 1) {
                     // indeed fusion is needed
                     // the preceding run now extends to the end of the next
                     // run, whose slot is then released
                     run->runs[index].length = run->runs[index + 1].value +
                                               run->runs[index + 1].length -
                                               run->runs[index].value;
                     recoverRoomAtIndex(run, (uint16_t)(index + 1));
                     return true;
                 }
             }
             // no neighbour to fuse with: grow the preceding run by one
             run->runs[index].length++;
             return true;
         }
         if (index + 1 < run->n_runs) {
             // we may need to fuse
             if (run->runs[index + 1].value == pos + 1) {
                 // indeed fusion is needed
                 // pos directly precedes the next run: extend it downwards
                 run->runs[index + 1].value = pos;
                 run->runs[index + 1].length = run->runs[index + 1].length + 1;
                 return true;
             }
         }
     }
     if (index == -1) {
         // pos lies before every existing run
         // we may need to extend the first run
         if (0 < run->n_runs) {
             if (run->runs[0].value == pos + 1) {
                 run->runs[0].length++;
                 run->runs[0].value--;
                 return true;
             }
         }
     }
     // pos touches no existing run: insert a new run holding only pos
     // (length 0 means a single value)
     makeRoomAtIndex(run, (uint16_t)(index + 1));
     run->runs[index + 1].value = pos;
     run->runs[index + 1].length = 0;
     return true;
 }

 /* Create a new, empty run container with room for `size` runs.
  *
  * When `size` is zero or negative no run buffer is allocated and `runs` is
  * left NULL, so the result never depends on what malloc(0) returns (and a
  * negative size is never turned into a huge allocation request).
  * Allocation failure is treated as fatal through assert(); it is not
  * reported by returning NULL.
  */
 run_container_t *run_container_create_given_capacity(int32_t size) {
     run_container_t *run;
     /* Allocate the run container itself. */
     run = (run_container_t *)malloc(sizeof(run_container_t));
     assert (run);
     if (size <= 0) {  // we don't want to rely on malloc(0)
         run->runs = NULL;
     } else {
         run->runs = (rle16_t *)malloc(sizeof(rle16_t) * (size_t)size);
         assert (run->runs);
     }
     run->capacity = size;
     run->n_runs = 0;
     return run;
 }

 /* Release the unused tail of the run buffer so that capacity == n_runs.
  *
  * Returns the number of run slots that were given back (0 when there was
  * nothing to do or when the buffer could not be shrunk).
  *
  * An empty container is handled without calling realloc(ptr, 0): that call
  * may free the block and return NULL, in which case freeing the old pointer
  * again would be a double free.  If realloc fails for a non-empty
  * container, the original buffer and capacity are left untouched so the
  * container stays valid.
  */
 int run_container_shrink_to_fit(run_container_t *src) {
     if (src->n_runs == src->capacity) return 0;  // nothing to do
     const int savings = src->capacity - src->n_runs;
     if (src->n_runs <= 0) {
         // nothing to keep: drop the whole buffer
         free(src->runs);
         src->runs = NULL;
         src->capacity = src->n_runs;
         return savings;
     }
     rle16_t *shrunk =
         (rle16_t *)realloc(src->runs, src->n_runs * sizeof(rle16_t));
     if (shrunk == NULL) {
         // shrinking failed; src->runs is still valid with its old capacity
         return 0;
     }
     src->runs = shrunk;
     src->capacity = src->n_runs;
     return savings;
 }
 /* Create a new, empty run container with the default initial capacity
  * (RUN_DEFAULT_INIT_SIZE).  This simply forwards to
  * run_container_create_given_capacity and returns its result. */
 run_container_t *run_container_create(void) {
     return run_container_create_given_capacity(RUN_DEFAULT_INIT_SIZE);
 }

 /* Return a newly allocated copy of `src` with the same capacity and the
  * same runs, or NULL if the new container could not be created.
  *
  * The run data is only copied when there is something to copy: a container
  * without runs may have no buffer at all, and memcpy must not be handed a
  * NULL pointer even for a zero-byte copy.
  */
 run_container_t *run_container_clone(const run_container_t *src) {
     run_container_t *run = run_container_create_given_capacity(src->capacity);
     if (run == NULL) return NULL;
     run->capacity = src->capacity;
     run->n_runs = src->n_runs;
     if (src->n_runs > 0) {
         memcpy(run->runs, src->runs, src->n_runs * sizeof(rle16_t));
     }
     return run;
 }

 /* Release the run buffer (if any) and then the container itself. */
 void run_container_free(run_container_t *run) {
     // Jon Strabala reports that some tools complain when the buffer is
     // freed without this NULL check
     if (run->runs) {
         free(run->runs);
         run->runs = NULL;  // pedantic
     }
     free(run);
 }

 /* Enlarge the run buffer of `run` so that it holds at least `min` runs.
  *
  * The new capacity follows a growth schedule (default size when empty,
  * x2 below 64, x1.5 below 1024, x1.25 beyond) and is raised to `min` if the
  * schedule falls short.  With `copy` set the existing runs are preserved
  * through realloc; otherwise the old buffer is discarded and a fresh one
  * is allocated.  Allocation failure is reported on stderr and asserted.
  */
 void run_container_grow(run_container_t *run, int32_t min, bool copy) {
     int32_t grown;
     if (run->capacity == 0) {
         grown = RUN_DEFAULT_INIT_SIZE;
     } else if (run->capacity < 64) {
         grown = run->capacity * 2;
     } else if (run->capacity < 1024) {
         grown = run->capacity * 3 / 2;
     } else {
         grown = run->capacity * 5 / 4;
     }
     if (grown < min) grown = min;
     run->capacity = grown;
     assert(run->capacity >= min);

     if (!copy) {
         // Jon Strabala reports that some tools complain otherwise
         if (run->runs != NULL) {
             free(run->runs);
         }
         run->runs = (rle16_t *)malloc(run->capacity * sizeof(rle16_t));
     } else {
         rle16_t *previous = run->runs;
         run->runs =
             (rle16_t *)realloc(previous, run->capacity * sizeof(rle16_t));
         // realloc leaves the old block alive on failure; release it
         if (run->runs == NULL) free(previous);
     }

     // handle the case where the allocation fails
     if (run->runs == NULL) {
         fprintf(stderr, "could not allocate memory\n");
     }
     assert(run->runs != NULL);
 }

 /* Overwrite the runs of `dst` with those of `src`, growing `dst` first when
  * its buffer is too small. */
 void run_container_copy(const run_container_t *src, run_container_t *dst) {
     const int32_t count = src->n_runs;
     if (dst->capacity < count) {
         // old contents of dst are not needed, so no copy on grow
         run_container_grow(dst, count, false);
     }
     memcpy(dst->runs, src->runs, sizeof(rle16_t) * count);
     dst->n_runs = count;
 }

 /* Compute the union of `src_1' and `src_2' and write the result to `dst'
  * It is assumed that `dst' is distinct from both `src_1' and `src_2'.
  *
  * Both inputs are walked in order of run start; each run is handed to
  * run_container_append, which receives the previously appended run so that
  * it can combine neighbours. */
 void run_container_union(const run_container_t *src_1,
                          const run_container_t *src_2, run_container_t *dst) {
     // TODO: this could be a lot more efficient

     // inexpensive shortcut: when one side is full it is copied as the result
     const bool first_is_full = run_container_is_full(src_1);
     const bool second_is_full = run_container_is_full(src_2);
     if (first_is_full) {
         run_container_copy(src_1, dst);
         return;
     }
     if (second_is_full) {
         run_container_copy(src_2, dst);
         return;
     }

     // the merge can never emit more runs than both inputs hold together
     const int32_t worst_case_runs = src_1->n_runs + src_2->n_runs;
     if (dst->capacity < worst_case_runs)
         run_container_grow(dst, worst_case_runs, false);
     dst->n_runs = 0;

     int32_t i1 = 0;
     int32_t i2 = 0;
     rle16_t last_appended;

     // seed the output with whichever input starts first
     if (src_1->runs[i1].value <= src_2->runs[i2].value) {
         last_appended = run_container_append_first(dst, src_1->runs[i1]);
         i1++;
     } else {
         last_appended = run_container_append_first(dst, src_2->runs[i2]);
         i2++;
     }

     // two-way merge while both inputs still have runs
     while ((i2 < src_2->n_runs) && (i1 < src_1->n_runs)) {
         rle16_t next;
         if (src_1->runs[i1].value <= src_2->runs[i2].value) {
             next = src_1->runs[i1];
             i1++;
         } else {
             next = src_2->runs[i2];
             i2++;
         }
         run_container_append(dst, next, &last_appended);
     }

     // drain whichever input is left over
     for (; i2 < src_2->n_runs; i2++) {
         run_container_append(dst, src_2->runs[i2], &last_appended);
     }
     for (; i1 < src_1->n_runs; i1++) {
         run_container_append(dst, src_1->runs[i1], &last_appended);
     }
 }

 /* Compute the union of `src_1' and `src_2' and write the result to `src_1'.
  *
  * The current runs of `src_1' are first moved to the tail of its (possibly
  * enlarged) buffer so that the merged output can be written from the front
  * of that same buffer without overwriting input that is still unread.
  */
 void run_container_union_inplace(run_container_t *src_1,
                                  const run_container_t *src_2) {
     // TODO: this could be a lot more efficient

     // we start out with inexpensive checks
     const bool if1 = run_container_is_full(src_1);
     const bool if2 = run_container_is_full(src_2);
     if (if1 || if2) {
         if (if1) {
             // src_1 is full: it is left untouched as the result
             return;
         }
         if (if2) {
             // src_2 is full: the result is a copy of src_2
             run_container_copy(src_2, src_1);
             return;
         }
     }
     // we move the data to the end of the current array
     // maxoutput bounds the number of runs the merge can produce
     const int32_t maxoutput = src_1->n_runs + src_2->n_runs;
     // room for the output region plus the relocated copy of src_1's runs
     const int32_t neededcapacity = maxoutput + src_1->n_runs;
     if (src_1->capacity < neededcapacity)
         run_container_grow(src_1, neededcapacity, true);
     // memmove because source and destination may overlap inside the buffer
     memmove(src_1->runs + maxoutput, src_1->runs,
             src_1->n_runs * sizeof(rle16_t));
     rle16_t *inputsrc1 = src_1->runs + maxoutput;
     const int32_t input1nruns = src_1->n_runs;
     // from here on src_1 is the (initially empty) output container
     src_1->n_runs = 0;
     int32_t rlepos = 0;
     int32_t xrlepos = 0;

     // seed the output with whichever input starts first
     rle16_t previousrle;
     if (inputsrc1[rlepos].value <= src_2->runs[xrlepos].value) {
         previousrle = run_container_append_first(src_1, inputsrc1[rlepos]);
         rlepos++;
     } else {
         previousrle = run_container_append_first(src_1, src_2->runs[xrlepos]);
         xrlepos++;
     }
     // two-way merge while both inputs still have runs
     while ((xrlepos < src_2->n_runs) && (rlepos < input1nruns)) {
         rle16_t newrl;
         if (inputsrc1[rlepos].value <= src_2->runs[xrlepos].value) {
             newrl = inputsrc1[rlepos];
             rlepos++;
         } else {
             newrl = src_2->runs[xrlepos];
             xrlepos++;
         }
         run_container_append(src_1, newrl, &previousrle);
     }
     // drain whichever input is left over
     while (xrlepos < src_2->n_runs) {
         run_container_append(src_1, src_2->runs[xrlepos], &previousrle);
         xrlepos++;
     }
     while (rlepos < input1nruns) {
         run_container_append(src_1, inputsrc1[rlepos], &previousrle);
         rlepos++;
     }
 }

 /* Compute the symmetric difference of `src_1' and `src_2' and write the result
  * to `dst'
  * It is assumed that `dst' is distinct from both `src_1' and `src_2'.
  *
  * Runs from both inputs are fed, ordered by start value, to
  * run_container_smart_append_exclusive, which performs the actual XOR
  * against the tail of `dst'. */
 void run_container_xor(const run_container_t *src_1,
                        const run_container_t *src_2, run_container_t *dst) {
     // don't bother to convert xor with full range into negation
     // since negation is implemented similarly

     const int32_t worst_case_runs = src_1->n_runs + src_2->n_runs;
     if (dst->capacity < worst_case_runs)
         run_container_grow(dst, worst_case_runs, false);

     int32_t i1 = 0;
     int32_t i2 = 0;
     dst->n_runs = 0;

     // merge phase: always consume the run with the smaller start value
     while ((i1 < src_1->n_runs) && (i2 < src_2->n_runs)) {
         const bool take_first =
             src_1->runs[i1].value <= src_2->runs[i2].value;
         const run_container_t *from = take_first ? src_1 : src_2;
         int32_t *cursor = take_first ? &i1 : &i2;
         run_container_smart_append_exclusive(dst, from->runs[*cursor].value,
                                              from->runs[*cursor].length);
         (*cursor)++;
     }

     // drain phase: at most one of these loops has work left
     for (; i1 < src_1->n_runs; i1++) {
         run_container_smart_append_exclusive(dst, src_1->runs[i1].value,
                                              src_1->runs[i1].length);
     }
     for (; i2 < src_2->n_runs; i2++) {
         run_container_smart_append_exclusive(dst, src_2->runs[i2].value,
                                              src_2->runs[i2].length);
     }
 }

 /* Compute the intersection of src_1 and src_2 and write the result to
  * dst. It is assumed that dst is distinct from both src_1 and src_2.
  *
  * Both run lists are scanned with one cursor each.  Runs are handled as
  * half-open intervals [start, end) where end = value + length + 1; every
  * overlap found between the two current runs is emitted into dst. */
 void run_container_intersection(const run_container_t *src_1,
                                 const run_container_t *src_2,
                                 run_container_t *dst) {
     // when one side is full, the other side is copied as the result
     const bool if1 = run_container_is_full(src_1);
     const bool if2 = run_container_is_full(src_2);
     if (if1 || if2) {
         if (if1) {
             run_container_copy(src_2, dst);
             return;
         }
         if (if2) {
             run_container_copy(src_1, dst);
             return;
         }
     }
     // TODO: this could be a lot more efficient, could use SIMD optimizations
     const int32_t neededcapacity = src_1->n_runs + src_2->n_runs;
     if (dst->capacity < neededcapacity)
         run_container_grow(dst, neededcapacity, false);
     dst->n_runs = 0;
     int32_t rlepos = 0;
     int32_t xrlepos = 0;
     // current run of src_1 as [start, end) and of src_2 as [xstart, xend)
     // NOTE(review): runs[0] is read before the loop checks n_runs, so both
     // inputs are presumably expected to be non-empty -- confirm with callers
     int32_t start = src_1->runs[rlepos].value;
     int32_t end = start + src_1->runs[rlepos].length + 1;
     int32_t xstart = src_2->runs[xrlepos].value;
     int32_t xend = xstart + src_2->runs[xrlepos].length + 1;
     while ((rlepos < src_1->n_runs) && (xrlepos < src_2->n_runs)) {
         if (end <= xstart) {
             // src_1's run lies entirely before src_2's run: skip it
             ++rlepos;
             if (rlepos < src_1->n_runs) {
                 start = src_1->runs[rlepos].value;
                 end = start + src_1->runs[rlepos].length + 1;
             }
         } else if (xend <= start) {
             // src_2's run lies entirely before src_1's run: skip it
             ++xrlepos;
             if (xrlepos < src_2->n_runs) {
                 xstart = src_2->runs[xrlepos].value;
                 xend = xstart + src_2->runs[xrlepos].length + 1;
             }
         } else {  // they overlap
             // the overlap is [lateststart, earliestend); whichever run ends
             // first is consumed (both when they end together)
             const int32_t lateststart = start > xstart ? start : xstart;
             int32_t earliestend;
             if (end == xend) {  // improbable
                 earliestend = end;
                 rlepos++;
                 xrlepos++;
                 if (rlepos < src_1->n_runs) {
                     start = src_1->runs[rlepos].value;
                     end = start + src_1->runs[rlepos].length + 1;
                 }
                 if (xrlepos < src_2->n_runs) {
                     xstart = src_2->runs[xrlepos].value;
                     xend = xstart + src_2->runs[xrlepos].length + 1;
                 }
             } else if (end < xend) {
                 earliestend = end;
                 rlepos++;
                 if (rlepos < src_1->n_runs) {
                     start = src_1->runs[rlepos].value;
                     end = start + src_1->runs[rlepos].length + 1;
                 }

             } else {  // end > xend
                 earliestend = xend;
                 xrlepos++;
                 if (xrlepos < src_2->n_runs) {
                     xstart = src_2->runs[xrlepos].value;
                     xend = xstart + src_2->runs[xrlepos].length + 1;
                 }
             }
             // store the overlap; length is (number of values - 1)
             dst->runs[dst->n_runs].value = (uint16_t)lateststart;
             dst->runs[dst->n_runs].length =
                 (uint16_t)(earliestend - lateststart - 1);
             dst->n_runs++;
         }
     }
 }

 /* Compute the size of the intersection of src_1 and src_2 .
  *
  * Same scan as run_container_intersection, except that the number of
  * values in each overlap is summed instead of the overlap being stored. */
 int run_container_intersection_cardinality(const run_container_t *src_1,
                                            const run_container_t *src_2) {
     // when one side is full, the answer is the cardinality of the other
     const bool if1 = run_container_is_full(src_1);
     const bool if2 = run_container_is_full(src_2);
     if (if1 || if2) {
         if (if1) {
             return run_container_cardinality(src_2);
         }
         if (if2) {
             return run_container_cardinality(src_1);
         }
     }
     int answer = 0;
     int32_t rlepos = 0;
     int32_t xrlepos = 0;
     // current run of src_1 as [start, end) and of src_2 as [xstart, xend)
     // NOTE(review): runs[0] is read before the loop checks n_runs, so both
     // inputs are presumably expected to be non-empty -- confirm with callers
     int32_t start = src_1->runs[rlepos].value;
     int32_t end = start + src_1->runs[rlepos].length + 1;
     int32_t xstart = src_2->runs[xrlepos].value;
     int32_t xend = xstart + src_2->runs[xrlepos].length + 1;
     while ((rlepos < src_1->n_runs) && (xrlepos < src_2->n_runs)) {
         if (end <= xstart) {
             // src_1's run lies entirely before src_2's run: skip it
             ++rlepos;
             if (rlepos < src_1->n_runs) {
                 start = src_1->runs[rlepos].value;
                 end = start + src_1->runs[rlepos].length + 1;
             }
         } else if (xend <= start) {
             // src_2's run lies entirely before src_1's run: skip it
             ++xrlepos;
             if (xrlepos < src_2->n_runs) {
                 xstart = src_2->runs[xrlepos].value;
                 xend = xstart + src_2->runs[xrlepos].length + 1;
             }
         } else {  // they overlap
             // the overlap is [lateststart, earliestend); whichever run ends
             // first is consumed (both when they end together)
             const int32_t lateststart = start > xstart ? start : xstart;
             int32_t earliestend;
             if (end == xend) {  // improbable
                 earliestend = end;
                 rlepos++;
                 xrlepos++;
                 if (rlepos < src_1->n_runs) {
                     start = src_1->runs[rlepos].value;
                     end = start + src_1->runs[rlepos].length + 1;
                 }
                 if (xrlepos < src_2->n_runs) {
                     xstart = src_2->runs[xrlepos].value;
                     xend = xstart + src_2->runs[xrlepos].length + 1;
                 }
             } else if (end < xend) {
                 earliestend = end;
                 rlepos++;
                 if (rlepos < src_1->n_runs) {
                     start = src_1->runs[rlepos].value;
                     end = start + src_1->runs[rlepos].length + 1;
                 }

             } else {  // end > xend
                 earliestend = xend;
                 xrlepos++;
                 if (xrlepos < src_2->n_runs) {
                     xstart = src_2->runs[xrlepos].value;
                     xend = xstart + src_2->runs[xrlepos].length + 1;
                 }
             }
             // number of values in the half-open overlap
             answer += earliestend - lateststart;
         }
     }
     return answer;
 }

 /* Tell whether src_1 and src_2 share at least one value.
  *
  * Runs are compared as half-open intervals [first, past_end); the scan
  * stops at the first pair of runs that overlap. */
 bool run_container_intersect(const run_container_t *src_1,
                                 const run_container_t *src_2) {
     // when one side is full, the answer is whether the other is non-empty
     const bool first_is_full = run_container_is_full(src_1);
     const bool second_is_full = run_container_is_full(src_2);
     if (first_is_full) {
         return !run_container_empty(src_2);
     }
     if (second_is_full) {
         return !run_container_empty(src_1);
     }

     int32_t i1 = 0;
     int32_t i2 = 0;
     int32_t first1 = src_1->runs[i1].value;
     int32_t past_end1 = first1 + src_1->runs[i1].length + 1;
     int32_t first2 = src_2->runs[i2].value;
     int32_t past_end2 = first2 + src_2->runs[i2].length + 1;

     while ((i1 < src_1->n_runs) && (i2 < src_2->n_runs)) {
         if (past_end1 <= first2) {
             // run of src_1 ends before run of src_2 begins: advance src_1
             ++i1;
             if (i1 < src_1->n_runs) {
                 first1 = src_1->runs[i1].value;
                 past_end1 = first1 + src_1->runs[i1].length + 1;
             }
             continue;
         }
         if (past_end2 <= first1) {
             // run of src_2 ends before run of src_1 begins: advance src_2
             ++i2;
             if (i2 < src_2->n_runs) {
                 first2 = src_2->runs[i2].value;
                 past_end2 = first2 + src_2->runs[i2].length + 1;
             }
             continue;
         }
         // neither run lies entirely before the other: they overlap
         return true;
     }
     return false;
 }


 /* Compute the difference of src_1 and src_2 and write the result to
  * dst. It is assumed that dst is distinct from both src_1 and src_2.
  *
  * Runs are handled as half-open intervals [start, end).  `start`/`end`
  * describe the part of the current src_1 run that has not been output or
  * removed yet; `start2`/`end2` describe the current src_2 run. */
 void run_container_andnot(const run_container_t *src_1,
                           const run_container_t *src_2, run_container_t *dst) {
     // following Java implementation as of June 2016

     if (dst->capacity < src_1->n_runs + src_2->n_runs)
         run_container_grow(dst, src_1->n_runs + src_2->n_runs, false);

     dst->n_runs = 0;

     int rlepos1 = 0;
     int rlepos2 = 0;
     // NOTE(review): runs[0] of both inputs is read before n_runs is checked,
     // so both are presumably expected to be non-empty -- confirm with callers
     int32_t start = src_1->runs[rlepos1].value;
     int32_t end = start + src_1->runs[rlepos1].length + 1;
     int32_t start2 = src_2->runs[rlepos2].value;
     int32_t end2 = start2 + src_2->runs[rlepos2].length + 1;

     while ((rlepos1 < src_1->n_runs) && (rlepos2 < src_2->n_runs)) {
         if (end <= start2) {
             // output the first run
             // (what is left of it lies entirely before the src_2 run)
             dst->runs[dst->n_runs++] =
                 (rle16_t){.value = (uint16_t)start,
                           .length = (uint16_t)(end - start - 1)};
             rlepos1++;
             if (rlepos1 < src_1->n_runs) {
                 start = src_1->runs[rlepos1].value;
                 end = start + src_1->runs[rlepos1].length + 1;
             }
         } else if (end2 <= start) {
             // exit the second run
             // (it lies entirely before what is left of the src_1 run)
             rlepos2++;
             if (rlepos2 < src_2->n_runs) {
                 start2 = src_2->runs[rlepos2].value;
                 end2 = start2 + src_2->runs[rlepos2].length + 1;
             }
         } else {
             // the two runs overlap
             if (start < start2) {
                 // keep the part of the src_1 run that precedes the src_2 run
                 dst->runs[dst->n_runs++] =
                     (rle16_t){.value = (uint16_t)start,
                               .length = (uint16_t)(start2 - start - 1)};
             }
             if (end2 < end) {
                 // the src_1 run extends past the src_2 run: continue with
                 // the remainder that starts where the src_2 run ends
                 start = end2;
             } else {
                 // the rest of the src_1 run is covered: move to the next one
                 rlepos1++;
                 if (rlepos1 < src_1->n_runs) {
                     start = src_1->runs[rlepos1].value;
                     end = start + src_1->runs[rlepos1].length + 1;
                 }
             }
         }
     }
     if (rlepos1 < src_1->n_runs) {
         // src_2 is exhausted: flush the pending remainder of the current
         // src_1 run, then copy all later src_1 runs unchanged
         dst->runs[dst->n_runs++] = (rle16_t){
             .value = (uint16_t)start, .length = (uint16_t)(end - start - 1)};
         rlepos1++;
         if (rlepos1 < src_1->n_runs) {
             memcpy(dst->runs + dst->n_runs, src_1->runs + rlepos1,
                    sizeof(rle16_t) * (src_1->n_runs - rlepos1));
             dst->n_runs += src_1->n_runs - rlepos1;
         }
     }
 }

 /* Expand every run of `cont` into individual 32-bit values (run value plus
  * `base`) and store them consecutively at `vout`.  Returns the number of
  * values written.  memcpy is used for the stores, so `vout` is not accessed
  * through a typed dereference. */
 int run_container_to_uint32_array(void *vout, const run_container_t *cont,
                                   uint32_t base) {
     uint32_t *out = (uint32_t *)vout;
     int written = 0;
     for (int r = 0; r < cont->n_runs; ++r) {
         const uint32_t first = base + cont->runs[r].value;
         const uint16_t last_offset = cont->runs[r].length;
         for (int k = 0; k <= last_offset; ++k, ++written) {
             const uint32_t val = first + k;
             memcpy(out + written, &val,
                    sizeof(uint32_t));  // should be compiled as a MOV on x64
         }
     }
     return written;
 }

 /*
  * Debugging aid: print each run of the container to stdout as
  * "[first,last]" with both bounds inclusive.
  */
 void run_container_printf(const run_container_t *cont) {
     int i = 0;
     while (i < cont->n_runs) {
         const uint16_t first = cont->runs[i].value;
         const uint16_t span = cont->runs[i].length;
         printf("[%d,%d]", first, first + span);
         ++i;
     }
 }

 /*
  * Print the values of this container with printf as a comma-separated list
  * of 32-bit integers, each offset by base.  Nothing is printed for an empty
  * container, and no comma precedes the very first value.
  */
 void run_container_printf_as_uint32_array(const run_container_t *cont,
                                           uint32_t base) {
     if (cont->n_runs == 0) return;

     // The first value of the first run is the only one printed without a
     // leading comma; the rest of that run starts at offset 1.
     const uint32_t head = base + cont->runs[0].value;
     printf("%u", head);
     uint32_t first_offset = 1;

     int32_t i = 0;
     do {
         const uint32_t run_start = base + cont->runs[i].value;
         const uint16_t le = cont->runs[i].length;
         for (uint32_t j = first_offset; j <= le; ++j) {
             printf(",%u", run_start + j);
         }
         first_offset = 0;  // later runs are printed in full
         ++i;
     } while (i < cont->n_runs);
 }

 /* Write the container to `buf` as: n_runs, capacity, then the n_runs run
  * entries, each copied in its in-memory representation.  Returns the number
  * of bytes written. */
 int32_t run_container_serialize(const run_container_t *container, char *buf) {
     const int32_t n_runs_bytes = sizeof(container->n_runs);
     const int32_t capacity_bytes = sizeof(container->capacity);
     const int32_t runs_bytes = sizeof(rle16_t) * container->n_runs;
     int32_t off = 0;

     memcpy(buf, &container->n_runs, n_runs_bytes);
     off += n_runs_bytes;
     memcpy(&buf[off], &container->capacity, capacity_bytes);
     off += capacity_bytes;
     memcpy(&buf[off], container->runs, runs_bytes);

     return (off + runs_bytes);
 }

 /* Write the container to `buf` in the portable layout: a 16-bit run count
  * followed by the n_runs run entries.  Returns
  * run_container_size_in_bytes(container).
  *
  * The count is first converted to a uint16_t and that value is copied,
  * instead of copying the leading bytes of the n_runs field itself: the
  * latter only yields the right number when the low-order bytes of the
  * field come first in memory.  The run data is only copied when there are
  * runs, since an empty container may have no buffer.
  */
 int32_t run_container_write(const run_container_t *container, char *buf) {
     const uint16_t n_runs16 = (uint16_t)container->n_runs;
     memcpy(buf, &n_runs16, sizeof(uint16_t));
     if (container->n_runs > 0) {
         memcpy(buf + sizeof(uint16_t), container->runs,
                container->n_runs * sizeof(rle16_t));
     }
     return run_container_size_in_bytes(container);
 }

 /* Load a container from the layout produced by run_container_write: a
  * 16-bit run count followed by that many run entries.  `cardinality` is
  * unused.  The container is grown when its buffer is too small.  Returns
  * run_container_size_in_bytes(container).
  *
  * The count is read into a uint16_t and then assigned to n_runs, so the
  * whole field is set; copying two bytes straight into the field would leave
  * its remaining bytes at whatever they held before and would depend on
  * byte order.
  */
 int32_t run_container_read(int32_t cardinality, run_container_t *container,
                            const char *buf) {
     (void)cardinality;
     uint16_t n_runs16;
     memcpy(&n_runs16, buf, sizeof(uint16_t));
     container->n_runs = n_runs16;
     if (container->n_runs > container->capacity)
         run_container_grow(container, container->n_runs, false);
     if(container->n_runs > 0) {
       memcpy(container->runs, buf + sizeof(uint16_t),
            container->n_runs * sizeof(rle16_t));
     }
     return run_container_size_in_bytes(container);
 }

 /* Number of bytes run_container_serialize writes for this container: the
  * n_runs and capacity fields followed by n_runs run entries. */
 uint32_t run_container_serialization_len(const run_container_t *container) {
     const size_t header_bytes =
         sizeof(container->n_runs) + sizeof(container->capacity);
     const size_t runs_bytes = sizeof(rle16_t) * container->n_runs;
     return (header_bytes + runs_bytes);
 }

 /* Rebuild a run container from `buf_len` bytes at `buf`, laid out as
  * 4 bytes of n_runs, 4 bytes of capacity, then n_runs run entries.
  *
  * Returns the new container, or NULL when the buffer is too short, the run
  * count is negative, the payload size does not match the run count, an
  * allocation fails, or the run start values are not non-decreasing.
  *
  * The capacity stored in the buffer is not trusted: the run buffer is
  * allocated for exactly n_runs entries, so capacity is set to n_runs.
  * Keeping a larger stored capacity would let later insertions write past
  * the end of the allocation.
  */
 void *run_container_deserialize(const char *buf, size_t buf_len) {
     run_container_t *ptr;

     if (buf_len < 8 /* n_runs + capacity */)
         return (NULL);
     else
         buf_len -= 8;

     if ((ptr = (run_container_t *)malloc(sizeof(run_container_t))) != NULL) {
         size_t len;
         int32_t off;

         memcpy(&ptr->n_runs, buf, off = 4);
         memcpy(&ptr->capacity, &buf[off], 4);
         off += 4;

         /* A negative count is never valid; reject it explicitly instead of
          * relying on the size comparison below after unsigned conversion. */
         if (ptr->n_runs < 0) {
             free(ptr);
             return (NULL);
         }

         len = sizeof(rle16_t) * ptr->n_runs;

         if (len != buf_len) {
             free(ptr);
             return (NULL);
         }

         if ((ptr->runs = (rle16_t *)malloc(len)) == NULL) {
             free(ptr);
             return (NULL);
         }

         /* The buffer above holds exactly n_runs entries. */
         ptr->capacity = ptr->n_runs;

         memcpy(ptr->runs, &buf[off], len);

         /* Check if returned values are monotonically increasing */
         for (int32_t i = 0, j = 0; i < ptr->n_runs; i++) {
             if (ptr->runs[i].value < j) {
                 free(ptr->runs);
                 free(ptr);
                 return (NULL);
             } else
                 j = ptr->runs[i].value;
         }
     }

     return (ptr);
 }

 /* Call `iterator` on every value of the container (offset by `base`), in
  * the order of the runs.  Stops and returns false as soon as the callback
  * returns false; returns true when every value was visited. */
 bool run_container_iterate(const run_container_t *cont, uint32_t base,
                            roaring_iterator iterator, void *ptr) {
     int r = 0;
     while (r < cont->n_runs) {
         const uint32_t first = base + cont->runs[r].value;
         const uint16_t span = cont->runs[r].length;

         for (int k = 0; k <= span; ++k) {
             const bool keep_going = iterator(first + k, ptr);
             if (!keep_going) return false;
         }
         ++r;
     }
     return true;
 }

 /* 64-bit flavour of run_container_iterate: each 32-bit value (run value
  * plus `base`) is OR-ed with `high_bits` before being handed to the
  * callback.  Stops and returns false as soon as the callback returns
  * false; returns true otherwise. */
 bool run_container_iterate64(const run_container_t *cont, uint32_t base,
                              roaring_iterator64 iterator, uint64_t high_bits,
                              void *ptr) {
     int r = 0;
     while (r < cont->n_runs) {
         const uint32_t first = base + cont->runs[r].value;
         const uint16_t span = cont->runs[r].length;

         for (int k = 0; k <= span; ++k) {
             const uint64_t value = high_bits | (uint64_t)(first + k);
             if (!iterator(value, ptr)) return false;
         }
         ++r;
     }
     return true;
 }

 /* Return true when every run of container1 has been matched against the
  * runs of container2 without finding a run of container1 that starts
  * before the container2 run it is compared with. */
 bool run_container_is_subset(const run_container_t *container1,
                              const run_container_t *container2) {
     int i1 = 0;
     int i2 = 0;
     while (i1 < container1->n_runs && i2 < container2->n_runs) {
         const int start1 = container1->runs[i1].value;
         const int stop1 = start1 + container1->runs[i1].length;
         const int start2 = container2->runs[i2].value;
         const int stop2 = start2 + container2->runs[i2].length;

         // a run of container1 beginning before the current run of
         // container2 cannot be covered
         if (start1 < start2) return false;

         // advance whichever run ends first; both when they end together
         const bool first_ends_no_later = stop1 <= stop2;
         const bool second_ends_no_later = stop2 <= stop1;
         if (first_ends_no_later) i1++;
         if (second_ends_no_later) i2++;
     }
     // subset exactly when every run of container1 has been consumed
     return i1 == container1->n_runs;
 }

 // TODO: write a smart_append_exclusive version to match the overloaded
 // 1-parameter Java version (or is it even used?)

 // follows the Java implementation closely
 // length is the rle-value.  Ie, run [10,12) uses a length value 1.
 //
 // XOR-appends the run starting at `start` with rle-length `length` to the
 // end of `src`: parts of the new run that overlap the current last run
 // cancel out, the rest is kept.  The caller must have reserved room for
 // one more run, since the slot after the last run may be written.
 // NOTE(review): the cases below presumably rely on `start` being >= the
 // start of the current last run, as run_container_xor feeds runs ordered
 // by start value -- confirm for other callers.
 void run_container_smart_append_exclusive(run_container_t *src,
                                           const uint16_t start,
                                           const uint16_t length) {
     // old_end: one past the last value of the current last run; it is only
     // assigned (and only used) when src already holds a run
     int old_end;
     rle16_t *last_run = src->n_runs ? src->runs + (src->n_runs - 1) : NULL;
     rle16_t *appended_last_run = src->runs + src->n_runs;

     // empty container, or a gap before the new run: plain append
     if (!src->n_runs ||
         (start > (old_end = last_run->value + last_run->length + 1))) {
         *appended_last_run = (rle16_t){.value = start, .length = length};
         src->n_runs++;
         return;
     }
     if (old_end == start) {
         // we merge
         // (the new run begins right where the last one ends)
         last_run->length += (length + 1);
         return;
     }
     // new_end: one past the last value of the new run
     int new_end = start + length + 1;

     if (start == last_run->value) {
         // wipe out previous
         // both runs start together, so only the stretch between the two
         // ends survives
         if (new_end < old_end) {
             *last_run = (rle16_t){.value = (uint16_t)new_end,
                                   .length = (uint16_t)(old_end - new_end - 1)};
             return;
         } else if (new_end > old_end) {
             *last_run = (rle16_t){.value = (uint16_t)old_end,
                                   .length = (uint16_t)(new_end - old_end - 1)};
             return;
         } else {
             // identical runs cancel completely
             src->n_runs--;
             return;
         }
     }
     // the new run starts inside the last run: keep the part of the last run
     // before `start` ...
     last_run->length = start - last_run->value - 1;
     // ... and append the stretch between the two ends, if there is one
     if (new_end < old_end) {
         *appended_last_run =
             (rle16_t){.value = (uint16_t)new_end,
                       .length = (uint16_t)(old_end - new_end - 1)};
         src->n_runs++;
     } else if (new_end > old_end) {
         *appended_last_run =
             (rle16_t){.value = (uint16_t)old_end,
                       .length = (uint16_t)(new_end - old_end - 1)};
         src->n_runs++;
     }
 }

 /* Look for the element of rank `rank`, where `*start_rank` is the rank of
  * the first value of this container.  On success the element is stored in
  * `*element` and true is returned.  Otherwise `*start_rank` has been
  * advanced by the number of values in the container and false is returned. */
 bool run_container_select(const run_container_t *container,
                           uint32_t *start_rank, uint32_t rank,
                           uint32_t *element) {
     for (int i = 0; i < container->n_runs; i++) {
         const uint16_t length = container->runs[i].length;
         if (rank > *start_rank + length) {
             // the wanted rank lies beyond this run: skip all its values
             *start_rank += length + 1;
             continue;
         }
         const uint16_t value = container->runs[i].value;
         *element = value + rank - (*start_rank);
         return true;
     }
     return false;
 }

 /* Return the number of values in the container that are smaller than or
  * equal to `x`. */
 int run_container_rank(const run_container_t *container, uint16_t x) {
     int sum = 0;
     const uint32_t x32 = x;
     for (int i = 0; i < container->n_runs; i++) {
         const uint32_t startpoint = container->runs[i].value;
         const uint32_t length = container->runs[i].length;
         const uint32_t endpoint = length + startpoint;
         if (x > endpoint) {
             // the whole run lies at or below x: count all of its values
             sum += length + 1;
             continue;
         }
         // x is not beyond this run; stop if it falls in the gap before it
         if (x < startpoint) break;
         // x is inside the run: count the values up to and including x
         return sum + (x32 - startpoint) + 1;
     }
     return sum;
 }
 /* end file src/containers/run.c */
 /* begin file src/roaring.c */
 #include <assert.h>
 #include <stdarg.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <string.h>
 #include <inttypes.h>

 /* True when the ROARING_FLAG_COW bit is set in the bitmap's flags. */
 static inline bool is_cow(const roaring_bitmap_t *r) {
     return (r->high_low_container.flags & ROARING_FLAG_COW) != 0;
 }
 /* True when the ROARING_FLAG_FROZEN bit is set in the bitmap's flags. */
 static inline bool is_frozen(const roaring_bitmap_t *r) {
     return (r->high_low_container.flags & ROARING_FLAG_FROZEN) != 0;
 }

 // This is like roaring_bitmap_add, but it also reports, through the pointer
 // arguments, which container was touched (its type code and its index in
 // the bitmap).  Callers that repeatedly add to the same container can use
 // this to skip the lookup.  The container holding the value is returned.
 static inline void *containerptr_roaring_bitmap_add(roaring_bitmap_t *r,
                                                     uint32_t val,
                                                     uint8_t *typecode,
                                                     int *index) {
     uint16_t hb = val >> 16;
     const int i = ra_get_index(&r->high_low_container, hb);

     if (i < 0) {
         // no container for this key yet: start a new array container and
         // insert it at the position encoded in the negative index
         array_container_t *newac = array_container_create();
         void *container = container_add(newac, val & 0xFFFF,
                                         ARRAY_CONTAINER_TYPE_CODE, typecode);
         // we could just assume that it stays an array container
         ra_insert_new_key_value_at(&r->high_low_container, -i - 1, hb,
                                    container, *typecode);
         *index = -i - 1;
         return container;
     }

     // a container for this key exists: make sure it is not shared, then add
     ra_unshare_container_at_index(&r->high_low_container, i);
     void *existing =
         ra_get_container_at_index(&r->high_low_container, i, typecode);
     uint8_t newtypecode = *typecode;
     void *updated =
         container_add(existing, val & 0xFFFF, *typecode, &newtypecode);
     *index = i;
     if (updated == existing) {
         return existing;
     }
     // the add produced a different container: swap it in and free the old
     container_free(existing, *typecode);
     ra_set_container_at_index(&r->high_low_container, i, updated,
                               newtypecode);
     *typecode = newtypecode;
     return updated;
 }

 /* Allocate a bitmap and initialise its container array with ra_init.
  * Returns NULL when the allocation fails. */
 roaring_bitmap_t *roaring_bitmap_create(void) {
     roaring_bitmap_t *bitmap =
         (roaring_bitmap_t *)malloc(sizeof(roaring_bitmap_t));
     if (bitmap != NULL) {
         ra_init(&bitmap->high_low_container);
     }
     return bitmap;
 }

 /**
  * Allocates a new bitmap whose container array is initialized through
  * ra_init_with_capacity(cap). Returns NULL when either the bitmap allocation
  * or that initialization fails; nothing is leaked in the latter case.
  */
 roaring_bitmap_t *roaring_bitmap_create_with_capacity(uint32_t cap) {
     roaring_bitmap_t *bitmap =
         (roaring_bitmap_t *)malloc(sizeof(roaring_bitmap_t));
     if (bitmap == NULL) {
         return NULL;
     }
     if (ra_init_with_capacity(&bitmap->high_low_container, cap)) {
         return bitmap;
     }
     // The container array could not be set up: give the struct back.
     free(bitmap);
     return NULL;
 }

 /**
  * Adds the n_args values stored at vals to r.
  *
  * The container touched by the previous value is remembered; as long as the
  * next value has the same high 16 bits it is inserted straight into that
  * container, skipping the key lookup. Values are read with memcpy, so vals
  * is never dereferenced as a uint32_t directly.
  */
 void roaring_bitmap_add_many(roaring_bitmap_t *r, size_t n_args,
                              const uint32_t *vals) {
     if (n_args == 0) return;

     uint8_t cur_type = 0;  // type code of the remembered container
     int cur_index = 0;     // its position in the container array
     uint32_t value;

     // The first value always goes through the full lookup.
     memcpy(&value, vals, sizeof(value));
     void *cur =
         containerptr_roaring_bitmap_add(r, value, &cur_type, &cur_index);
     uint32_t last = value;

     for (size_t k = 1; k < n_args; k++) {
         memcpy(&value, vals + k, sizeof(value));
         const bool same_key = (last >> 16) == (value >> 16);
         last = value;

         if (!same_key) {
             // Different high bits: locate (or create) the right container.
             cur = containerptr_roaring_bitmap_add(r, value, &cur_type,
                                                   &cur_index);
             continue;
         }

         // Same container as before: insert without calling
         // roaring_bitmap_add.
         uint8_t next_type = cur_type;
         void *next =
             container_add(cur, value & 0xFFFF, cur_type, &next_type);
         if (next != cur) {
             // Rare case: the insertion changed the container, so the old one
             // is released and the array slot updated.
             container_free(cur, cur_type);
             ra_set_container_at_index(&r->high_low_container, cur_index,
                                       next, next_type);
             cur_type = next_type;
             cur = next;
         }
     }
 }

 /**
  * Creates a new bitmap containing the n_args values stored at vals.
  * Returns NULL when the bitmap cannot be allocated.
  */
 roaring_bitmap_t *roaring_bitmap_of_ptr(size_t n_args, const uint32_t *vals) {
     roaring_bitmap_t *answer = roaring_bitmap_create();
     if (answer == NULL) {
         // roaring_bitmap_create reports allocation failure with NULL; do not
         // hand that to roaring_bitmap_add_many, which dereferences it.
         return NULL;
     }
     roaring_bitmap_add_many(answer, n_args, vals);
     return answer;
 }

 /**
  * Creates a new bitmap from n_args variadic arguments, each read as a
  * uint32_t. Returns NULL when the bitmap cannot be allocated.
  *
  * Values are added one at a time; this is not expected to be called with
  * long argument lists.
  */
 roaring_bitmap_t *roaring_bitmap_of(size_t n_args, ...) {
     roaring_bitmap_t *answer = roaring_bitmap_create();
     if (answer == NULL) {
         // Allocation failed: bail out before touching the argument list.
         return NULL;
     }
     va_list ap;
     va_start(ap, n_args);
     for (size_t i = 0; i < n_args; i++) {
         uint32_t val = va_arg(ap, uint32_t);
         roaring_bitmap_add(answer, val);
     }
     va_end(ap);
     return answer;
 }

 /* Returns the smaller of two 32-bit unsigned values. */
 static inline uint32_t minimum_uint32(uint32_t a, uint32_t b) {
     if (b < a) {
         return b;
     }
     return a;
 }

 /* Returns the smaller of two 64-bit unsigned values. */
 static inline uint64_t minimum_uint64(uint64_t a, uint64_t b) {
     if (b < a) {
         return b;
     }
     return a;
 }

 /**
  * Creates a bitmap holding min, min + step, min + 2 * step, ... for every
  * such value strictly below max. max is clamped to 2^32 first.
  *
  * Returns NULL when step is 0, when the (clamped) range is empty, or when
  * the bitmap cannot be allocated.
  */
 roaring_bitmap_t *roaring_bitmap_from_range(uint64_t min, uint64_t max,
                                             uint32_t step) {
     if(max >= UINT64_C(0x100000000)) {
         max = UINT64_C(0x100000000);
     }
     if (step == 0) return NULL;
     if (max <= min) return NULL;
     roaring_bitmap_t *answer = roaring_bitmap_create();
     if (answer == NULL) return NULL;
     if (step >= (1 << 16)) {
         // With a step this large, values are added one by one. The counter
         // is 64 bits wide on purpose: a 32-bit counter wraps around when
         // step is added near the top of the range, which makes
         // `value < max` true again so the loop adds wrong values and does
         // not terminate. min < max <= 2^32 here, so the cast below is
         // lossless.
         for (uint64_t value = min; value < max; value += step) {
             roaring_bitmap_add(answer, (uint32_t)value);
         }
         return answer;
     }
     // Small steps: build one container per 16-bit key directly from the
     // part of the progression that falls inside it.
     uint64_t min_tmp = min;
     do {
         uint32_t key = (uint32_t)min_tmp >> 16;
         uint32_t container_min = min_tmp & 0xFFFF;
         // Upper bound within this container, capped at the container size.
         uint32_t container_max = (uint32_t)minimum_uint64(max - (key << 16), 1 << 16);
         uint8_t type;
         void *container = container_from_range(&type, container_min,
                                                container_max, (uint16_t)step);
         ra_append(&answer->high_low_container, key, container, type);
         // Advance by the span covered here rounded up to a multiple of
         // step, i.e. to the first element of the progression that is not
         // below this container's upper bound.
         uint32_t gap = container_max - container_min + step - 1;
         min_tmp += gap - (gap % step);
     } while (min_tmp < max);
     // cardinality of bitmap will be ((uint64_t) max - min + step - 1 ) / step
     return answer;
 }

 /**
  * Adds every value of the closed interval [min, max] to ra.
  * Does nothing when min > max.
  */
 void roaring_bitmap_add_range_closed(roaring_bitmap_t *ra, uint32_t min, uint32_t max) {
     if (min > max) {
         return;
     }

     // High 16 bits of the bounds: the first and last container keys touched.
     uint32_t min_key = min >> 16;
     uint32_t max_key = max >> 16;

     // One container is needed for each key in [min_key, max_key].
     int32_t num_required_containers = max_key - min_key + 1;
     // NOTE(review): judging by the helper names, suffix_length is the number
     // of stored keys above max_key and prefix_length the number below
     // min_key -- confirm against count_greater / count_less.
     int32_t suffix_length = count_greater(ra->high_low_container.keys,
                                           ra->high_low_container.size,
                                           max_key);
     int32_t prefix_length = count_less(ra->high_low_container.keys,
                                        ra->high_low_container.size - suffix_length,
                                        min_key);
     // Entries that are neither prefix nor suffix.
     int32_t common_length = ra->high_low_container.size - prefix_length - suffix_length;

     // More keys are required than entries are available in the middle part.
     // NOTE(review): ra_shift_tail presumably moves the last suffix_length
     // entries by the given distance and adjusts size -- confirm.
     if (num_required_containers > common_length) {
         ra_shift_tail(&ra->high_low_container, suffix_length,
                       num_required_containers - common_length);
     }

     // src: index of the last pre-existing middle entry still to be consumed.
     // dst: index of the last slot in front of the suffix, filled next.
     int32_t src = prefix_length + common_length - 1;
     int32_t dst = ra->high_low_container.size - suffix_length - 1;
     // Keys are visited from max_key down to min_key. The loop ends on
     // `key != min_key-1`: when min_key is 0 that bound wraps around in
     // uint32_t, and so does key after the key-0 iteration.
     for (uint32_t key = max_key; key != min_key-1; key--) { // beware of min_key==0
         // Only the first and last containers are partially covered; those in
         // between receive the full 0..0xffff range.
         uint32_t container_min = (min_key == key) ? (min & 0xffff) : 0;
         uint32_t container_max = (max_key == key) ? (max & 0xffff) : 0xffff;
         void* new_container;
         uint8_t new_type;

         if (src >= 0 && ra->high_low_container.keys[src] == key) {
             // A container already exists for this key: extend it, releasing
             // the old one if a different container comes back.
             ra_unshare_container_at_index(&ra->high_low_container, src);
             new_container = container_add_range(ra->high_low_container.containers[src],
                                                 ra->high_low_container.typecodes[src],
                                                 container_min, container_max, &new_type);
             if (new_container != ra->high_low_container.containers[src]) {
                 container_free(ra->high_low_container.containers[src],
                                ra->high_low_container.typecodes[src]);
             }
             src--;
         } else {
             // No container for this key: build one from the range. The upper
             // bound is passed as container_max+1 with a step of 1.
             new_container = container_from_range(&new_type, container_min,
                                                  container_max+1, 1);
         }
         // Store key and container in the destination slot, moving backwards.
         ra_replace_key_and_container_at_index(&ra->high_low_container, dst,
                                               key, new_container, new_type);
         dst--;
     }
 }

 /**
  * Removes every value of the closed interval [min, max] from ra.
  * Does nothing when min > max.
  */
 void roaring_bitmap_remove_range_closed(roaring_bitmap_t *ra, uint32_t min, uint32_t max) {
     if (min > max) {
         return;
     }

     // High 16 bits of the bounds: the first and last container keys touched.
     uint32_t min_key = min >> 16;
     uint32_t max_key = max >> 16;

     // src reads the existing entries, dst is where surviving entries are
     // written back; both start at the value returned by count_less.
     // NOTE(review): presumably that is the number of keys below min_key,
     // i.e. the index of the first affected entry -- confirm.
     int32_t src = count_less(ra->high_low_container.keys, ra->high_low_container.size, min_key);
     int32_t dst = src;
     while (src < ra->high_low_container.size && ra->high_low_container.keys[src] <= max_key) {
         // Only the containers at min_key and max_key are partially covered;
         // for any other key the whole 0..0xffff range is removed.
         uint32_t container_min = (min_key == ra->high_low_container.keys[src]) ? (min & 0xffff) : 0;
         uint32_t container_max = (max_key == ra->high_low_container.keys[src]) ? (max & 0xffff) : 0xffff;
         ra_unshare_container_at_index(&ra->high_low_container, src);
         void *new_container;
         uint8_t new_type;
         new_container = container_remove_range(ra->high_low_container.containers[src],
                                                ra->high_low_container.typecodes[src],
                                                container_min, container_max,
                                                &new_type);
         // A different pointer (including NULL) means the old container is no
         // longer the result and has to be released here.
         if (new_container != ra->high_low_container.containers[src]) {
             container_free(ra->high_low_container.containers[src],
                            ra->high_low_container.typecodes[src]);
         }
         // A NULL result is dropped; anything else is compacted towards dst.
         if (new_container) {
             ra_replace_key_and_container_at_index(&ra->high_low_container, dst,
                                                   ra->high_low_container.keys[src],
                                                   new_container, new_type);
             dst++;
         }
         src++;
     }
     // Some entries were dropped: close the gap between dst and src.
     // NOTE(review): ra_shift_tail is called with the count of entries from
     // src onwards and a negative distance -- confirm it also shrinks size.
     if (src > dst) {
         ra_shift_tail(&ra->high_low_container, ra->high_low_container.size - src, dst - src);
     }
 }

 /**
  * Prints the content of the bitmap to stdout as "{...}", delegating each
  * container to container_printf_as_uint32_array and separating containers
  * with a comma.
  */
 void roaring_bitmap_printf(const roaring_bitmap_t *ra) {
     const int count = ra->high_low_container.size;

     printf("{");
     for (int idx = 0; idx < count; ++idx) {
         // The key supplies the upper 16 bits of every value in the container.
         const uint32_t base =
             ((uint32_t)ra->high_low_container.keys[idx]) << 16;
         container_printf_as_uint32_array(
             ra->high_low_container.containers[idx],
             ra->high_low_container.typecodes[idx], base);
         if (idx + 1 < count) {
             printf(",");
         }
     }
     printf("}");
 }

 /**
  * Prints a structural summary of the bitmap to stdout: for each container
  * its key, the name returned by get_full_container_name and its
  * cardinality, plus the share counter for shared containers.
  */
 void roaring_bitmap_printf_describe(const roaring_bitmap_t *ra) {
     const int count = ra->high_low_container.size;

     printf("{");
     for (int idx = 0; idx < count; ++idx) {
         void *c = ra->high_low_container.containers[idx];
         const uint8_t code = ra->high_low_container.typecodes[idx];

         printf("%d: %s (%d)", ra->high_low_container.keys[idx],
                get_full_container_name(c, code),
                container_get_cardinality(c, code));
         if (code == SHARED_CONTAINER_TYPE_CODE) {
             printf("(shared count = %" PRIu32 " )",
                    ((shared_container_t *)c)->counter);
         }

         if (idx + 1 < count) {
             printf(", ");
         }
     }
     printf("}");
 }

 /* Accumulator for the smallest value, largest value and sum of all values
  * seen by min_max_sum_fnc. */
 typedef struct min_max_sum_s {
     uint32_t min;
     uint32_t max;
     uint64_t sum;
 } min_max_sum_t;

 /*
  * Iteration callback: folds `value` into the min_max_sum_t passed through
  * `param`. Always returns true so that the iteration visits every value.
  */
 static bool min_max_sum_fnc(uint32_t value, void *param) {
     min_max_sum_t *acc = (min_max_sum_t *)param;

     acc->sum += value;
     if (value < acc->min) {
         acc->min = value;
     }
     if (value > acc->max) {
         acc->max = value;
     }
     return true;
 }

 /**
  * (For advanced users.)
  * Fills *stat with statistics about the bitmap: number of containers,
  * cardinality, minimum / maximum / sum of the stored values, and for each
  * container kind (bitset, array, run) the number of containers, of values
  * and of bytes. *stat is zeroed first.
  */
 void roaring_bitmap_statistics(const roaring_bitmap_t *ra,
                                roaring_statistics_t *stat) {
     min_max_sum_t acc;
     acc.min = UINT32_C(0xFFFFFFFF);
     acc.max = UINT32_C(0);
     acc.sum = 0;

     memset(stat, 0, sizeof(*stat));
     stat->n_containers = ra->high_low_container.size;
     stat->cardinality = roaring_bitmap_get_cardinality(ra);

     // One pass over all values collects min, max and sum.
     roaring_iterate(ra, &min_max_sum_fnc, &acc);
     stat->min_value = acc.min;
     stat->max_value = acc.max;
     stat->sum_value = acc.sum;

     // Per-container counters, bucketed by the type reported by
     // get_container_type.
     for (int idx = 0; idx < ra->high_low_container.size; ++idx) {
         void *c = ra->high_low_container.containers[idx];
         const uint8_t code = ra->high_low_container.typecodes[idx];
         const uint32_t values = container_get_cardinality(c, code);
         const uint32_t bytes = container_size_in_bytes(c, code);

         switch (get_container_type(c, code)) {
             case BITSET_CONTAINER_TYPE_CODE:
                 stat->n_bitset_containers++;
                 stat->n_values_bitset_containers += values;
                 stat->n_bytes_bitset_containers += bytes;
                 break;
             case ARRAY_CONTAINER_TYPE_CODE:
                 stat->n_array_containers++;
                 stat->n_values_array_containers += values;
                 stat->n_bytes_array_containers += bytes;
                 break;
             case RUN_CONTAINER_TYPE_CODE:
                 stat->n_run_containers++;
                 stat->n_values_run_containers += values;
                 stat->n_bytes_run_containers += bytes;
                 break;
             default:
                 assert(false);
                 __builtin_unreachable();
         }
     }
 }

 /**
  * Returns a newly allocated copy of r, carrying over r's copy-on-write
  * setting. Returns NULL when the allocation or the container-array copy
  * (ra_copy) fails.
  */
 roaring_bitmap_t *roaring_bitmap_copy(const roaring_bitmap_t *r) {
     roaring_bitmap_t *clone =
         (roaring_bitmap_t *)malloc(sizeof(roaring_bitmap_t));
     if (clone == NULL) {
         return NULL;
     }
     if (!ra_copy(&r->high_low_container, &clone->high_low_container,
                  is_cow(r))) {
         free(clone);
         return NULL;
     }
     roaring_bitmap_set_copy_on_write(clone, is_cow(r));
     return clone;
 }

 /**
  * Replaces the content of dest with that of src by calling ra_overwrite
  * with src's copy-on-write setting, and returns that call's result.
  */
 bool roaring_bitmap_overwrite(roaring_bitmap_t *dest,
                               const roaring_bitmap_t *src) {
     const bool copy_on_write = is_cow(src);
     return ra_overwrite(&src->high_low_container, &dest->high_low_container,
                         copy_on_write);
 }

 /**
  * Releases a bitmap. Passing NULL is allowed and does nothing, mirroring
  * free(); this matters because the constructors in this file return NULL on
  * allocation failure.
  *
  * For a frozen bitmap (is_frozen) only the struct itself is freed and
  * ra_clear is skipped.
  * NOTE(review): presumably a frozen bitmap does not own separately
  * allocated containers -- confirm against the frozen-view code.
  */
 void roaring_bitmap_free(const roaring_bitmap_t *r) {
     if (r == NULL) {
         return;
     }
     if (!is_frozen(r)) {
         // The parameter is const only for caller convenience; the container
         // array is about to be torn down.
         ra_clear((roaring_array_t *)&r->high_low_container);
     }
     free((roaring_bitmap_t *)r);
 }

 /**
  * Resets the bitmap's container array through ra_reset; the bitmap object
  * itself stays allocated.
  */
 void roaring_bitmap_clear(roaring_bitmap_t *r) {
     ra_reset(&r->high_low_container);
 }

 /**
  * Adds val to the bitmap. The high 16 bits select the container, the low
  * 16 bits are the value stored inside it.
  */
 void roaring_bitmap_add(roaring_bitmap_t *r, uint32_t val) {
     const uint16_t high_bits = val >> 16;
     const int pos = ra_get_index(&r->high_low_container, high_bits);
     uint8_t type;

     if (pos < 0) {
         // No container for this key yet: create an array container holding
         // the value and insert it at the slot encoded in the lookup result.
         array_container_t *fresh = array_container_create();
         void *created = container_add(fresh, val & 0xFFFF,
                                       ARRAY_CONTAINER_TYPE_CODE, &type);
         ra_insert_new_key_value_at(&r->high_low_container, -pos - 1,
                                    high_bits, created, type);
         return;
     }

     ra_unshare_container_at_index(&r->high_low_container, pos);
     void *current =
         ra_get_container_at_index(&r->high_low_container, pos, &type);
     uint8_t updated_type = type;
     void *updated =
         container_add(current, val & 0xFFFF, type, &updated_type);
     if (updated != current) {
         // The insertion produced a different container: swap it in.
         container_free(current, type);
         ra_set_container_at_index(&r->high_low_container, pos, updated,
                                   updated_type);
     }
 }

 /**
  * Adds val to the bitmap and reports whether the bitmap changed.
  *
  * Returns true when a new container had to be created, when the insertion
  * replaced the container, or when the container's cardinality differs
  * after the insertion; false otherwise.
  */
 bool roaring_bitmap_add_checked(roaring_bitmap_t *r, uint32_t val) {
     const uint16_t high_bits = val >> 16;
     const int pos = ra_get_index(&r->high_low_container, high_bits);
     uint8_t type;

     if (pos < 0) {
         // No container for this key yet, so the value is necessarily new.
         array_container_t *fresh = array_container_create();
         void *created = container_add(fresh, val & 0xFFFF,
                                       ARRAY_CONTAINER_TYPE_CODE, &type);
         ra_insert_new_key_value_at(&r->high_low_container, -pos - 1,
                                    high_bits, created, type);
         return true;
     }

     ra_unshare_container_at_index(&r->high_low_container, pos);
     void *current =
         ra_get_container_at_index(&r->high_low_container, pos, &type);
     const int card_before = container_get_cardinality(current, type);

     uint8_t updated_type = type;
     void *updated =
         container_add(current, val & 0xFFFF, type, &updated_type);
     if (updated != current) {
         container_free(current, type);
         ra_set_container_at_index(&r->high_low_container, pos, updated,
                                   updated_type);
         return true;
     }

     // Same container: the value was new iff the cardinality moved.
     const int card_after = container_get_cardinality(current, updated_type);
     return card_before != card_after;
 }

 /**
  * Removes val from the bitmap. If no container exists for val's high 16
  * bits nothing happens; a container left empty by the removal is removed
  * from the bitmap and freed.
  */
 void roaring_bitmap_remove(roaring_bitmap_t *r, uint32_t val) {
     const uint16_t high_bits = val >> 16;
     const int pos = ra_get_index(&r->high_low_container, high_bits);
     if (pos < 0) {
         return;
     }

     uint8_t type;
     ra_unshare_container_at_index(&r->high_low_container, pos);
     void *current =
         ra_get_container_at_index(&r->high_low_container, pos, &type);
     uint8_t updated_type = type;
     void *updated =
         container_remove(current, val & 0xFFFF, type, &updated_type);
     if (updated != current) {
         // The removal produced a different container: swap it in.
         container_free(current, type);
         ra_set_container_at_index(&r->high_low_container, pos, updated,
                                   updated_type);
     }

     if (container_get_cardinality(updated, updated_type) == 0) {
         ra_remove_at_index_and_free(&r->high_low_container, pos);
     } else {
         ra_set_container_at_index(&r->high_low_container, pos, updated,
                                   updated_type);
     }
 }

 /**
  * Removes val from the bitmap and reports whether the bitmap changed.
  *
  * Returns true when the cardinality of the container holding val differs
  * after the removal, false otherwise (including when no container exists
  * for val's high 16 bits). A container left empty is removed and freed.
  */
 bool roaring_bitmap_remove_checked(roaring_bitmap_t *r, uint32_t val) {
     const uint16_t high_bits = val >> 16;
     const int pos = ra_get_index(&r->high_low_container, high_bits);
     if (pos < 0) {
         return false;
     }

     uint8_t type;
     ra_unshare_container_at_index(&r->high_low_container, pos);
     void *current =
         ra_get_container_at_index(&r->high_low_container, pos, &type);
     const int card_before = container_get_cardinality(current, type);

     uint8_t updated_type = type;
     void *updated =
         container_remove(current, val & 0xFFFF, type, &updated_type);
     if (updated != current) {
         // The removal produced a different container: swap it in.
         container_free(current, type);
         ra_set_container_at_index(&r->high_low_container, pos, updated,
                                   updated_type);
     }

     const int card_after = container_get_cardinality(updated, updated_type);
     if (card_after == 0) {
         ra_remove_at_index_and_free(&r->high_low_container, pos);
     } else {
         ra_set_container_at_index(&r->high_low_container, pos, updated,
                                   updated_type);
     }

     return card_before != card_after;
 }

 /**
  * Removes the n_args values stored at vals from r.
  *
  * The position of the container used for the previous value is cached, so
  * consecutive values with the same high 16 bits skip the key lookup.
  * Containers left empty are removed from the bitmap and freed.
  */
 void roaring_bitmap_remove_many(roaring_bitmap_t *r, size_t n_args,
                                 const uint32_t *vals) {
     if (n_args == 0 || r->high_low_container.size == 0) {
         return;
     }
     int32_t pos = -1; // position of the container used in the previous iteration
     for (size_t i = 0; i < n_args; i++) {
         uint16_t key = (uint16_t)(vals[i] >> 16);
         // Look the key up again only when the cached position is stale.
         if (pos < 0 || key != r->high_low_container.keys[pos]) {
             pos = ra_get_index(&r->high_low_container, key);
         }
         if (pos >= 0) {
             // Make the slot hold a private container before mutating it, as
             // roaring_bitmap_remove and the other mutators in this file do.
             // The pointer comparison and container_free below operate on
             // the slot itself, so it must not still refer to a container
             // shared with another bitmap.
             ra_unshare_container_at_index(&r->high_low_container, pos);

             uint8_t new_typecode;
             void *new_container;
             new_container = container_remove(r->high_low_container.containers[pos],
                                              vals[i] & 0xffff,
                                              r->high_low_container.typecodes[pos],
                                              &new_typecode);
             if (new_container != r->high_low_container.containers[pos]) {
                 // The removal produced a different container: swap it in.
                 container_free(r->high_low_container.containers[pos],
                                r->high_low_container.typecodes[pos]);
                 ra_replace_key_and_container_at_index(&r->high_low_container,
                                                       pos, key, new_container,
                                                       new_typecode);
             }
             if (!container_nonzero_cardinality(new_container, new_typecode)) {
                 // Nothing left under this key: drop the entry and invalidate
                 // the cached position, since the indices have changed.
                 container_free(new_container, new_typecode);
                 ra_remove_at_index(&r->high_low_container, pos);
                 pos = -1;
             }
         }
     }
 }

 /**
  * Returns a new bitmap holding the intersection of x1 and x2, or NULL when
  * the result bitmap cannot be allocated. The result is copy-on-write only
  * when both inputs are.
  *
  * Both key arrays are walked in step; only keys present in both inputs can
  * contribute a container.
  */
 // there should be some SIMD optimizations possible here
 roaring_bitmap_t *roaring_bitmap_and(const roaring_bitmap_t *x1,
                                      const roaring_bitmap_t *x2) {
     uint8_t container_result_type = 0;
     const int length1 = x1->high_low_container.size,
               length2 = x2->high_low_container.size;
     // The intersection cannot have more containers than the smaller input.
     uint32_t neededcap = length1 > length2 ? length2 : length1;
     roaring_bitmap_t *answer = roaring_bitmap_create_with_capacity(neededcap);
     if (answer == NULL) {
         // Allocation failed; report it instead of dereferencing NULL below.
         return NULL;
     }
     roaring_bitmap_set_copy_on_write(answer, is_cow(x1) && is_cow(x2));

     int pos1 = 0, pos2 = 0;

     while (pos1 < length1 && pos2 < length2) {
         const uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
         const uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2);

         if (s1 == s2) {
             uint8_t container_type_1, container_type_2;
             void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
                                                  &container_type_1);
             void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
                                                  &container_type_2);
             void *c = container_and(c1, container_type_1, c2, container_type_2,
                                     &container_result_type);
             if (container_nonzero_cardinality(c, container_result_type)) {
                 ra_append(&answer->high_low_container, s1, c,
                           container_result_type);
             } else {
                 // An empty intersection is not stored and must be released.
                 container_free(c, container_result_type);
             }
             ++pos1;
             ++pos2;
         } else if (s1 < s2) {
             // Key only in x1: skip ahead in x1 towards s2.
             pos1 = ra_advance_until(&x1->high_low_container, s2, pos1);
         } else {
             // Key only in x2: skip ahead in x2 towards s1.
             pos2 = ra_advance_until(&x2->high_low_container, s1, pos2);
         }
     }
     return answer;
 }

 /**
  * Compute the union of 'number' bitmaps.
  *
  * With no input an empty bitmap is returned, with a single input a copy of
  * it. Otherwise the inputs are combined with the lazy OR routines and the
  * result is repaired once at the end.
  */
 roaring_bitmap_t *roaring_bitmap_or_many(size_t number,
                                          const roaring_bitmap_t **x) {
     switch (number) {
         case 0:
             return roaring_bitmap_create();
         case 1:
             return roaring_bitmap_copy(x[0]);
         default:
             break;
     }

     roaring_bitmap_t *merged =
         roaring_bitmap_lazy_or(x[0], x[1], LAZY_OR_BITSET_CONVERSION);
     size_t next = 2;
     while (next < number) {
         roaring_bitmap_lazy_or_inplace(merged, x[next],
                                        LAZY_OR_BITSET_CONVERSION);
         next++;
     }
     roaring_bitmap_repair_after_lazy(merged);
     return merged;
 }

 /**
  * Compute the xor of 'number' bitmaps.
  *
  * With no input an empty bitmap is returned, with a single input a copy of
  * it. Otherwise the inputs are combined with the lazy XOR routines and the
  * result is repaired once at the end.
  */
 roaring_bitmap_t *roaring_bitmap_xor_many(size_t number,
                                           const roaring_bitmap_t **x) {
     switch (number) {
         case 0:
             return roaring_bitmap_create();
         case 1:
             return roaring_bitmap_copy(x[0]);
         default:
             break;
     }

     roaring_bitmap_t *merged = roaring_bitmap_lazy_xor(x[0], x[1]);
     size_t next = 2;
     while (next < number) {
         roaring_bitmap_lazy_xor_inplace(merged, x[next]);
         next++;
     }
     roaring_bitmap_repair_after_lazy(merged);
     return merged;
 }

 /**
  * In-place intersection: x1 becomes x1 AND x2 (x2 is only read).
  * Calling it with x1 == x2 is a no-op.
  *
  * Surviving containers are compacted to the front of x1's container array;
  * intersection_size counts how many have been written so far.
  */
 void roaring_bitmap_and_inplace(roaring_bitmap_t *x1,
                                 const roaring_bitmap_t *x2) {
     if (x1 == x2) return;
     int pos1 = 0, pos2 = 0, intersection_size = 0;
     const int length1 = ra_get_size(&x1->high_low_container);
     const int length2 = ra_get_size(&x2->high_low_container);

     // any skipped-over or newly emptied containers in x1
     // have to be freed.
     while (pos1 < length1 && pos2 < length2) {
         const uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
         const uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2);

         if (s1 == s2) {
             // Key present on both sides: intersect the two containers.
             uint8_t typecode1, typecode2, typecode_result;
             void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
                                                  &typecode1);
             // c1 is about to be modified, so it must not be a shared
             // container.
             c1 = get_writable_copy_if_shared(c1, &typecode1);
             void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
                                                  &typecode2);
             void *c =
                 container_iand(c1, typecode1, c2, typecode2, &typecode_result);
             if (c != c1) {  // in this instance a new container was created, and
                             // we need to free the old one
                 container_free(c1, typecode1);
             }
             if (container_nonzero_cardinality(c, typecode_result)) {
                 // Keep the result, writing it at the next compacted slot.
                 ra_replace_key_and_container_at_index(&x1->high_low_container,
                                                       intersection_size, s1, c,
                                                       typecode_result);
                 intersection_size++;
             } else {
                 // An empty result is dropped from x1 and released.
                 container_free(c, typecode_result);
             }
             ++pos1;
             ++pos2;
         } else if (s1 < s2) {
             // Key only in x1: advance in x1 towards s2.
             // NOTE(review): going by its name, ra_advance_until_freeing also
             // frees the containers it skips -- confirm.
             pos1 = ra_advance_until_freeing(&x1->high_low_container, s2, pos1);
         } else {  // s1 > s2
             // Key only in x2: nothing to free, just advance in x2.
             pos2 = ra_advance_until(&x2->high_low_container, s1, pos2);
         }
     }

     // if we ended early because x2 ran out, then all remaining in x1 should be
     // freed
     while (pos1 < length1) {
         container_free(x1->high_low_container.containers[pos1],
                        x1->high_low_container.typecodes[pos1]);
         ++pos1;
     }

     // all containers after this have either been copied or freed
     ra_downsize(&x1->high_low_container, intersection_size);
 }

 roaring_bitmap_t *roaring_bitmap_or(const roaring_bitmap_t *x1,
                                     const roaring_bitmap_t *x2) {
     uint8_t container_result_type = 0;
     const int length1 = x1->high_low_container.size,
               length2 = x2->high_low_container.size;
     if (0 == length1) {
         return roaring_bitmap_copy(x2);
     }
     if (0 == length2) {
         return roaring_bitmap_copy(x1);
     }
     roaring_bitmap_t *answer =
         roaring_bitmap_create_with_capacity(length1 + length2);
     roaring_bitmap_set_copy_on_write(answer, is_cow(x1) && is_cow(x2));
     int pos1 = 0, pos2 = 0;
     uint8_t container_type_1, container_type_2;
     uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
     uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
     while (true) {
         if (s1 == s2) {
             void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
                                                  &container_type_1);
             void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
                                                  &container_type_2);
             void *c = container_or(c1, container_type_1, c2, container_type_2,
                                    &container_result_type);
             // since we assume that the initial containers are non-empty, the
             // result here
             // can only be non-empty
             ra_append(&answer->high_low_container, s1, c,
                       container_result_type);
             ++pos1;
             ++pos2;
             if (pos1 == length1) break;
             if (pos2 == length2) break;
             s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
             s2 = ra_get_key_at_index(&x2->high_low_container, pos2);

         } else if (s1 < s2) {  // s1 < s2
             void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
                                                  &container_type_1);
             // c1 = container_clone(c1, container_type_1);
             c1 =
                 get_copy_of_container(c1, &container_type_1, is_cow(x1));
             if (is_cow(x1)) {
                 ra_set_container_at_index(&x1->high_low_container, pos1, c1,
                                           container_type_1);
             }
             ra_append(&answer->high_low_container, s1, c1, container_type_1);
             pos1++;
             if (pos1 == length1) break;
             s1 = ra_get_key_at_index(&x1->high_low_container, pos1);

         } else {  // s1 > s2
             void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
                                                  &container_type_2);
             // c2 = container_clone(c2, container_type_2);
             c2 =
                 get_copy_of_container(c2, &container_type_2, is_cow(x2));
             if (is_cow(x2)) {
                 ra_set_container_at_index(&x2->high_low_container, pos2, c2,
                                           container_type_2);
             }
             ra_append(&answer->high_low_container, s2, c2, container_type_2);
             pos2++;
             if (pos2 == length2) break;
             s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
         }
     }
     if (pos1 == length1) {
         ra_append_copy_range(&answer->high_low_container,
                              &x2->high_low_container, pos2, length2,
                              is_cow(x2));
     } else if (pos2 == length2) {
         ra_append_copy_range(&answer->high_low_container,
                              &x1->high_low_container, pos1, length1,
                              is_cow(x1));
     }
     return answer;
 }

 /**
  * In-place union: x1 becomes x1 OR x2 (modifies its first argument).
  *
  * Nothing happens when x2 has no containers; when x1 has none it is
  * overwritten with x2. Otherwise both key arrays are merged in key order,
  * OR-ing containers under a common key and inserting copies of x2's
  * containers for keys x1 lacks.
  */
 void roaring_bitmap_or_inplace(roaring_bitmap_t *x1,
                                const roaring_bitmap_t *x2) {
     uint8_t container_result_type = 0;
     // length1 is not const: it grows whenever a container is inserted into x1.
     int length1 = x1->high_low_container.size;
     const int length2 = x2->high_low_container.size;

     if (0 == length2) return;

     if (0 == length1) {
         roaring_bitmap_overwrite(x1, x2);
         return;
     }
     int pos1 = 0, pos2 = 0;
     uint8_t container_type_1, container_type_2;
     uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
     uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
     // Merge loop: leaves as soon as either input is exhausted.
     while (true) {
         if (s1 == s2) {
             void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
                                                  &container_type_1);
             // A container that is already full cannot gain anything from the
             // union, so it is left untouched.
             if (!container_is_full(c1, container_type_1)) {
                 c1 = get_writable_copy_if_shared(c1, &container_type_1);

                 void *c2 = ra_get_container_at_index(&x2->high_low_container,
                                                      pos2, &container_type_2);
                 void *c =
                     container_ior(c1, container_type_1, c2, container_type_2,
                                   &container_result_type);
                 if (c !=
                     c1) {  // in this instance a new container was created, and
                            // we need to free the old one
                     container_free(c1, container_type_1);
                 }

                 ra_set_container_at_index(&x1->high_low_container, pos1, c,
                                           container_result_type);
             }
             ++pos1;
             ++pos2;
             if (pos1 == length1) break;
             if (pos2 == length2) break;
             s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
             s2 = ra_get_key_at_index(&x2->high_low_container, pos2);

         } else if (s1 < s2) {  // s1 < s2
             // Key only in x1: its container is already part of the result.
             pos1++;
             if (pos1 == length1) break;
             s1 = ra_get_key_at_index(&x1->high_low_container, pos1);

         } else {  // s1 > s2
             // Key only in x2: insert a copy of x2's container into x1 in
             // front of the current x1 entry.
             void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
                                                  &container_type_2);
             c2 =
                 get_copy_of_container(c2, &container_type_2, is_cow(x2));
             if (is_cow(x2)) {
                 // For a copy-on-write input the container returned above is
                 // also stored back into x2's own slot.
                 ra_set_container_at_index(&x2->high_low_container, pos2, c2,
                                           container_type_2);
             }

             // void *c2_clone = container_clone(c2, container_type_2);
             ra_insert_new_key_value_at(&x1->high_low_container, pos1, s2, c2,
                                        container_type_2);
             // The insertion moved the current x1 entry one slot up; s1 stays
             // valid because pos1 follows it.
             pos1++;
             length1++;
             pos2++;
             if (pos2 == length2) break;
             s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
         }
     }
     // x1 ran out first: everything left in x2 is appended.
     if (pos1 == length1) {
         ra_append_copy_range(&x1->high_low_container, &x2->high_low_container,
                              pos2, length2, is_cow(x2));
     }
 }

 /**
  * Computes the symmetric difference (xor) of x1 and x2 and returns it as a
  * newly created bitmap.
  *
  * If either input has no containers, a copy of the other input is returned.
  * Otherwise the two container arrays are walked in parallel, ordered by their
  * 16-bit keys, and the result is built up by appending to `answer`.
  *
  * NOTE(review): although x1 and x2 are const-qualified, when an input is
  * copy-on-write its container slot is re-set through
  * ra_set_container_at_index() after get_copy_of_container(); presumably the
  * container becomes shared at that point -- confirm against those helpers.
  */
 roaring_bitmap_t *roaring_bitmap_xor(const roaring_bitmap_t *x1,
                                      const roaring_bitmap_t *x2) {
     uint8_t container_result_type = 0;
     const int length1 = x1->high_low_container.size,
               length2 = x2->high_low_container.size;
     if (0 == length1) {
         return roaring_bitmap_copy(x2);
     }
     if (0 == length2) {
         return roaring_bitmap_copy(x1);
     }
     roaring_bitmap_t *answer =
         roaring_bitmap_create_with_capacity(length1 + length2);
     // the result is copy-on-write only when both inputs are
     roaring_bitmap_set_copy_on_write(answer, is_cow(x1) && is_cow(x2));
     int pos1 = 0, pos2 = 0;
     uint8_t container_type_1, container_type_2;
     uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
     uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
     while (true) {
         if (s1 == s2) {
             // key present in both inputs: xor the two containers
             void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
                                                  &container_type_1);
             void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
                                                  &container_type_2);
             void *c = container_xor(c1, container_type_1, c2, container_type_2,
                                     &container_result_type);

             // keep the result only if it is non-empty, otherwise release it
             if (container_nonzero_cardinality(c, container_result_type)) {
                 ra_append(&answer->high_low_container, s1, c,
                           container_result_type);
             } else {
                 container_free(c, container_result_type);
             }
             ++pos1;
             ++pos2;
             if (pos1 == length1) break;
             if (pos2 == length2) break;
             s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
             s2 = ra_get_key_at_index(&x2->high_low_container, pos2);

         } else if (s1 < s2) {  // s1 < s2
             // key only in x1: the result receives a copy of x1's container
             void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
                                                  &container_type_1);
             c1 =
                 get_copy_of_container(c1, &container_type_1, is_cow(x1));
             if (is_cow(x1)) {
                 ra_set_container_at_index(&x1->high_low_container, pos1, c1,
                                           container_type_1);
             }
             ra_append(&answer->high_low_container, s1, c1, container_type_1);
             pos1++;
             if (pos1 == length1) break;
             s1 = ra_get_key_at_index(&x1->high_low_container, pos1);

         } else {  // s1 > s2
             // key only in x2: the result receives a copy of x2's container
             void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
                                                  &container_type_2);
             c2 =
                 get_copy_of_container(c2, &container_type_2, is_cow(x2));
             if (is_cow(x2)) {
                 ra_set_container_at_index(&x2->high_low_container, pos2, c2,
                                           container_type_2);
             }
             ra_append(&answer->high_low_container, s2, c2, container_type_2);
             pos2++;
             if (pos2 == length2) break;
             s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
         }
     }
     // one input is exhausted: append whatever remains of the other one
     if (pos1 == length1) {
         ra_append_copy_range(&answer->high_low_container,
                              &x2->high_low_container, pos2, length2,
                              is_cow(x2));
     } else if (pos2 == length2) {
         ra_append_copy_range(&answer->high_low_container,
                              &x1->high_low_container, pos1, length1,
                              is_cow(x1));
     }
     return answer;
 }

 // inplace xor (modifies its first argument).
 //
 // x1 and x2 must be distinct objects (asserted). If x2 has no containers
 // x1 is left untouched; if x1 has no containers it is overwritten with x2.
 // Otherwise both container arrays are walked in key order and x1 is
 // edited as the walk proceeds.

 void roaring_bitmap_xor_inplace(roaring_bitmap_t *x1,
                                 const roaring_bitmap_t *x2) {
     assert(x1 != x2);
     uint8_t container_result_type = 0;
     // length1 is not const: it tracks insertions into / removals from x1
     int length1 = x1->high_low_container.size;
     const int length2 = x2->high_low_container.size;

     if (0 == length2) return;

     if (0 == length1) {
         roaring_bitmap_overwrite(x1, x2);
         return;
     }

     // XOR can have new containers inserted from x2, but can also
     // lose containers when x1 and x2 are nonempty and identical.

     int pos1 = 0, pos2 = 0;
     uint8_t container_type_1, container_type_2;
     uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
     uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
     while (true) {
         if (s1 == s2) {
             // key present in both: xor x2's container into x1's container
             void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
                                                  &container_type_1);
             c1 = get_writable_copy_if_shared(c1, &container_type_1);

             void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
                                                  &container_type_2);
             void *c = container_ixor(c1, container_type_1, c2, container_type_2,
                                      &container_result_type);

             if (container_nonzero_cardinality(c, container_result_type)) {
                 ra_set_container_at_index(&x1->high_low_container, pos1, c,
                                           container_result_type);
                 ++pos1;
             } else {
                 // empty result: drop the entry from x1; pos1 is not advanced
                 // and length1 shrinks by one
                 container_free(c, container_result_type);
                 ra_remove_at_index(&x1->high_low_container, pos1);
                 --length1;
             }

             ++pos2;
             if (pos1 == length1) break;
             if (pos2 == length2) break;
             s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
             s2 = ra_get_key_at_index(&x2->high_low_container, pos2);

         } else if (s1 < s2) {  // s1 < s2
             // key only in x1: its container is kept unchanged
             pos1++;
             if (pos1 == length1) break;
             s1 = ra_get_key_at_index(&x1->high_low_container, pos1);

         } else {  // s1 > s2
             // key only in x2: insert a copy of x2's container into x1 at pos1
             void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
                                                  &container_type_2);
             c2 =
                 get_copy_of_container(c2, &container_type_2, is_cow(x2));
             if (is_cow(x2)) {
                 ra_set_container_at_index(&x2->high_low_container, pos2, c2,
                                           container_type_2);
             }

             ra_insert_new_key_value_at(&x1->high_low_container, pos1, s2, c2,
                                        container_type_2);
             // step past the entry just inserted and account for the growth
             pos1++;
             length1++;
             pos2++;
             if (pos2 == length2) break;
             s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
         }
     }
     // x1 exhausted: append the remaining range [pos2, length2) of x2
     if (pos1 == length1) {
         ra_append_copy_range(&x1->high_low_container, &x2->high_low_container,
                              pos2, length2, is_cow(x2));
     }
 }

 /**
  * Computes the difference x1 \ x2 (andnot) and returns it as a newly created
  * bitmap.
  *
  * If x1 has no containers an empty bitmap is returned; if x2 has no
  * containers a copy of x1 is returned. In the general case the result is
  * created with capacity length1 and filled while walking both container
  * arrays in key order.
  */
 roaring_bitmap_t *roaring_bitmap_andnot(const roaring_bitmap_t *x1,
                                         const roaring_bitmap_t *x2) {
     uint8_t container_result_type = 0;
     const int length1 = x1->high_low_container.size,
               length2 = x2->high_low_container.size;
     if (0 == length1) {
         roaring_bitmap_t *empty_bitmap = roaring_bitmap_create();
         roaring_bitmap_set_copy_on_write(empty_bitmap, is_cow(x1) && is_cow(x2));
         return empty_bitmap;
     }
     if (0 == length2) {
         return roaring_bitmap_copy(x1);
     }
     roaring_bitmap_t *answer = roaring_bitmap_create_with_capacity(length1);
     roaring_bitmap_set_copy_on_write(answer, is_cow(x1) && is_cow(x2));

     int pos1 = 0, pos2 = 0;
     uint8_t container_type_1, container_type_2;
     uint16_t s1 = 0;
     uint16_t s2 = 0;
     while (true) {
         // keys are re-read at the top of every iteration, so the branches
         // below only need to update pos1 / pos2
         s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
         s2 = ra_get_key_at_index(&x2->high_low_container, pos2);

         if (s1 == s2) {
             // key present in both: subtract x2's container from x1's
             void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
                                                  &container_type_1);
             void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
                                                  &container_type_2);
             void *c =
                 container_andnot(c1, container_type_1, c2, container_type_2,
                                  &container_result_type);

             // keep the result only if it is non-empty, otherwise release it
             if (container_nonzero_cardinality(c, container_result_type)) {
                 ra_append(&answer->high_low_container, s1, c,
                           container_result_type);
             } else {
                 container_free(c, container_result_type);
             }
             ++pos1;
             ++pos2;
             if (pos1 == length1) break;
             if (pos2 == length2) break;
         } else if (s1 < s2) {  // s1 < s2
             // keys only in x1: copy the range [pos1, next_pos1) to the result
             // (presumably ra_advance_until returns the first index whose key
             // is >= s2 -- confirm against its definition)
             const int next_pos1 =
                 ra_advance_until(&x1->high_low_container, s2, pos1);
             ra_append_copy_range(&answer->high_low_container,
                                  &x1->high_low_container, pos1, next_pos1,
                                  is_cow(x1));
             // TODO : perhaps some of the copy_on_write should be based on
             // answer rather than x1 (more stringent?).  Many similar cases
             pos1 = next_pos1;
             if (pos1 == length1) break;
         } else {  // s1 > s2
             // keys only in x2 contribute nothing: skip ahead in x2
             pos2 = ra_advance_until(&x2->high_low_container, s1, pos2);
             if (pos2 == length2) break;
         }
     }
     // x2 exhausted: everything left in x1 belongs to the result
     if (pos2 == length2) {
         ra_append_copy_range(&answer->high_low_container,
                              &x1->high_low_container, pos1, length1,
                              is_cow(x1));
     }
     return answer;
 }

 // inplace andnot (modifies its first argument).
 //
 // x1 and x2 must be distinct objects (asserted). Surviving containers of
 // x1 are written back towards the front of x1's container array at index
 // intersection_size, and the array is finally resized to that count via
 // ra_downsize().

 void roaring_bitmap_andnot_inplace(roaring_bitmap_t *x1,
                                    const roaring_bitmap_t *x2) {
     assert(x1 != x2);

     uint8_t container_result_type = 0;
     int length1 = x1->high_low_container.size;
     const int length2 = x2->high_low_container.size;
     // number of entries kept so far; also the next write position in x1
     int intersection_size = 0;

     if (0 == length2) return;

     if (0 == length1) {
         roaring_bitmap_clear(x1);
         return;
     }

     int pos1 = 0, pos2 = 0;
     uint8_t container_type_1, container_type_2;
     uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
     uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
     while (true) {
         if (s1 == s2) {
             // key present in both: subtract x2's container from x1's
             void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
                                                  &container_type_1);
             c1 = get_writable_copy_if_shared(c1, &container_type_1);

             void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
                                                  &container_type_2);
             void *c =
                 container_iandnot(c1, container_type_1, c2, container_type_2,
                                   &container_result_type);

             // a non-empty result is stored at the write position; an empty
             // one is released and its slot is not counted
             if (container_nonzero_cardinality(c, container_result_type)) {
                 ra_replace_key_and_container_at_index(&x1->high_low_container,
                                                       intersection_size++, s1,
                                                       c, container_result_type);
             } else {
                 container_free(c, container_result_type);
             }

             ++pos1;
             ++pos2;
             if (pos1 == length1) break;
             if (pos2 == length2) break;
             s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
             s2 = ra_get_key_at_index(&x2->high_low_container, pos2);

         } else if (s1 < s2) {  // s1 < s2
             // key only in x1: keep the container, moving it down to the
             // write position when earlier entries have been dropped
             if (pos1 != intersection_size) {
                 void *c1 = ra_get_container_at_index(&x1->high_low_container,
                                                      pos1, &container_type_1);

                 ra_replace_key_and_container_at_index(&x1->high_low_container,
                                                       intersection_size, s1, c1,
                                                       container_type_1);
             }
             intersection_size++;
             pos1++;
             if (pos1 == length1) break;
             s1 = ra_get_key_at_index(&x1->high_low_container, pos1);

         } else {  // s1 > s2
             // key only in x2 removes nothing from x1: skip ahead in x2
             pos2 = ra_advance_until(&x2->high_low_container, s1, pos2);
             if (pos2 == length2) break;
             s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
         }
     }

     if (pos1 < length1) {
         // all containers between intersection_size and
         // pos1 are junk.  However, they have either been moved
         // (thus still referenced) or involved in an iandnot
         // that will clean up all containers that could not be reused.
         // Thus we should not free the junk containers between
         // intersection_size and pos1.
         if (pos1 > intersection_size) {
             // left slide of remaining items
             ra_copy_range(&x1->high_low_container, pos1, length1,
                           intersection_size);
         }
         // else current placement is fine
         intersection_size += (length1 - pos1);
     }
     ra_downsize(&x1->high_low_container, intersection_size);
 }

 /**
  * Returns the number of values stored in the bitmap, obtained by summing the
  * cardinality of every container.
  */
 uint64_t roaring_bitmap_get_cardinality(const roaring_bitmap_t *ra) {
     uint64_t total = 0;
     int idx = 0;
     while (idx < ra->high_low_container.size) {
         total += container_get_cardinality(
             ra->high_low_container.containers[idx],
             ra->high_low_container.typecodes[idx]);
         idx++;
     }
     return total;
 }

 /**
  * Counts the values of the bitmap that fall in the half-open interval
  * [range_start, range_end).
  *
  * range_end is clamped to UINT32_MAX + 1; an empty interval yields 0.
  */
 uint64_t roaring_bitmap_range_cardinality(const roaring_bitmap_t *ra,
                                           uint64_t range_start,
                                           uint64_t range_end) {
     if (range_end > UINT32_MAX) {
         range_end = UINT32_MAX + UINT64_C(1);
     }
     if (range_start >= range_end) {
         return 0;
     }
     // switch to an inclusive upper bound, so that from here on
     // 0 <= range_start <= range_end <= UINT32_MAX
     range_end--;

     // split both bounds into container key (high 16 bits) and low 16 bits
     const uint16_t first_key = range_start >> 16;
     const uint16_t last_key = range_end >> 16;
     const uint16_t start_low = range_start & 0xffff;
     const uint16_t end_low = range_end & 0xffff;

     uint64_t count = 0;

     int idx = ra_get_index(&ra->high_low_container, first_key);
     if (idx < 0) {
         // no container for first_key: decode the insertion position
         idx = -idx - 1;
     } else {
         // container holding the start of the range
         if (first_key != last_key) {
             count += container_get_cardinality(
                 ra->high_low_container.containers[idx],
                 ra->high_low_container.typecodes[idx]);
         } else {
             count += container_rank(ra->high_low_container.containers[idx],
                                     ra->high_low_container.typecodes[idx],
                                     end_low);
         }
         // discard the values located strictly before range_start
         if (start_low != 0) {
             count -= container_rank(ra->high_low_container.containers[idx],
                                     ra->high_low_container.typecodes[idx],
                                     start_low - 1);
         }
         idx++;
     }

     // remaining containers up to and including the one for last_key
     while (idx < ra->high_low_container.size) {
         const uint16_t key = ra->high_low_container.keys[idx];
         if (key > last_key) {
             break;
         }
         if (key == last_key) {
             count += container_rank(ra->high_low_container.containers[idx],
                                     ra->high_low_container.typecodes[idx],
                                     end_low);
             break;
         }
         count += container_get_cardinality(
             ra->high_low_container.containers[idx],
             ra->high_low_container.typecodes[idx]);
         idx++;
     }

     return count;
 }


 /** Returns true when the bitmap holds no container at all. */
 bool roaring_bitmap_is_empty(const roaring_bitmap_t *ra) {
     const bool no_containers = (0 == ra->high_low_container.size);
     return no_containers;
 }

 /**
  * Writes the content of the bitmap to `ans` by delegating to
  * ra_to_uint32_array() on the bitmap's container array.
  * NOTE(review): the caller presumably has to provide room for
  * roaring_bitmap_get_cardinality(ra) values -- confirm against the header.
  */
 void roaring_bitmap_to_uint32_array(const roaring_bitmap_t *ra, uint32_t *ans) {
     ra_to_uint32_array(&ra->high_low_container, ans);
 }

 /**
  * Forwards `offset`, `limit` and `ans` to ra_range_uint32_array() on the
  * bitmap's container array and returns its result.
  */
 bool roaring_bitmap_range_uint32_array(const roaring_bitmap_t *ra, size_t offset, size_t limit,  uint32_t *ans) {
     const bool ok =
         ra_range_uint32_array(&ra->high_low_container, offset, limit, ans);
     return ok;
 }

 /**
  * Converts array and bitmap containers to run containers when that is more
  * efficient, and converts run containers back when that saves space.
  * Returns true if the result has at least one run container.
  */
 bool roaring_bitmap_run_optimize(roaring_bitmap_t *r) {
     bool has_run_container = false;
     int idx = 0;
     while (idx < r->high_low_container.size) {
         uint8_t type_before, type_after;
         // TODO: this introduces extra cloning!
         ra_unshare_container_at_index(&r->high_low_container, idx);
         void *current = ra_get_container_at_index(&r->high_low_container, idx,
                                                   &type_before);
         void *optimized =
             convert_run_optimize(current, type_before, &type_after);
         ra_set_container_at_index(&r->high_low_container, idx, optimized,
                                   type_after);
         if (type_after == RUN_CONTAINER_TYPE_CODE) {
             has_run_container = true;
         }
         idx++;
     }
     return has_run_container;
 }

 /**
  * Calls container_shrink_to_fit() on every container and then
  * ra_shrink_to_fit() on the container array; returns the sum of all the
  * values those calls report.
  */
 size_t roaring_bitmap_shrink_to_fit(roaring_bitmap_t *r) {
     size_t total = 0;
     int idx = 0;
     while (idx < r->high_low_container.size) {
         uint8_t typecode;
         void *container =
             ra_get_container_at_index(&r->high_low_container, idx, &typecode);
         total += container_shrink_to_fit(container, typecode);
         idx++;
     }
     total += ra_shrink_to_fit(&r->high_low_container);
     return total;
 }

 /**
  *  Replaces every run container by a bitset or array container, even when
  *  run-length encoding is more space efficient.
  *  Returns whether a change was applied.
  */
 bool roaring_bitmap_remove_run_compression(roaring_bitmap_t *r) {
     bool changed = false;
     for (int idx = 0; idx < r->high_low_container.size; idx++) {
         uint8_t type_before, type_after;
         void *stored = ra_get_container_at_index(&r->high_low_container, idx,
                                                  &type_before);
         if (get_container_type(stored, type_before) !=
             RUN_CONTAINER_TYPE_CODE) {
             continue;  // not a run container (directly or through sharing)
         }
         changed = true;

         const bool is_shared = (type_before == SHARED_CONTAINER_TYPE_CODE);
         // locate the actual run container, looking through a shared wrapper
         run_container_t *run =
             is_shared
                 ? (run_container_t *)((shared_container_t *)stored)->container
                 : (run_container_t *)stored;

         int32_t card = run_container_cardinality(run);
         void *converted =
             convert_to_bitset_or_array_container(run, card, &type_after);

         if (is_shared) {
             // will free the run container as needed
             shared_container_free((shared_container_t *)stored);
         } else {
             run_container_free(run);
         }
         ra_set_container_at_index(&r->high_low_container, idx, converted,
                                   type_after);
     }
     return changed;
 }

 /**
  * Serializes the bitmap into `buf` and returns the number of bytes written.
  *
  * The first byte is a format tag. When the portable format is strictly
  * smaller, the tag is SERIALIZATION_CONTAINER and the portable serialization
  * follows. Otherwise the tag is SERIALIZATION_ARRAY_UINT32, followed by a
  * 32-bit element count and the values as an array of uint32_t.
  *
  * The caller provides the buffer; roaring_bitmap_size_in_bytes() computes the
  * same size as the value returned here.
  */
 size_t roaring_bitmap_serialize(const roaring_bitmap_t *ra, char *buf) {
     size_t portablesize = roaring_bitmap_portable_size_in_bytes(ra);
     uint64_t cardinality = roaring_bitmap_get_cardinality(ra);
     uint64_t sizeasarray = cardinality * sizeof(uint32_t) + sizeof(uint32_t);
     if (portablesize < sizeasarray) {
         buf[0] = SERIALIZATION_CONTAINER;
         return roaring_bitmap_portable_serialize(ra, buf + 1) + 1;
     }

     buf[0] = SERIALIZATION_ARRAY_UINT32;
     // Narrow to 32 bits before copying: copying the first four bytes of the
     // 64-bit counter only yields the low half on little-endian hosts.
     // NOTE(review): this branch requires sizeasarray <= portablesize, so the
     // count is expected to fit in 32 bits.
     const uint32_t card32 = (uint32_t)cardinality;
     memcpy(buf + 1, &card32, sizeof(card32));
     roaring_bitmap_to_uint32_array(
         ra, (uint32_t *)(buf + 1 + sizeof(uint32_t)));
     return 1 + (size_t)sizeasarray;
 }

 /**
  * Returns the number of bytes roaring_bitmap_serialize() needs: one tag byte
  * plus the smaller of the portable size and the plain uint32 array size
  * (the array form is used on a tie).
  */
 size_t roaring_bitmap_size_in_bytes(const roaring_bitmap_t *ra) {
     const size_t portable_bytes = roaring_bitmap_portable_size_in_bytes(ra);
     const uint64_t array_bytes =
         roaring_bitmap_get_cardinality(ra) * sizeof(uint32_t) +
         sizeof(uint32_t);
     if (portable_bytes < array_bytes) {
         return portable_bytes + 1;
     }
     return (size_t)array_bytes + 1;
 }

 /**
  * Returns the size in bytes of the portable serialization, as computed by
  * ra_portable_size_in_bytes() on the bitmap's container array.
  */
 size_t roaring_bitmap_portable_size_in_bytes(const roaring_bitmap_t *ra) {
     const size_t nbytes = ra_portable_size_in_bytes(&ra->high_low_container);
     return nbytes;
 }


 /**
  * Reads a bitmap in the portable format from `buf`, passing `maxbytes` to
  * ra_portable_deserialize() as the read limit.
  *
  * Returns a newly allocated bitmap with copy-on-write disabled, or NULL when
  * the allocation or the deserialization fails.
  */
 roaring_bitmap_t *roaring_bitmap_portable_deserialize_safe(const char *buf, size_t maxbytes) {
     roaring_bitmap_t *bitmap =
         (roaring_bitmap_t *)malloc(sizeof(roaring_bitmap_t));
     if (bitmap == NULL) {
         return NULL;
     }
     size_t consumed;
     const bool parsed = ra_portable_deserialize(&bitmap->high_low_container,
                                                 buf, maxbytes, &consumed);
     if (!parsed) {
         free(bitmap);
         return NULL;
     }
     assert(consumed <= maxbytes);
     roaring_bitmap_set_copy_on_write(bitmap, false);
     return bitmap;
 }

 /**
  * Same as roaring_bitmap_portable_deserialize_safe() but with SIZE_MAX as
  * the byte limit, i.e. without an effective bound on how much of `buf` is
  * read.
  */
 roaring_bitmap_t *roaring_bitmap_portable_deserialize(const char *buf) {
     roaring_bitmap_t *bitmap =
         roaring_bitmap_portable_deserialize_safe(buf, SIZE_MAX);
     return bitmap;
 }


 /**
  * Forwards `buf` and `maxbytes` to ra_portable_deserialize_size() and
  * returns its result.
  */
 size_t roaring_bitmap_portable_deserialize_size(const char *buf, size_t maxbytes) {
     const size_t nbytes = ra_portable_deserialize_size(buf, maxbytes);
     return nbytes;
 }


 /**
  * Writes the bitmap to `buf` in the portable format by delegating to
  * ra_portable_serialize() and returns that function's result.
  */
 size_t roaring_bitmap_portable_serialize(const roaring_bitmap_t *ra,
                                          char *buf) {
     const size_t written = ra_portable_serialize(&ra->high_low_container, buf);
     return written;
 }

 /**
  * Rebuilds a bitmap from a buffer whose first byte is a format tag.
  *
  * SERIALIZATION_ARRAY_UINT32: a 32-bit count followed by that many uint32_t
  * values. SERIALIZATION_CONTAINER: the portable format starting at the
  * second byte. Any other tag yields NULL.
  *
  * NOTE(review): the value array begins 5 bytes into `buf` and is accessed
  * through a uint32_t pointer; alignment is not guaranteed -- confirm this is
  * acceptable on the supported targets.
  */
 roaring_bitmap_t *roaring_bitmap_deserialize(const void *buf) {
     const char *bytes = (const char *)buf;

     if (*(const unsigned char *)buf == SERIALIZATION_ARRAY_UINT32) {
         /* This looks like a compressed set of uint32_t elements */
         uint32_t count;
         memcpy(&count, bytes + 1, sizeof(uint32_t));
         const uint32_t *values =
             (const uint32_t *)(bytes + 1 + sizeof(uint32_t));
         return roaring_bitmap_of_ptr(count, values);
     }

     if (bytes[0] == SERIALIZATION_CONTAINER) {
         return roaring_bitmap_portable_deserialize(bytes + 1);
     }

     return (NULL);
 }

 /**
  * Runs container_iterate() over every container in index order, passing
  * the container key shifted into the upper 16 bits together with `iterator`
  * and `ptr`.
  *
  * Stops and returns false as soon as one container_iterate() call returns
  * false; returns true otherwise.
  */
 bool roaring_iterate(const roaring_bitmap_t *ra, roaring_iterator iterator,
                      void *ptr) {
     int idx = 0;
     while (idx < ra->high_low_container.size) {
         const uint32_t base =
             ((uint32_t)ra->high_low_container.keys[idx]) << 16;
         const bool keep_going =
             container_iterate(ra->high_low_container.containers[idx],
                               ra->high_low_container.typecodes[idx], base,
                               iterator, ptr);
         if (!keep_going) {
             return false;
         }
         ++idx;
     }
     return true;
 }

 /**
  * Variant of roaring_iterate() built on container_iterate64(): `high_bits`
  * is forwarded unchanged along with the 32-bit base of each container.
  *
  * Stops and returns false as soon as one container_iterate64() call returns
  * false; returns true otherwise.
  */
 bool roaring_iterate64(const roaring_bitmap_t *ra, roaring_iterator64 iterator,
                        uint64_t high_bits, void *ptr) {
     int idx = 0;
     while (idx < ra->high_low_container.size) {
         const uint32_t base =
             ((uint32_t)ra->high_low_container.keys[idx]) << 16;
         const bool keep_going =
             container_iterate64(ra->high_low_container.containers[idx],
                                 ra->high_low_container.typecodes[idx], base,
                                 iterator, high_bits, ptr);
         if (!keep_going) {
             return false;
         }
         ++idx;
     }
     return true;
 }

 /****
 * begin roaring_uint32_iterator_t
 *****/

 // Partially initializes the roaring iterator when it begins looking at
 // a new container: resets the in-container cursors and caches container,
 // typecode and high bits for newit->container_index. When that index is
 // outside the container array, current_value becomes UINT32_MAX and
 // has_value becomes false. Returns the new has_value.
 static bool iter_new_container_partial_init(roaring_uint32_iterator_t *newit) {
     newit->in_container_index = 0;
     newit->run_index = 0;
     newit->current_value = 0;

     const bool out_of_range =
         (newit->container_index < 0) ||
         (newit->container_index >= newit->parent->high_low_container.size);
     if (out_of_range) {
         newit->current_value = UINT32_MAX;
         newit->has_value = false;
         return false;
     }

     // assume not empty
     newit->has_value = true;
     // Container, typecode and highbits are precomputed so that successive
     // iterator calls do not have to fetch them from the parent again nor
     // repeat the container_unwrap_shared call.
     newit->typecode =
         newit->parent->high_low_container.typecodes[newit->container_index];
     newit->highbits =
         ((uint32_t)
              newit->parent->high_low_container.keys[newit->container_index])
         << 16;
     newit->container = container_unwrap_shared(
         newit->parent->high_low_container.containers[newit->container_index],
         &(newit->typecode));
     return true;
 }

 // Positions the iterator on the smallest value of the container designated
 // by newit->container_index. Returns false when there is no such container,
 // true otherwise. The container is assumed to be non-empty.
 static bool loadfirstvalue(roaring_uint32_iterator_t *newit) {
     if (!iter_new_container_partial_init(newit)) {
         return newit->has_value;
     }

     switch (newit->typecode) {
         case BITSET_CONTAINER_TYPE_CODE: {
             const bitset_container_t *bits =
                 (const bitset_container_t *)(newit->container);
             // skip leading all-zero words
             uint32_t w = 0;
             while (bits->array[w] == 0) {
                 w++;
             }
             // bits->array[w] is non-zero here
             newit->in_container_index =
                 w * 64 + __builtin_ctzll(bits->array[w]);
             newit->current_value = newit->highbits | newit->in_container_index;
             break;
         }
         case ARRAY_CONTAINER_TYPE_CODE: {
             const array_container_t *values =
                 (const array_container_t *)(newit->container);
             newit->current_value = newit->highbits | values->array[0];
             break;
         }
         case RUN_CONTAINER_TYPE_CODE: {
             const run_container_t *run_set =
                 (const run_container_t *)(newit->container);
             newit->current_value = newit->highbits | (run_set->runs[0].value);
             break;
         }
         default:
             // if this ever happens, bug!
             assert(false);
     }  // switch (typecode)
     return true;
 }

 // Positions the iterator on the largest value of the container designated
 // by newit->container_index. Returns false when there is no such container,
 // true otherwise. The container is assumed to be non-empty.
 static bool loadlastvalue(roaring_uint32_iterator_t* newit) {
     if (!iter_new_container_partial_init(newit)) {
         return newit->has_value;
     }

     switch (newit->typecode) {
         case BITSET_CONTAINER_TYPE_CODE: {
             const bitset_container_t *bits =
                 (const bitset_container_t *)newit->container;
             // scan backwards for the last non-zero word
             uint32_t w = BITSET_CONTAINER_SIZE_IN_WORDS - 1;
             uint64_t last_word = bits->array[w];
             while (last_word == 0) {
                 --w;
                 last_word = bits->array[w];
             }

             int leading_zeros = __builtin_clzll(last_word);
             newit->in_container_index = (w * 64) + (63 - leading_zeros);
             newit->current_value = newit->highbits | newit->in_container_index;
             break;
         }
         case ARRAY_CONTAINER_TYPE_CODE: {
             const array_container_t *values =
                 (const array_container_t *)newit->container;
             newit->in_container_index = values->cardinality - 1;
             newit->current_value =
                 newit->highbits | values->array[newit->in_container_index];
             break;
         }
         case RUN_CONTAINER_TYPE_CODE: {
             const run_container_t *run_set =
                 (const run_container_t *)newit->container;
             newit->run_index = run_set->n_runs - 1;
             const rle16_t *tail = &run_set->runs[newit->run_index];
             // value + length is used as the last value covered by the run
             newit->current_value =
                 newit->highbits | (tail->value + tail->length);
             break;
         }
         default:
             // if this ever happens, bug!
             assert(false);
     }
     return true;
 }

 // Positions the iterator on the first value >= val inside the container at
 // newit->container_index.
 // prerequisite: the value should be in range of the container
 static bool loadfirstvalue_largeorequal(roaring_uint32_iterator_t *newit, uint32_t val) {
     // Don't have to check return value because of prerequisite
     iter_new_container_partial_init(newit);
     const uint16_t low = val & 0xFFFF;

     switch (newit->typecode) {
         case BITSET_CONTAINER_TYPE_CODE: {
             const bitset_container_t *bits =
                 (const bitset_container_t *)(newit->container);
             newit->in_container_index =
                 bitset_container_index_equalorlarger(bits, low);
             newit->current_value = newit->highbits | newit->in_container_index;
             break;
         }
         case ARRAY_CONTAINER_TYPE_CODE: {
             const array_container_t *values =
                 (const array_container_t *)(newit->container);
             newit->in_container_index =
                 array_container_index_equalorlarger(values, low);
             newit->current_value =
                 newit->highbits | values->array[newit->in_container_index];
             break;
         }
         case RUN_CONTAINER_TYPE_CODE: {
             const run_container_t *run_set =
                 (const run_container_t *)(newit->container);
             newit->run_index = run_container_index_equalorlarger(run_set, low);
             if (run_set->runs[newit->run_index].value > low) {
                 // the run starts after val: take the start of the run
                 newit->current_value =
                     newit->highbits | (run_set->runs[newit->run_index].value);
             } else {
                 // the run starts at or before val: val itself is used
                 newit->current_value = val;
             }
             break;
         }
         default:
             // if this ever happens, bug!
             assert(false);
     }  // switch (typecode)
     return true;
 }

 /**
  * Initializes `newit` so that it points at the first value of `ra`, starting
  * from container index 0; has_value is false when no value could be loaded.
  */
 void roaring_init_iterator(const roaring_bitmap_t *ra,
                            roaring_uint32_iterator_t *newit) {
     newit->parent = ra;
     newit->container_index = 0;
     const bool found = loadfirstvalue(newit);
     newit->has_value = found;
 }

 /**
  * Initializes `newit` so that it points at the last value of `ra`, starting
  * from the last container index; has_value is false when no value could be
  * loaded.
  */
 void roaring_init_iterator_last(const roaring_bitmap_t *ra,
                                 roaring_uint32_iterator_t *newit) {
     newit->parent = ra;
     newit->container_index = ra->high_low_container.size - 1;
     const bool found = loadlastvalue(newit);
     newit->has_value = found;
 }

 /**
  * Allocates an iterator with malloc() and initializes it with
  * roaring_init_iterator(). Returns NULL when the allocation fails.
  */
 roaring_uint32_iterator_t *roaring_create_iterator(const roaring_bitmap_t *ra) {
     roaring_uint32_iterator_t *it =
         (roaring_uint32_iterator_t *)malloc(sizeof(*it));
     if (it != NULL) {
         roaring_init_iterator(ra, it);
     }
     return it;
 }

 /**
  * Creates a byte-for-byte copy of the iterator `it` in memory obtained from
  * malloc().
  *
  * Returns the copy, or NULL when the allocation fails (the same convention
  * as roaring_create_iterator()).
  */
 roaring_uint32_iterator_t *roaring_copy_uint32_iterator(
     const roaring_uint32_iterator_t *it) {
     roaring_uint32_iterator_t *newit =
         (roaring_uint32_iterator_t *)malloc(sizeof(roaring_uint32_iterator_t));
     // never hand a failed allocation to memcpy
     if (newit == NULL) return NULL;
     memcpy(newit, it, sizeof(roaring_uint32_iterator_t));
     return newit;
 }

 /**
  * Moves the iterator to the first value that is >= val.
  * Returns (and stores in it->has_value) whether such a value was found.
  */
 bool roaring_move_uint32_iterator_equalorlarger(roaring_uint32_iterator_t *it, uint32_t val) {
     const uint16_t high = val >> 16;
     const int idx = ra_get_index(&it->parent->high_low_container, high);

     if (idx < 0) {
         // there is no matching container, so we are going for the next one
         it->container_index = -idx - 1;
     } else {
         const uint32_t container_max = container_maximum(
             it->parent->high_low_container.containers[idx],
             it->parent->high_low_container.typecodes[idx]);
         const uint16_t low = val & 0xFFFF;
         if (container_max >= low) {
             // the value is necessarily within the range of the container
             it->container_index = idx;
             it->has_value = loadfirstvalue_largeorequal(it, val);
             return it->has_value;
         }
         // will have to load first value of next container
         it->container_index = idx + 1;
     }

     it->has_value = loadfirstvalue(it);
     return it->has_value;
 }


 /*
  * Moves `it` to the next value and returns whether one exists; the same
  * result is stored in it->has_value.
  *
  * - container_index at or past the number of containers: the iterator is
  *   reported as exhausted.
  * - container_index negative: iteration restarts from container 0.
  * - otherwise the step is attempted inside the current container, and when
  *   that container has nothing further, the next container is loaded with
  *   loadfirstvalue().
  */
 bool roaring_advance_uint32_iterator(roaring_uint32_iterator_t *it) {
     if (it->container_index >= it->parent->high_low_container.size) {
         return (it->has_value = false);
     }
     if (it->container_index < 0) {
         it->container_index = 0;
         return (it->has_value = loadfirstvalue(it));
     }

     uint32_t wordindex;  // used for bitsets
     uint64_t word;       // used for bitsets
     switch (it->typecode) {
         case BITSET_CONTAINER_TYPE_CODE:
             it->in_container_index++;
             wordindex = it->in_container_index / 64;
             // stepped past the last word: nothing left in this container
             if (wordindex >= BITSET_CONTAINER_SIZE_IN_WORDS) break;
             // keep only the bits at or above the new position in this word
             word = ((const bitset_container_t *)(it->container))
                        ->array[wordindex] &
                    (UINT64_MAX << (it->in_container_index % 64));
             // skip forward over all-zero words
             // next part could be optimized/simplified
             while ((word == 0) &&
                    (wordindex + 1 < BITSET_CONTAINER_SIZE_IN_WORDS)) {
                 wordindex++;
                 word = ((const bitset_container_t *)(it->container))
                            ->array[wordindex];
             }
             if (word != 0) {
                 // lowest set bit of the word is the next value
                 it->in_container_index = wordindex * 64 + __builtin_ctzll(word);
                 it->current_value = it->highbits | it->in_container_index;
                 return (it->has_value = true);
             }
             break;
         case ARRAY_CONTAINER_TYPE_CODE:
             // in_container_index is a position in the container's array
             it->in_container_index++;
             if (it->in_container_index <
                 ((const array_container_t *)(it->container))->cardinality) {
                 it->current_value = it->highbits |
                                     ((const array_container_t *)(it->container))
                                         ->array[it->in_container_index];
                 return (it->has_value = true);
             }
             break;
         case RUN_CONTAINER_TYPE_CODE: {
             if(it->current_value == UINT32_MAX) {
                 return (it->has_value = false); // without this, we risk an overflow to zero
             }

             const run_container_t* run_container = (const run_container_t*)it->container;
             // value + length is the last value covered by the current run
             if (++it->current_value <= (it->highbits | (run_container->runs[it->run_index].value +
                                                         run_container->runs[it->run_index].length))) {
                 return (it->has_value = true);
             }

             // current run is used up: jump to the start of the next run
             if (++it->run_index < run_container->n_runs) {
                 // Assume the run has a value
                 it->current_value = it->highbits | run_container->runs[it->run_index].value;
                 return (it->has_value = true);
             }
             break;
         }
         default:
             // if this ever happens, bug!
             assert(false);
     }  // switch (typecode)
     // moving to next container
     it->container_index++;
     return (it->has_value = loadfirstvalue(it));
 }

 /*
  * Moves `it` to the previous value and returns whether one exists; the same
  * result is stored in it->has_value.
  *
  * - container_index negative: the iterator is reported as exhausted.
  * - container_index at or past the number of containers: iteration restarts
  *   from the last container via loadlastvalue().
  * - otherwise the step is attempted inside the current container, and when
  *   that container has nothing earlier, the previous container is loaded
  *   with loadlastvalue().
  */
 bool roaring_previous_uint32_iterator(roaring_uint32_iterator_t *it) {
     if (it->container_index < 0) {
         return (it->has_value = false);
     }
     if (it->container_index >= it->parent->high_low_container.size) {
         it->container_index = it->parent->high_low_container.size - 1;
         return (it->has_value = loadlastvalue(it));
     }

     switch (it->typecode) {
         case BITSET_CONTAINER_TYPE_CODE: {
             // stepped before bit 0: nothing earlier in this container
             if (--it->in_container_index < 0)
                 break;

             const bitset_container_t* bitset_container = (const bitset_container_t*)it->container;
             int32_t wordindex = it->in_container_index / 64;
             // keep only the bits at or below the new position in this word
             uint64_t word = bitset_container->array[wordindex] & (UINT64_MAX >> (63 - (it->in_container_index % 64)));

             // walk backwards over all-zero words
             while (word == 0 && --wordindex >= 0) {
                 word = bitset_container->array[wordindex];
             }
             if (word == 0)
                 break;

             // highest set bit of the word is the previous value
             int num_leading_zeros = __builtin_clzll(word);
             it->in_container_index = (wordindex * 64) + (63 - num_leading_zeros);
             it->current_value = it->highbits | it->in_container_index;
             return (it->has_value = true);
         }
         case ARRAY_CONTAINER_TYPE_CODE: {
             // in_container_index is a position in the container's array
             if (--it->in_container_index < 0)
                 break;

             const array_container_t* array_container = (const array_container_t*)it->container;
             it->current_value = it->highbits | array_container->array[it->in_container_index];
             return (it->has_value = true);
         }
         case RUN_CONTAINER_TYPE_CODE: {
             // guard the decrement below against wrapping around from zero
             if(it->current_value == 0)
                 return (it->has_value = false);

             const run_container_t* run_container = (const run_container_t*)it->container;
             // still at or after the first value of the current run?
             if (--it->current_value >= (it->highbits | run_container->runs[it->run_index].value)) {
                 return (it->has_value = true);
             }

             if (--it->run_index < 0)
                 break;

             // land on the last value (value + length) of the preceding run
             it->current_value = it->highbits | (run_container->runs[it->run_index].value +
                                                 run_container->runs[it->run_index].length);
             return (it->has_value = true);
         }
         default:
             // if this ever happens, bug!
             assert(false);
     }  // switch (typecode)

     // moving to previous container
     it->container_index--;
     return (it->has_value = loadlastvalue(it));
 }

 /*
  * Copies up to `count` values into `buf`, starting with the iterator's
  * current value, and leaves the iterator positioned on the first value that
  * was not copied.
  *
  * Returns the number of values written. A return value smaller than `count`
  * means the iterator ran out of values (it->has_value is false on return).
  * `buf` must have room for `count` entries.
  */
 uint32_t roaring_read_uint32_iterator(roaring_uint32_iterator_t *it, uint32_t* buf, uint32_t count) {
   uint32_t ret = 0;     // number of values written so far
   uint32_t num_values;  // size of the batch copied in one step
   uint32_t wordindex;  // used for bitsets
   uint64_t word;       // used for bitsets
   const array_container_t* acont; //TODO remove
   const run_container_t* rcont; //TODO remove
   const bitset_container_t* bcont; //TODO remove

   // one pass of this loop drains (part of) the current container
   while (it->has_value && ret < count) {
     switch (it->typecode) {
       case BITSET_CONTAINER_TYPE_CODE:
         bcont = (const bitset_container_t*)(it->container);
         wordindex = it->in_container_index / 64;
         // start with the bits at or above the current position
         word = bcont->array[wordindex] & (UINT64_MAX << (it->in_container_index % 64));
         do {
           // emit set bits from lowest to highest, clearing each one
           while (word != 0 && ret < count) {
             buf[0] = it->highbits | (wordindex * 64 + __builtin_ctzll(word));
             word = word & (word - 1);
             buf++;
             ret++;
           }
           // advance to the next non-zero word, if any
           while (word == 0 && wordindex+1 < BITSET_CONTAINER_SIZE_IN_WORDS) {
             wordindex++;
             word = bcont->array[wordindex];
           }
         } while (word != 0 && ret < count);
         // a non-zero word here holds the next unread value
         it->has_value = (word != 0);
         if (it->has_value) {
           it->in_container_index = wordindex * 64 + __builtin_ctzll(word);
           it->current_value = it->highbits | it->in_container_index;
         }
         break;
       case ARRAY_CONTAINER_TYPE_CODE:
         acont = (const array_container_t *)(it->container);
         // copy whichever is smaller: what is left in the array or in buf
         num_values = minimum_uint32(acont->cardinality - it->in_container_index, count - ret);
         for (uint32_t i = 0; i < num_values; i++) {
           buf[i] = it->highbits | acont->array[it->in_container_index + i];
         }
         buf += num_values;
         ret += num_values;
         it->in_container_index += num_values;
         it->has_value = (it->in_container_index < acont->cardinality);
         if (it->has_value) {
           it->current_value = it->highbits | acont->array[it->in_container_index];
         }
         break;
       case RUN_CONTAINER_TYPE_CODE:
         rcont = (const run_container_t*)(it->container);
         // largest_run_value below is the last value covered by the current run
         do {
           uint32_t largest_run_value = it->highbits | (rcont->runs[it->run_index].value + rcont->runs[it->run_index].length);
           num_values = minimum_uint32(largest_run_value - it->current_value + 1, count - ret);
           for (uint32_t i = 0; i < num_values; i++) {
             buf[i] = it->current_value + i;
           }
           it->current_value += num_values; // this can overflow to zero: UINT32_MAX+1=0
           buf += num_values;
           ret += num_values;

           // run exhausted (or wrapped past UINT32_MAX): move to the next run
           if (it->current_value > largest_run_value || it->current_value == 0) {
             it->run_index++;
             if (it->run_index < rcont->n_runs) {
               it->current_value = it->highbits | rcont->runs[it->run_index].value;
             } else {
               it->has_value = false;
             }
           }
         } while ((ret < count) && it->has_value);
         break;
       default:
         assert(false);
     }
     // the container still has values, so the buffer must be what filled up
     if (it->has_value) {
       assert(ret == count);
       return ret;
     }
     // container drained: continue with the first value of the next one
     it->container_index++;
     it->has_value = loadfirstvalue(it);
   }
   return ret;
 }


 /* Releases the iterator's storage with free(); `it` may be NULL. */
 void roaring_free_uint32_iterator(roaring_uint32_iterator_t *it) {
     free(it);
 }

 /****
 * end of roaring_uint32_iterator_t
 *****/

 bool roaring_bitmap_equals(const roaring_bitmap_t *ra1,
                            const roaring_bitmap_t *ra2) {
     if (ra1->high_low_container.size != ra2->high_low_container.size) {
         return false;
     }
     for (int i = 0; i < ra1->high_low_container.size; ++i) {
         if (ra1->high_low_container.keys[i] !=
             ra2->high_low_container.keys[i]) {
             return false;
         }
     }
     for (int i = 0; i < ra1->high_low_container.size; ++i) {
         bool areequal = container_equals(ra1->high_low_container.containers[i],
                                          ra1->high_low_container.typecodes[i],
                                          ra2->high_low_container.containers[i],
                                          ra2->high_low_container.typecodes[i]);
         if (!areequal) {
             return false;
         }
     }
     return true;
 }

 bool roaring_bitmap_is_subset(const roaring_bitmap_t *ra1,
                               const roaring_bitmap_t *ra2) {
     const int length1 = ra1->high_low_container.size,
               length2 = ra2->high_low_container.size;

     int pos1 = 0, pos2 = 0;

     while (pos1 < length1 && pos2 < length2) {
         const uint16_t s1 = ra_get_key_at_index(&ra1->high_low_container, pos1);
         const uint16_t s2 = ra_get_key_at_index(&ra2->high_low_container, pos2);

         if (s1 == s2) {
             uint8_t container_type_1, container_type_2;
             void *c1 = ra_get_container_at_index(&ra1->high_low_container, pos1,
                                                  &container_type_1);
             void *c2 = ra_get_container_at_index(&ra2->high_low_container, pos2,
                                                  &container_type_2);
             bool subset =
                 container_is_subset(c1, container_type_1, c2, container_type_2);
             if (!subset) return false;
             ++pos1;
             ++pos2;
         } else if (s1 < s2) {  // s1 < s2
             return false;
         } else {  // s1 > s2
             pos2 = ra_advance_until(&ra2->high_low_container, s1, pos2);
         }
     }
     if (pos1 == length1)
         return true;
     else
         return false;
 }

 /*
  * Builds the container for key `hb` with the low-bit range
  * [lb_start, lb_end] flipped relative to `x1_arr`, and inserts it into
  * `ans_arr`. When `x1_arr` has no container for `hb`, the result is a
  * container of ones over that range. A flipped container with zero
  * cardinality is freed instead of inserted.
  *
  * The insertion position is derived as -j - 1 from the ra_get_index()
  * result on `ans_arr`.
  * NOTE(review): this presumes `hb` is not yet present in `ans_arr` - confirm
  * against callers.
  */
 static void insert_flipped_container(roaring_array_t *ans_arr,
                                      const roaring_array_t *x1_arr, uint16_t hb,
                                      uint16_t lb_start, uint16_t lb_end) {
     const int src_idx = ra_get_index(x1_arr, hb);
     const int dst_idx = ra_get_index(ans_arr, hb);
     const int insert_at = -dst_idx - 1;
     const uint32_t first = (uint32_t)lb_start;
     const uint32_t past_last = (uint32_t)(lb_end + 1);
     uint8_t src_type, out_type;

     if (src_idx < 0) {
         void *ones = container_range_of_ones(first, past_last, &out_type);
         ra_insert_new_key_value_at(ans_arr, insert_at, hb, ones, out_type);
         return;
     }

     void *src = ra_get_container_at_index(x1_arr, src_idx, &src_type);
     void *flipped =
         container_not_range(src, src_type, first, past_last, &out_type);
     if (container_get_cardinality(flipped, out_type)) {
         ra_insert_new_key_value_at(ans_arr, insert_at, hb, flipped, out_type);
     } else {
         container_free(flipped, out_type);
     }
 }

 /*
  * Flips the low-bit range [lb_start, lb_end] of the container stored under
  * key `hb` in `x1_arr`, in place. A missing container is replaced by a new
  * container of ones over that range; a container that ends up with zero
  * cardinality is freed and its slot removed.
  */
 static void inplace_flip_container(roaring_array_t *x1_arr, uint16_t hb,
                                    uint16_t lb_start, uint16_t lb_end) {
     const int idx = ra_get_index(x1_arr, hb);
     const uint32_t first = (uint32_t)lb_start;
     const uint32_t past_last = (uint32_t)(lb_end + 1);
     uint8_t in_type, out_type;

     if (idx < 0) {
         void *ones = container_range_of_ones(first, past_last, &out_type);
         ra_insert_new_key_value_at(x1_arr, -idx - 1, hb, ones, out_type);
         return;
     }

     void *current = ra_get_container_at_index(x1_arr, idx, &in_type);
     // if a new container was created, the old one was already freed
     void *flipped =
         container_inot_range(current, in_type, first, past_last, &out_type);
     if (container_get_cardinality(flipped, out_type)) {
         ra_set_container_at_index(x1_arr, idx, flipped, out_type);
         return;
     }
     container_free(flipped, out_type);
     ra_remove_at_index(x1_arr, idx);
 }

 /*
  * Builds the complement of the container `x1_arr` holds for key `hb` and
  * inserts it into `ans_arr`. When `x1_arr` has no such container, a
  * container of ones over [0, 0x10000) is inserted. A complement with zero
  * cardinality is freed instead of inserted.
  *
  * The insertion position is derived as -j - 1 from the ra_get_index()
  * result on `ans_arr`.
  * NOTE(review): this presumes `hb` is not yet present in `ans_arr` - confirm
  * against callers.
  */
 static void insert_fully_flipped_container(roaring_array_t *ans_arr,
                                            const roaring_array_t *x1_arr,
                                            uint16_t hb) {
     const int src_idx = ra_get_index(x1_arr, hb);
     const int dst_idx = ra_get_index(ans_arr, hb);
     const int insert_at = -dst_idx - 1;
     uint8_t src_type, out_type;

     if (src_idx < 0) {
         void *ones = container_range_of_ones(0U, 0x10000U, &out_type);
         ra_insert_new_key_value_at(ans_arr, insert_at, hb, ones, out_type);
         return;
     }

     void *src = ra_get_container_at_index(x1_arr, src_idx, &src_type);
     void *flipped = container_not(src, src_type, &out_type);
     if (container_get_cardinality(flipped, out_type)) {
         ra_insert_new_key_value_at(ans_arr, insert_at, hb, flipped, out_type);
     } else {
         container_free(flipped, out_type);
     }
 }

 /*
  * Complements, in place, the container stored under key `hb` in `x1_arr`.
  * A missing container is replaced by a container of ones over
  * [0, 0x10000); a container that ends up with zero cardinality is freed and
  * its slot removed.
  */
 static void inplace_fully_flip_container(roaring_array_t *x1_arr, uint16_t hb) {
     const int idx = ra_get_index(x1_arr, hb);
     uint8_t in_type, out_type;

     if (idx < 0) {
         void *ones = container_range_of_ones(0U, 0x10000U, &out_type);
         ra_insert_new_key_value_at(x1_arr, -idx - 1, hb, ones, out_type);
         return;
     }

     void *current = ra_get_container_at_index(x1_arr, idx, &in_type);
     void *flipped = container_inot(current, in_type, &out_type);
     if (container_get_cardinality(flipped, out_type)) {
         ra_set_container_at_index(x1_arr, idx, flipped, out_type);
         return;
     }
     container_free(flipped, out_type);
     ra_remove_at_index(x1_arr, idx);
 }

 /*
  * Returns a new bitmap equal to `x1` with every value in the half-open
  * range [range_start, range_end) flipped. `range_end` is clamped to 2^32.
  * An empty range (after clamping) yields a plain copy of `x1`.
  *
  * Containers below the range are copied with ra_append_copies_until(),
  * containers above it with ra_append_copies_after(); the containers in
  * between are rebuilt one key at a time.
  */
 roaring_bitmap_t *roaring_bitmap_flip(const roaring_bitmap_t *x1,
                                       uint64_t range_start,
                                       uint64_t range_end) {
     // Clamp BEFORE the empty-range test. With the test first, a range lying
     // entirely at or above 2^32 passed it, and range_start was then
     // truncated to 16+16 bits, flipping unrelated low values.
     if(range_end >= UINT64_C(0x100000000)) {
         range_end = UINT64_C(0x100000000);
     }
     if (range_start >= range_end) {
         return roaring_bitmap_copy(x1);
     }
     // From here on range_start < range_end <= 2^32, so the 16-bit splits
     // below are exact.

     roaring_bitmap_t *ans = roaring_bitmap_create();
     roaring_bitmap_set_copy_on_write(ans, is_cow(x1));

     uint16_t hb_start = (uint16_t)(range_start >> 16);
     const uint16_t lb_start = (uint16_t)range_start;  // & 0xFFFF;
     uint16_t hb_end = (uint16_t)((range_end - 1) >> 16);
     const uint16_t lb_end = (uint16_t)(range_end - 1);  // & 0xFFFF;

     ra_append_copies_until(&ans->high_low_container, &x1->high_low_container,
                            hb_start, is_cow(x1));
     if (hb_start == hb_end) {
         // the whole range lives in a single container
         insert_flipped_container(&ans->high_low_container,
                                  &x1->high_low_container, hb_start, lb_start,
                                  lb_end);
     } else {
         // start and end containers are distinct
         if (lb_start > 0) {
             // handle first (partial) container
             insert_flipped_container(&ans->high_low_container,
                                      &x1->high_low_container, hb_start,
                                      lb_start, 0xFFFF);
             ++hb_start;  // for the full containers.  Can't wrap.
         }

         if (lb_end != 0xFFFF) --hb_end;  // later we'll handle the partial block

         // 32-bit counter so that hb_end == 0xFFFF cannot make the loop wrap
         for (uint32_t hb = hb_start; hb <= hb_end; ++hb) {
             insert_fully_flipped_container(&ans->high_low_container,
                                            &x1->high_low_container, hb);
         }

         // handle a partial final container
         if (lb_end != 0xFFFF) {
             insert_flipped_container(&ans->high_low_container,
                                      &x1->high_low_container, hb_end + 1, 0,
                                      lb_end);
             ++hb_end;  // restore the real last key for the copy below
         }
     }
     ra_append_copies_after(&ans->high_low_container, &x1->high_low_container,
                            hb_end, is_cow(x1));
     return ans;
 }

 /*
  * Flips, in place, every value of `x1` in the half-open range
  * [range_start, range_end). `range_end` is clamped to 2^32. An empty range
  * (after clamping) leaves `x1` untouched.
  */
 void roaring_bitmap_flip_inplace(roaring_bitmap_t *x1, uint64_t range_start,
                                  uint64_t range_end) {
     // Clamp BEFORE the empty-range test. With the test first, a range lying
     // entirely at or above 2^32 passed it, and range_start was then
     // truncated to 16+16 bits, flipping unrelated low values.
     if(range_end >= UINT64_C(0x100000000)) {
         range_end = UINT64_C(0x100000000);
     }
     if (range_start >= range_end) {
         return;  // empty range
     }
     // From here on range_start < range_end <= 2^32, so the 16-bit splits
     // below are exact.

     uint16_t hb_start = (uint16_t)(range_start >> 16);
     const uint16_t lb_start = (uint16_t)range_start;
     uint16_t hb_end = (uint16_t)((range_end - 1) >> 16);
     const uint16_t lb_end = (uint16_t)(range_end - 1);

     if (hb_start == hb_end) {
         // the whole range lives in a single container
         inplace_flip_container(&x1->high_low_container, hb_start, lb_start,
                                lb_end);
     } else {
         // start and end containers are distinct
         if (lb_start > 0) {
             // handle first (partial) container
             inplace_flip_container(&x1->high_low_container, hb_start, lb_start,
                                    0xFFFF);
             ++hb_start;  // for the full containers.  Can't wrap.
         }

         if (lb_end != 0xFFFF) --hb_end;  // the partial last block comes later

         // 32-bit counter so that hb_end == 0xFFFF cannot make the loop wrap
         for (uint32_t hb = hb_start; hb <= hb_end; ++hb) {
             inplace_fully_flip_container(&x1->high_low_container, hb);
         }
         // handle a partial final container
         if (lb_end != 0xFFFF) {
             inplace_flip_container(&x1->high_low_container, hb_end + 1, 0,
                                    lb_end);
         }
     }
 }

 /*
  * Builds a new bitmap holding the union of `x1` and `x2`, combining
  * containers with the "lazy" container helpers.
  *
  * If either input has no containers, a copy of the other is returned.
  * Otherwise both key arrays are walked in step:
  *   - equal keys: the two containers are combined with container_lazy_or(),
  *     or, when `bitsetconversion` is set and neither container is a bitset,
  *     by converting the first to a bitset and applying container_lazy_ior();
  *   - a key present on one side only: that side's container is obtained
  *     through get_copy_of_container() and appended.
  * Whatever remains on one side after the other is exhausted is appended
  * with ra_append_copy_range().
  *
  * NOTE(review): the "lazy" helpers presumably leave some bookkeeping (e.g.
  * cardinalities) to be repaired by the caller afterwards - confirm against
  * the public header.
  */
 roaring_bitmap_t *roaring_bitmap_lazy_or(const roaring_bitmap_t *x1,
                                          const roaring_bitmap_t *x2,
                                          const bool bitsetconversion) {
     uint8_t container_result_type = 0;
     const int length1 = x1->high_low_container.size,
               length2 = x2->high_low_container.size;
     if (0 == length1) {
         return roaring_bitmap_copy(x2);
     }
     if (0 == length2) {
         return roaring_bitmap_copy(x1);
     }
     roaring_bitmap_t *answer =
         roaring_bitmap_create_with_capacity(length1 + length2);
     // the result is copy-on-write only if both inputs are
     roaring_bitmap_set_copy_on_write(answer, is_cow(x1) && is_cow(x2));
     int pos1 = 0, pos2 = 0;
     uint8_t container_type_1, container_type_2;
     uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
     uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
     // merge loop: exits as soon as either side runs out of containers
     while (true) {
         if (s1 == s2) {
             void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
                                                  &container_type_1);
             void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
                                                  &container_type_2);
             void *c;
             if (bitsetconversion && (get_container_type(c1, container_type_1) !=
                                      BITSET_CONTAINER_TYPE_CODE) &&
                 (get_container_type(c2, container_type_2) !=
                  BITSET_CONTAINER_TYPE_CODE)) {
                 // neither side is a bitset: build a bitset from c1 and OR c2
                 // into it in place
                 void *newc1 =
                     container_mutable_unwrap_shared(c1, &container_type_1);
                 newc1 = container_to_bitset(newc1, container_type_1);
                 container_type_1 = BITSET_CONTAINER_TYPE_CODE;
                 c = container_lazy_ior(newc1, container_type_1, c2,
                                        container_type_2,
                                        &container_result_type);
                 if (c != newc1) {  // should not happen
                     container_free(newc1, container_type_1);
                 }
             } else {
                 c = container_lazy_or(c1, container_type_1, c2,
                                       container_type_2, &container_result_type);
             }
             // since we assume that the initial containers are non-empty,
             // the result here can only be non-empty
             ra_append(&answer->high_low_container, s1, c,
                       container_result_type);
             ++pos1;
             ++pos2;
             if (pos1 == length1) break;
             if (pos2 == length2) break;
             s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
             s2 = ra_get_key_at_index(&x2->high_low_container, pos2);

         } else if (s1 < s2) {  // s1 < s2
             // key only in x1: carry its container over
             void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
                                                  &container_type_1);
             c1 =
                 get_copy_of_container(c1, &container_type_1, is_cow(x1));
             if (is_cow(x1)) {
                 // store what get_copy_of_container() returned back into x1
                 ra_set_container_at_index(&x1->high_low_container, pos1, c1,
                                           container_type_1);
             }
             ra_append(&answer->high_low_container, s1, c1, container_type_1);
             pos1++;
             if (pos1 == length1) break;
             s1 = ra_get_key_at_index(&x1->high_low_container, pos1);

         } else {  // s1 > s2
             // key only in x2: carry its container over
             void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
                                                  &container_type_2);
             c2 =
                 get_copy_of_container(c2, &container_type_2, is_cow(x2));
             if (is_cow(x2)) {
                 // store what get_copy_of_container() returned back into x2
                 ra_set_container_at_index(&x2->high_low_container, pos2, c2,
                                           container_type_2);
             }
             ra_append(&answer->high_low_container, s2, c2, container_type_2);
             pos2++;
             if (pos2 == length2) break;
             s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
         }
     }
     // append the tail of whichever side still has containers
     if (pos1 == length1) {
         ra_append_copy_range(&answer->high_low_container,
                              &x2->high_low_container, pos2, length2,
                              is_cow(x2));
     } else if (pos2 == length2) {
         ra_append_copy_range(&answer->high_low_container,
                              &x1->high_low_container, pos1, length1,
                              is_cow(x1));
     }
     return answer;
 }

 /*
  * Merges `x2` into `x1` (union), combining containers with
  * container_lazy_ior().
  *
  * If `x2` has no containers nothing happens; if `x1` has none it is
  * overwritten with `x2`. Otherwise both key arrays are walked in step:
  *   - equal keys: unless container_is_full() reports x1's container as
  *     full, x2's container is OR-ed into it (after converting x1's
  *     container to a bitset when `bitsetconversion` is set and it is not
  *     one already);
  *   - key only in x1: skipped;
  *   - key only in x2: the container is obtained through
  *     get_copy_of_container() and inserted into x1 at the current position.
  * Containers left in `x2` once `x1` is exhausted are appended with
  * ra_append_copy_range().
  *
  * NOTE(review): the "lazy" helpers presumably leave some bookkeeping (e.g.
  * cardinalities) to be repaired by the caller afterwards - confirm against
  * the public header.
  */
 void roaring_bitmap_lazy_or_inplace(roaring_bitmap_t *x1,
                                     const roaring_bitmap_t *x2,
                                     const bool bitsetconversion) {
     uint8_t container_result_type = 0;
     int length1 = x1->high_low_container.size;  // grows as keys are inserted
     const int length2 = x2->high_low_container.size;

     if (0 == length2) return;

     if (0 == length1) {
         roaring_bitmap_overwrite(x1, x2);
         return;
     }
     int pos1 = 0, pos2 = 0;
     uint8_t container_type_1, container_type_2;
     uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
     uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
     // merge loop: exits as soon as either side runs out of containers
     while (true) {
         if (s1 == s2) {
             void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
                                                  &container_type_1);
             // a container already reported as full is left untouched
             if (!container_is_full(c1, container_type_1)) {
                 if ((bitsetconversion == false) ||
                     (get_container_type(c1, container_type_1) ==
                      BITSET_CONTAINER_TYPE_CODE)) {
                     c1 = get_writable_copy_if_shared(c1, &container_type_1);
                 } else {
                     // convert to bitset
                     void *oldc1 = c1;
                     uint8_t oldt1 = container_type_1;
                     c1 = container_mutable_unwrap_shared(c1, &container_type_1);
                     c1 = container_to_bitset(c1, container_type_1);
                     // the pre-conversion container is released here
                     container_free(oldc1, oldt1);
                     container_type_1 = BITSET_CONTAINER_TYPE_CODE;
                 }

                 void *c2 = ra_get_container_at_index(&x2->high_low_container,
                                                      pos2, &container_type_2);
                 void *c = container_lazy_ior(c1, container_type_1, c2,
                                              container_type_2,
                                              &container_result_type);
                 if (c !=
                     c1) {  // in this instance a new container was created, and
                            // we need to free the old one
                     container_free(c1, container_type_1);
                 }

                 ra_set_container_at_index(&x1->high_low_container, pos1, c,
                                           container_result_type);
             }
             ++pos1;
             ++pos2;
             if (pos1 == length1) break;
             if (pos2 == length2) break;
             s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
             s2 = ra_get_key_at_index(&x2->high_low_container, pos2);

         } else if (s1 < s2) {  // s1 < s2
             // key only in x1: nothing to merge, move on
             pos1++;
             if (pos1 == length1) break;
             s1 = ra_get_key_at_index(&x1->high_low_container, pos1);

         } else {  // s1 > s2
             // key only in x2: insert its container into x1 before pos1
             void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
                                                  &container_type_2);
             // void *c2_clone = container_clone(c2, container_type_2);
             c2 =
                 get_copy_of_container(c2, &container_type_2, is_cow(x2));
             if (is_cow(x2)) {
                 // store what get_copy_of_container() returned back into x2
                 ra_set_container_at_index(&x2->high_low_container, pos2, c2,
                                           container_type_2);
             }
             ra_insert_new_key_value_at(&x1->high_low_container, pos1, s2, c2,
                                        container_type_2);
             // step over the inserted entry; s1 still names the same key
             pos1++;
             length1++;
             pos2++;
             if (pos2 == length2) break;
             s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
         }
     }
     // x1 exhausted first: append whatever is left of x2
     if (pos1 == length1) {
         ra_append_copy_range(&x1->high_low_container, &x2->high_low_container,
                              pos2, length2, is_cow(x2));
     }
 }

 /*
  * Builds a new bitmap holding the symmetric difference of `x1` and `x2`,
  * combining containers with container_lazy_xor().
  *
  * If either input has no containers, a copy of the other is returned.
  * Otherwise both key arrays are walked in step:
  *   - equal keys: the containers are XOR-ed; the result is appended only
  *     when container_nonzero_cardinality() holds, otherwise it is freed;
  *   - a key present on one side only: that side's container is obtained
  *     through get_copy_of_container() and appended.
  * Whatever remains on one side after the other is exhausted is appended
  * with ra_append_copy_range().
  *
  * NOTE(review): the "lazy" helpers presumably leave some bookkeeping (e.g.
  * cardinalities) to be repaired by the caller afterwards - confirm against
  * the public header.
  */
 roaring_bitmap_t *roaring_bitmap_lazy_xor(const roaring_bitmap_t *x1,
                                           const roaring_bitmap_t *x2) {
     uint8_t container_result_type = 0;
     const int length1 = x1->high_low_container.size,
               length2 = x2->high_low_container.size;
     if (0 == length1) {
         return roaring_bitmap_copy(x2);
     }
     if (0 == length2) {
         return roaring_bitmap_copy(x1);
     }
     roaring_bitmap_t *answer =
         roaring_bitmap_create_with_capacity(length1 + length2);
     // the result is copy-on-write only if both inputs are
     roaring_bitmap_set_copy_on_write(answer, is_cow(x1) && is_cow(x2));
     int pos1 = 0, pos2 = 0;
     uint8_t container_type_1, container_type_2;
     uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
     uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
     // merge loop: exits as soon as either side runs out of containers
     while (true) {
         if (s1 == s2) {
             void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
                                                  &container_type_1);
             void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
                                                  &container_type_2);
             void *c =
                 container_lazy_xor(c1, container_type_1, c2, container_type_2,
                                    &container_result_type);

             // unlike OR, XOR of two non-empty containers can come out empty
             if (container_nonzero_cardinality(c, container_result_type)) {
                 ra_append(&answer->high_low_container, s1, c,
                           container_result_type);
             } else {
                 container_free(c, container_result_type);
             }

             ++pos1;
             ++pos2;
             if (pos1 == length1) break;
             if (pos2 == length2) break;
             s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
             s2 = ra_get_key_at_index(&x2->high_low_container, pos2);

         } else if (s1 < s2) {  // s1 < s2
             // key only in x1: carry its container over
             void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
                                                  &container_type_1);
             c1 =
                 get_copy_of_container(c1, &container_type_1, is_cow(x1));
             if (is_cow(x1)) {
                 // store what get_copy_of_container() returned back into x1
                 ra_set_container_at_index(&x1->high_low_container, pos1, c1,
                                           container_type_1);
             }
             ra_append(&answer->high_low_container, s1, c1, container_type_1);
             pos1++;
             if (pos1 == length1) break;
             s1 = ra_get_key_at_index(&x1->high_low_container, pos1);

         } else {  // s1 > s2
             // key only in x2: carry its container over
             void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
                                                  &container_type_2);
             c2 =
                 get_copy_of_container(c2, &container_type_2, is_cow(x2));
             if (is_cow(x2)) {
                 // store what get_copy_of_container() returned back into x2
                 ra_set_container_at_index(&x2->high_low_container, pos2, c2,
                                           container_type_2);
             }
             ra_append(&answer->high_low_container, s2, c2, container_type_2);
             pos2++;
             if (pos2 == length2) break;
             s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
         }
     }
     // append the tail of whichever side still has containers
     if (pos1 == length1) {
         ra_append_copy_range(&answer->high_low_container,
                              &x2->high_low_container, pos2, length2,
                              is_cow(x2));
     } else if (pos2 == length2) {
         ra_append_copy_range(&answer->high_low_container,
                              &x1->high_low_container, pos1, length1,
                              is_cow(x1));
     }
     return answer;
 }

 /*
  * In-place lazy XOR of x2 into x1.
  *
  * Both key arrays are walked together in ascending key order:
  *  - key present in both bitmaps: the x1 container is replaced by the result
  *    of container_lazy_ixor(); if that result has zero cardinality the entry
  *    is removed from x1 instead;
  *  - key present only in x1: the entry is left untouched;
  *  - key present only in x2: a copy of the x2 container is inserted into x1.
  * Whatever remains of x2 once x1 is exhausted is appended to x1.
  *
  * x1 and x2 must be distinct objects (asserted).
  * NOTE(review): as the "lazy" name suggests, callers are presumably expected
  * to call roaring_bitmap_repair_after_lazy() on x1 afterwards -- confirm
  * against the public header.
  */
 void roaring_bitmap_lazy_xor_inplace(roaring_bitmap_t *x1,
                                      const roaring_bitmap_t *x2) {
     assert(x1 != x2);
     uint8_t container_result_type = 0;
     int length1 = x1->high_low_container.size;
     const int length2 = x2->high_low_container.size;

     // XOR with an empty bitmap leaves x1 unchanged.
     if (0 == length2) return;

     // XOR of an empty x1 with x2 is just x2.
     if (0 == length1) {
         roaring_bitmap_overwrite(x1, x2);
         return;
     }
     int pos1 = 0, pos2 = 0;
     uint8_t container_type_1, container_type_2;
     uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
     uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
     while (true) {
         if (s1 == s2) {
             void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
                                                  &container_type_1);
             // c1 is about to be modified in place, so obtain a writable copy
             // first if it is shared.
             c1 = get_writable_copy_if_shared(c1, &container_type_1);
             void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
                                                  &container_type_2);
             void *c =
                 container_lazy_ixor(c1, container_type_1, c2, container_type_2,
                                     &container_result_type);
             if (container_nonzero_cardinality(c, container_result_type)) {
                 ra_set_container_at_index(&x1->high_low_container, pos1, c,
                                           container_result_type);
                 ++pos1;
             } else {
                 // Empty result: drop the entry. pos1 stays put because the
                 // following entries shift down by one.
                 container_free(c, container_result_type);
                 ra_remove_at_index(&x1->high_low_container, pos1);
                 --length1;
             }
             ++pos2;
             if (pos1 == length1) break;
             if (pos2 == length2) break;
             s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
             s2 = ra_get_key_at_index(&x2->high_low_container, pos2);

         } else if (s1 < s2) {  // s1 < s2
             // Key only in x1: nothing to do for this entry.
             pos1++;
             if (pos1 == length1) break;
             s1 = ra_get_key_at_index(&x1->high_low_container, pos1);

         } else {  // s1 > s2
             void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
                                                  &container_type_2);
             // Key only in x2: obtain a copy of its container for x1. When x2
             // is copy-on-write, the container returned by
             // get_copy_of_container() is also stored back into x2.
             c2 =
                 get_copy_of_container(c2, &container_type_2, is_cow(x2));
             if (is_cow(x2)) {
                 ra_set_container_at_index(&x2->high_low_container, pos2, c2,
                                           container_type_2);
             }
             ra_insert_new_key_value_at(&x1->high_low_container, pos1, s2, c2,
                                        container_type_2);
             // Step over the entry just inserted; the old s1 entry is now at
             // the new pos1.
             pos1++;
             length1++;
             pos2++;
             if (pos2 == length2) break;
             s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
         }
     }
     // x1 exhausted first: append whatever is left of x2.
     if (pos1 == length1) {
         ra_append_copy_range(&x1->high_low_container, &x2->high_low_container,
                              pos2, length2, is_cow(x2));
     }
 }

 /*
  * Passes every container of `ra` through container_repair_after_lazy() and
  * stores the returned container and (possibly updated) type code back into
  * the same slot.
  */
 void roaring_bitmap_repair_after_lazy(roaring_bitmap_t *ra) {
     int slot = 0;
     while (slot < ra->high_low_container.size) {
         uint8_t typecode = ra->high_low_container.typecodes[slot];
         void *repaired = container_repair_after_lazy(
             ra->high_low_container.containers[slot], &typecode);
         ra->high_low_container.containers[slot] = repaired;
         ra->high_low_container.typecodes[slot] = typecode;
         ++slot;
     }
 }


 /**
 * roaring_bitmap_rank returns the number of integers in the bitmap that are
 * smaller than or equal to x.
 *
 * Containers whose key is below the high 16 bits of x contribute their full
 * cardinality; the container whose key matches contributes the rank of the
 * low 16 bits of x; the scan stops at the first key that is not below.
 */
 uint64_t roaring_bitmap_rank(const roaring_bitmap_t *bm, uint32_t x) {
     const uint32_t high_bits = x >> 16;
     uint64_t total = 0;
     for (int idx = 0; idx < bm->high_low_container.size; idx++) {
         const uint32_t key = bm->high_low_container.keys[idx];
         if (key > high_bits) {
             break;  // keys are walked in order; nothing further can count
         }
         if (key == high_bits) {
             total += container_rank(bm->high_low_container.containers[idx],
                                     bm->high_low_container.typecodes[idx],
                                     x & 0xFFFF);
             break;
         }
         total += container_get_cardinality(
             bm->high_low_container.containers[idx],
             bm->high_low_container.typecodes[idx]);
     }
     return total;
 }

 /**
 * roaring_bitmap_minimum returns the smallest value in the set.
 * Returns UINT32_MAX if the set is empty.
 */
 uint32_t roaring_bitmap_minimum(const roaring_bitmap_t *bm) {
     if (bm->high_low_container.size <= 0) {
         return UINT32_MAX;
     }
     // The first slot holds the overall minimum: its key gives the high
     // 16 bits, the container's own minimum gives the low 16 bits.
     const uint32_t high = bm->high_low_container.keys[0];
     const uint32_t low =
         container_minimum(bm->high_low_container.containers[0],
                           bm->high_low_container.typecodes[0]);
     return low | (high << 16);
 }

 /**
 * roaring_bitmap_maximum returns the greatest value in the set.
 * Returns 0 if the set is empty.
 */
 uint32_t roaring_bitmap_maximum(const roaring_bitmap_t *bm) {
     if (bm->high_low_container.size <= 0) {
         return 0;
     }
     // The last slot holds the overall maximum: its key gives the high
     // 16 bits, the container's own maximum gives the low 16 bits.
     const int last = bm->high_low_container.size - 1;
     const uint32_t high = bm->high_low_container.keys[last];
     const uint32_t low =
         container_maximum(bm->high_low_container.containers[last],
                           bm->high_low_container.typecodes[last]);
     return low | (high << 16);
 }

 /**
  * Looks up the element of the given rank and, when found, stores it in
  * *element and returns true; returns false otherwise.
  *
  * Containers are visited in index order and each one is handed to
  * container_select() together with a running `start_rank`, until one of
  * them reports success. On success the container writes the low 16 bits
  * into *element and the container's key is OR-ed in as the high 16 bits.
  * NOTE(review): presumably container_select() advances start_rank by the
  * container's cardinality when `rank` is not inside it -- confirm against
  * its definition.
  */
 bool roaring_bitmap_select(const roaring_bitmap_t *bm, uint32_t rank,
                            uint32_t *element) {
     void *container;
     uint8_t typecode;
     uint16_t key;
     uint32_t start_rank = 0;
     int i = 0;
     bool valid = false;
     while (!valid && i < bm->high_low_container.size) {
         container = bm->high_low_container.containers[i];
         typecode = bm->high_low_container.typecodes[i];
         valid =
             container_select(container, typecode, &start_rank, rank, element);
         i++;
     }

     if (valid) {
         // i was incremented past the matching container.
         key = bm->high_low_container.keys[i - 1];
         // Widen before shifting: a uint16_t promotes to (signed) int, and
         // shifting a key >= 0x8000 left by 16 would overflow it.
         *element |= ((uint32_t)key << 16);
         return true;
     } else
         return false;
 }

 /**
  * Returns true if x1 and x2 have at least one value in common.
  *
  * Both key arrays are walked together; only containers stored under the
  * same key are compared, and the function returns as soon as
  * container_intersect() reports a non-empty intersection.
  */
 bool roaring_bitmap_intersect(const roaring_bitmap_t *x1,
                                      const roaring_bitmap_t *x2) {
     const int length1 = x1->high_low_container.size,
               length2 = x2->high_low_container.size;
     int pos1 = 0, pos2 = 0;

     while (pos1 < length1 && pos2 < length2) {
         const uint16_t s1 = ra_get_key_at_index(& x1->high_low_container, pos1);
         const uint16_t s2 = ra_get_key_at_index(& x2->high_low_container, pos2);

         if (s1 == s2) {
             uint8_t container_type_1, container_type_2;
             void *c1 = ra_get_container_at_index(& x1->high_low_container, pos1,
                                                  &container_type_1);
             void *c2 = ra_get_container_at_index(& x2->high_low_container, pos2,
                                                  &container_type_2);
             if( container_intersect(c1, container_type_1, c2, container_type_2) ) return true;
             ++pos1;
             ++pos2;
         } else if (s1 < s2) {  // s1 < s2
             pos1 = ra_advance_until(& x1->high_low_container, s2, pos1);
         } else {  // s1 > s2
             pos2 = ra_advance_until(& x2->high_low_container, s1, pos2);
         }
     }
     // No pair of same-key containers intersects.
     return false;
 }


 /**
  * Computes the number of values present in both x1 and x2 without building
  * the intersection: the key arrays are walked together and
  * container_and_cardinality() is summed over containers sharing a key.
  */
 uint64_t roaring_bitmap_and_cardinality(const roaring_bitmap_t *x1,
                                         const roaring_bitmap_t *x2) {
     const int n1 = x1->high_low_container.size;
     const int n2 = x2->high_low_container.size;
     uint64_t total = 0;
     int i1 = 0;
     int i2 = 0;

     while (i1 < n1 && i2 < n2) {
         const uint16_t k1 = ra_get_key_at_index(&x1->high_low_container, i1);
         const uint16_t k2 = ra_get_key_at_index(&x2->high_low_container, i2);

         if (k1 < k2) {
             // Skip ahead in x1 to the first key that is not below k2.
             i1 = ra_advance_until(&x1->high_low_container, k2, i1);
             continue;
         }
         if (k1 > k2) {
             // Skip ahead in x2 to the first key that is not below k1.
             i2 = ra_advance_until(&x2->high_low_container, k1, i2);
             continue;
         }

         uint8_t type1, type2;
         void *left = ra_get_container_at_index(&x1->high_low_container, i1,
                                                &type1);
         void *right = ra_get_container_at_index(&x2->high_low_container, i2,
                                                 &type2);
         total += container_and_cardinality(left, type1, right, type2);
         ++i1;
         ++i2;
     }
     return total;
 }

 /**
  * Returns |x1 AND x2| / |x1 OR x2|, with the union size derived as
  * |x1| + |x2| - |x1 AND x2|. No special handling is done when both bitmaps
  * are empty (the division is then 0.0 / 0.0).
  */
 double roaring_bitmap_jaccard_index(const roaring_bitmap_t *x1,
                                     const roaring_bitmap_t *x2) {
     const uint64_t card1 = roaring_bitmap_get_cardinality(x1);
     const uint64_t card2 = roaring_bitmap_get_cardinality(x2);
     const uint64_t shared = roaring_bitmap_and_cardinality(x1, x2);
     const uint64_t combined = card1 + card2 - shared;
     return (double)shared / (double)combined;
 }

 /**
  * Returns the size of the union of x1 and x2, computed by
  * inclusion-exclusion: |x1| + |x2| - |x1 AND x2|.
  */
 uint64_t roaring_bitmap_or_cardinality(const roaring_bitmap_t *x1,
                                        const roaring_bitmap_t *x2) {
     const uint64_t card1 = roaring_bitmap_get_cardinality(x1);
     const uint64_t card2 = roaring_bitmap_get_cardinality(x2);
     const uint64_t shared = roaring_bitmap_and_cardinality(x1, x2);
     const uint64_t both = card1 + card2;
     return both - shared;
 }

 /**
  * Returns the number of values that are in x1 but not in x2, computed as
  * |x1| - |x1 AND x2|.
  */
 uint64_t roaring_bitmap_andnot_cardinality(const roaring_bitmap_t *x1,
                                            const roaring_bitmap_t *x2) {
     const uint64_t card1 = roaring_bitmap_get_cardinality(x1);
     const uint64_t shared = roaring_bitmap_and_cardinality(x1, x2);
     const uint64_t only_in_x1 = card1 - shared;
     return only_in_x1;
 }

 /**
  * Returns the size of the symmetric difference of x1 and x2, computed as
  * |x1| + |x2| - 2 * |x1 AND x2|.
  */
 uint64_t roaring_bitmap_xor_cardinality(const roaring_bitmap_t *x1,
                                         const roaring_bitmap_t *x2) {
     const uint64_t card1 = roaring_bitmap_get_cardinality(x1);
     const uint64_t card2 = roaring_bitmap_get_cardinality(x2);
     const uint64_t shared = roaring_bitmap_and_cardinality(x1, x2);
     const uint64_t both = card1 + card2;
     return both - 2 * shared;
 }


 /**
  * Check whether a range of values from range_start (included) to range_end
  * (excluded) is present.
  *
  * range_end is clamped to 2^32. An empty range is always reported as
  * contained. For the range to be present, every key from the one holding
  * range_start to the one holding range_end - 1 must exist, the first
  * container must hold [low(range_start), 2^16), the last one
  * [0, low(range_end - 1)], and every container in between must be full.
  */
 bool roaring_bitmap_contains_range(const roaring_bitmap_t *r, uint64_t range_start, uint64_t range_end) {
     if(range_end >= UINT64_C(0x100000000)) {
         range_end = UINT64_C(0x100000000);
     }
     if (range_start >= range_end) return true;  // empty range are always contained!
     if (range_end - range_start == 1) return roaring_bitmap_contains(r, (uint32_t)range_start);
     uint16_t hb_rs = (uint16_t)(range_start >> 16);
     uint16_t hb_re = (uint16_t)((range_end - 1) >> 16);
     const int32_t span = hb_re - hb_rs;
     const int32_t hlc_sz = ra_get_size(&r->high_low_container);
     // Not enough containers to cover every key of the range.
     if (hlc_sz < span + 1) {
       return false;
     }
     int32_t is = ra_get_index(&r->high_low_container, hb_rs);
     int32_t ie = ra_get_index(&r->high_low_container, hb_re);
     // Both end keys must actually exist. A negative index means the key is
     // absent; it must not be turned into an insertion point, because that
     // slot belongs to a different key (or lies one past the array), which
     // would yield a wrong answer or an out-of-bounds read.
     // (ie - is) == span additionally guarantees that no key in between is
     // missing.
     if ((is < 0) || (ie < 0) || ((ie - is) != span)) {
        return false;
     }
     const uint32_t lb_rs = range_start & 0xFFFF;
     const uint32_t lb_re = ((range_end - 1) & 0xFFFF) + 1;
     uint8_t typecode;
     void *container = ra_get_container_at_index(&r->high_low_container, is, &typecode);
     if (hb_rs == hb_re) {
       return container_contains_range(container, lb_rs, lb_re, typecode);
     }
     if (!container_contains_range(container, lb_rs, 1 << 16, typecode)) {
       return false;
     }
     assert(ie < hlc_sz); // would indicate an algorithmic bug
     container = ra_get_container_at_index(&r->high_low_container, ie, &typecode);
     if (!container_contains_range(container, 0, lb_re, typecode)) {
         return false;
     }
     // Every container strictly between the two ends must be completely full.
     for (int32_t i = is + 1; i < ie; ++i) {
         container = ra_get_container_at_index(&r->high_low_container, i, &typecode);
         if (!container_is_full(container, typecode) ) {
           return false;
         }
     }
     return true;
 }


 /**
  * Returns true if ra1 is a subset of ra2 and ra2 has strictly more elements
  * than ra1. The cardinality comparison is evaluated first; the subset test
  * only runs when it succeeds.
  */
 bool roaring_bitmap_is_strict_subset(const roaring_bitmap_t *ra1,
                                             const roaring_bitmap_t *ra2) {
     if (!(roaring_bitmap_get_cardinality(ra2) >
           roaring_bitmap_get_cardinality(ra1))) {
         return false;
     }
     return roaring_bitmap_is_subset(ra1, ra2);
 }


 /*
  * FROZEN SERIALIZATION FORMAT DESCRIPTION
  *
  * -- (beginning must be aligned by 32 bytes) --
  * <bitset_data> uint64_t[BITSET_CONTAINER_SIZE_IN_WORDS * num_bitset_containers]
  * <run_data>    rle16_t[total number of rle elements in all run containers]
  * <array_data>  uint16_t[total number of array elements in all array containers]
  * <keys>        uint16_t[num_containers]
  * <counts>      uint16_t[num_containers]
  * <typecodes>   uint8_t[num_containers]
  * <header>      uint32_t
  *
  * <header> is a 4-byte value which is a bit union of FROZEN_COOKIE (15 bits)
  * and the number of containers (17 bits).
  *
  * <counts> stores number of elements for every container.
  * Its meaning depends on container type.
  * For array and bitset containers, this value is the container cardinality minus one.
  * For run container, it is the number of rle_t elements (n_runs).
  *
  * <bitset_data>,<array_data>,<run_data> are flat arrays of elements of
  * all containers of respective type.
  *
  * <*_data> and <keys> are kept close together because they are not accessed
  * during deserialization. This may reduce IO in case of large mapped bitmaps.
  * All members have their native alignments during deserialization except <header>,
  * which is not guaranteed to be aligned by 4 bytes.
  */

 /**
  * Returns the number of bytes roaring_bitmap_frozen_serialize() writes for
  * `rb`: the payload of every container, plus 5 bytes per container
  * (2-byte key, 2-byte count, 1-byte typecode), plus the 4-byte header.
  */
 size_t roaring_bitmap_frozen_size_in_bytes(const roaring_bitmap_t *rb) {
     const roaring_array_t *ra = &rb->high_low_container;
     size_t total = 0;
     for (int32_t idx = 0; idx < ra->size; idx++) {
         const uint8_t typecode = ra->typecodes[idx];
         if (typecode == BITSET_CONTAINER_TYPE_CODE) {
             // Bitsets always take the same fixed number of words.
             total += BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t);
         } else if (typecode == RUN_CONTAINER_TYPE_CODE) {
             const run_container_t *run =
                     (const run_container_t *) ra->containers[idx];
             total += run->n_runs * sizeof(rle16_t);
         } else if (typecode == ARRAY_CONTAINER_TYPE_CODE) {
             const array_container_t *array =
                     (const array_container_t *) ra->containers[idx];
             total += array->cardinality * sizeof(uint16_t);
         } else {
             __builtin_unreachable();
         }
     }
     total += (2 + 2 + 1) * ra->size; // keys, counts, typecodes
     total += 4; // header
     return total;
 }

 /*
  * Bump allocator step: returns the current arena position and advances
  * *arena by num_bytes. No bounds or alignment checks are performed.
  */
 inline static void *arena_alloc(char **arena, size_t num_bytes) {
     char *const start = *arena;
     *arena = start + num_bytes;
     return start;
 }

 /**
  * Writes `rb` into `buf` using the frozen layout: bitset data, run data,
  * array data, keys, counts, typecodes, then the 4-byte header.
  *
  * The function does no bounds checking on `buf`; the zone sizes it computes
  * add up to the value returned by roaring_bitmap_frozen_size_in_bytes().
  */
 void roaring_bitmap_frozen_serialize(const roaring_bitmap_t *rb, char *buf) {
     /*
      * Note: we do not require the user to supply a specifically aligned buffer.
      * Thus we have to use memcpy() everywhere.
      */

     const roaring_array_t *ra = &rb->high_low_container;

     // First pass: size of each of the three data zones.
     size_t bitset_zone_size = 0;
     size_t run_zone_size = 0;
     size_t array_zone_size = 0;
     for (int32_t i = 0; i < ra->size; i++) {
         switch (ra->typecodes[i]) {
             case BITSET_CONTAINER_TYPE_CODE: {
                 bitset_zone_size +=
                         BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t);
                 break;
             }
             case RUN_CONTAINER_TYPE_CODE: {
                 const run_container_t *run =
                         (const run_container_t *) ra->containers[i];
                 run_zone_size += run->n_runs * sizeof(rle16_t);
                 break;
             }
             case ARRAY_CONTAINER_TYPE_CODE: {
                 const array_container_t *array =
                         (const array_container_t *) ra->containers[i];
                 array_zone_size += array->cardinality * sizeof(uint16_t);
                 break;
             }
             default:
                 __builtin_unreachable();
         }
     }

     // Carve the output buffer into consecutive zones, in layout order.
     uint64_t *bitset_zone = (uint64_t *)arena_alloc(&buf, bitset_zone_size);
     rle16_t *run_zone = (rle16_t *)arena_alloc(&buf, run_zone_size);
     uint16_t *array_zone = (uint16_t *)arena_alloc(&buf, array_zone_size);
     uint16_t *key_zone = (uint16_t *)arena_alloc(&buf, 2*ra->size);
     uint16_t *count_zone = (uint16_t *)arena_alloc(&buf, 2*ra->size);
     uint8_t *typecode_zone = (uint8_t *)arena_alloc(&buf, ra->size);
     uint32_t *header_zone = (uint32_t *)arena_alloc(&buf, 4);

     // Second pass: copy each container's payload into its zone and record
     // its 16-bit count.
     for (int32_t i = 0; i < ra->size; i++) {
         uint16_t count;
         switch (ra->typecodes[i]) {
             case BITSET_CONTAINER_TYPE_CODE: {
                 const bitset_container_t *bitset =
                         (const bitset_container_t *) ra->containers[i];
                 memcpy(bitset_zone, bitset->array,
                        BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t));
                 bitset_zone += BITSET_CONTAINER_SIZE_IN_WORDS;
                 // Stored as cardinality - 1; recompute the cardinality when
                 // the container does not carry a known value.
                 if (bitset->cardinality != BITSET_UNKNOWN_CARDINALITY) {
                     count = bitset->cardinality - 1;
                 } else {
                     count = bitset_container_compute_cardinality(bitset) - 1;
                 }
                 break;
             }
             case RUN_CONTAINER_TYPE_CODE: {
                 const run_container_t *run =
                         (const run_container_t *) ra->containers[i];
                 size_t num_bytes = run->n_runs * sizeof(rle16_t);
                 memcpy(run_zone, run->runs, num_bytes);
                 run_zone += run->n_runs;
                 // For run containers the count is the number of runs.
                 count = run->n_runs;
                 break;
             }
             case ARRAY_CONTAINER_TYPE_CODE: {
                 const array_container_t *array =
                         (const array_container_t *) ra->containers[i];
                 size_t num_bytes = array->cardinality * sizeof(uint16_t);
                 memcpy(array_zone, array->array, num_bytes);
                 array_zone += array->cardinality;
                 // Stored as cardinality - 1.
                 count = array->cardinality - 1;
                 break;
             }
             default:
                 __builtin_unreachable();
         }
         memcpy(&count_zone[i], &count, 2);
     }
     memcpy(key_zone, ra->keys, ra->size * sizeof(uint16_t));
     memcpy(typecode_zone, ra->typecodes, ra->size * sizeof(uint8_t));
     // Header: container count in the upper 17 bits, cookie in the low 15.
     uint32_t header = ((uint32_t)ra->size << 15) | FROZEN_COOKIE;
     memcpy(header_zone, &header, 4);
 }

 /**
  * Builds a read-only bitmap whose keys, typecodes and container payloads
  * point directly into `buf` (frozen layout); nothing from `buf` is copied.
  *
  * Returns NULL when `buf` is not 32-byte aligned, when the header cookie or
  * a typecode is invalid, when `length` does not match the size implied by
  * the metadata, or when the bookkeeping allocation fails.
  *
  * The bitmap struct, the container pointer array and all container structs
  * live in one malloc() block whose base address is the returned pointer.
  */
 const roaring_bitmap_t *
 roaring_bitmap_frozen_view(const char *buf, size_t length) {
     if ((uintptr_t)buf % 32 != 0) {
         return NULL;
     }

     // cookie and num_containers
     if (length < 4) {
         return NULL;
     }
     uint32_t header;
     memcpy(&header, buf + length - 4, 4); // header may be misaligned
     if ((header & 0x7FFF) != FROZEN_COOKIE) {
         return NULL;
     }
     int32_t num_containers = (header >> 15);

     // typecodes, counts and keys
     if (length < 4 + (size_t)num_containers * (1 + 2 + 2)) {
         return NULL;
     }
     uint16_t *keys = (uint16_t *)(buf + length - 4 - num_containers * 5);
     uint16_t *counts = (uint16_t *)(buf + length - 4 - num_containers * 3);
     uint8_t *typecodes = (uint8_t *)(buf + length - 4 - num_containers * 1);

     // {bitset,array,run}_zone
     int32_t num_bitset_containers = 0;
     int32_t num_run_containers = 0;
     int32_t num_array_containers = 0;
     size_t bitset_zone_size = 0;
     size_t run_zone_size = 0;
     size_t array_zone_size = 0;
     for (int32_t i = 0; i < num_containers; i++) {
         switch (typecodes[i]) {
             case BITSET_CONTAINER_TYPE_CODE:
                 num_bitset_containers++;
                 bitset_zone_size += BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t);
                 break;
             case RUN_CONTAINER_TYPE_CODE:
                 num_run_containers++;
                 run_zone_size += counts[i] * sizeof(rle16_t);
                 break;
             case ARRAY_CONTAINER_TYPE_CODE:
                 num_array_containers++;
                 array_zone_size += (counts[i] + UINT32_C(1)) * sizeof(uint16_t);
                 break;
             default:
                 return NULL;
         }
     }
     // The buffer must be exactly as long as the metadata says.
     if (length != bitset_zone_size + run_zone_size + array_zone_size +
                   5 * num_containers + 4) {
         return NULL;
     }
     uint64_t *bitset_zone = (uint64_t*) (buf);
     rle16_t *run_zone = (rle16_t*) (buf + bitset_zone_size);
     uint16_t *array_zone = (uint16_t*) (buf + bitset_zone_size + run_zone_size);

     size_t alloc_size = 0;
     alloc_size += sizeof(roaring_bitmap_t);
     alloc_size += num_containers * sizeof(void *);
     alloc_size += num_bitset_containers * sizeof(bitset_container_t);
     alloc_size += num_run_containers * sizeof(run_container_t);
     alloc_size += num_array_containers * sizeof(array_container_t);

     char *arena = (char *)malloc(alloc_size);
     if (arena == NULL) {
         return NULL;
     }

     // rb is carved first, so it is also the base address of the allocation.
     roaring_bitmap_t *rb = (roaring_bitmap_t *)
             arena_alloc(&arena, sizeof(roaring_bitmap_t));
     rb->high_low_container.flags = ROARING_FLAG_FROZEN;
     rb->high_low_container.allocation_size = num_containers;
     rb->high_low_container.size = num_containers;
     rb->high_low_container.keys = (uint16_t *)keys;
     rb->high_low_container.typecodes = (uint8_t *)typecodes;
     rb->high_low_container.containers =
             (void **)arena_alloc(&arena, sizeof(void*) * num_containers);
     for (int32_t i = 0; i < num_containers; i++) {
         switch (typecodes[i]) {
             case BITSET_CONTAINER_TYPE_CODE: {
                 bitset_container_t *bitset = (bitset_container_t *)
                         arena_alloc(&arena, sizeof(bitset_container_t));
                 bitset->array = bitset_zone;
                 bitset->cardinality = counts[i] + UINT32_C(1);
                 rb->high_low_container.containers[i] = bitset;
                 bitset_zone += BITSET_CONTAINER_SIZE_IN_WORDS;
                 break;
             }
             case RUN_CONTAINER_TYPE_CODE: {
                 run_container_t *run = (run_container_t *)
                         arena_alloc(&arena, sizeof(run_container_t));
                 run->capacity = counts[i];
                 run->n_runs = counts[i];
                 run->runs = run_zone;
                 rb->high_low_container.containers[i] = run;
                 run_zone += run->n_runs;
                 break;
             }
             case ARRAY_CONTAINER_TYPE_CODE: {
                 array_container_t *array = (array_container_t *)
                         arena_alloc(&arena, sizeof(array_container_t));
                 array->capacity = counts[i] + UINT32_C(1);
                 array->cardinality = counts[i] + UINT32_C(1);
                 array->array = array_zone;
                 rb->high_low_container.containers[i] = array;
                 array_zone += counts[i] + UINT32_C(1);
                 break;
             }
             default:
                 // `arena` has been advanced past the start of the block by
                 // now, so it must not be handed to free(); rb still holds
                 // the address malloc() returned.
                 free(rb);
                 return NULL;
         }
     }

     return rb;
 }
 /* end file src/roaring.c */
 /* begin file src/roaring_array.c */
 #include <assert.h>
 #include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <inttypes.h>


 // Convention: [0,ra->size) all elements are initialized
 //  [ra->size, ra->allocation_size) is junk and contains nothing needing freeing

 /*
  * Resizes the backing storage of `ra` to hold `new_capacity` entries.
  *
  * Containers, keys and typecodes live in ONE allocation (in that order), so
  * plain realloc() cannot be used: a fresh block is allocated, the first
  * ra->size entries of each array are copied over, and the old block is
  * freed. A capacity of 0 releases the storage entirely.
  *
  * Returns false (leaving `ra` untouched) if the allocation fails.
  */
 static bool realloc_array(roaring_array_t *ra, int32_t new_capacity) {
     if (new_capacity == 0) {
         free(ra->containers);
         ra->typecodes = NULL;
         ra->keys = NULL;
         ra->containers = NULL;
         ra->allocation_size = 0;
         return true;
     }

     const size_t bytes_per_entry =
         sizeof(uint16_t) + sizeof(void *) + sizeof(uint8_t);
     const size_t total_bytes = new_capacity * bytes_per_entry;
     void *fresh = malloc(total_bytes);
     if (fresh == NULL) {
         return false;
     }

     // Lay out the three arrays back to back inside the new block.
     void *stale = ra->containers;
     void **fresh_containers = (void **)fresh;
     uint16_t *fresh_keys = (uint16_t *)(fresh_containers + new_capacity);
     uint8_t *fresh_typecodes = (uint8_t *)(fresh_keys + new_capacity);
     assert((char *)(fresh_typecodes + new_capacity) ==
            (char *)fresh + total_bytes);

     if (ra->size > 0) {
         memcpy(fresh_containers, ra->containers, sizeof(void *) * ra->size);
         memcpy(fresh_keys, ra->keys, sizeof(uint16_t) * ra->size);
         memcpy(fresh_typecodes, ra->typecodes, sizeof(uint8_t) * ra->size);
     }

     ra->containers = fresh_containers;
     ra->keys = fresh_keys;
     ra->typecodes = fresh_typecodes;
     ra->allocation_size = new_capacity;
     free(stale);
     return true;
 }

 /*
  * Initializes `new_ra` as an empty array with room for `cap` entries.
  *
  * Returns false if `new_ra` is NULL, if `cap` exceeds INT32_MAX, or if the
  * allocation fails; in the latter two cases `new_ra` is left in the empty
  * state set up by ra_init(). A capacity of 0 allocates nothing.
  */
 bool ra_init_with_capacity(roaring_array_t *new_ra, uint32_t cap) {
     if (new_ra == NULL) {
         return false;
     }
     ra_init(new_ra);

     if (cap > INT32_MAX) {
         return false;
     }
     if (cap == 0) {
         return true;
     }

     // One block holds the containers, then the keys, then the typecodes.
     void *block =
         malloc(cap * (sizeof(uint16_t) + sizeof(void *) + sizeof(uint8_t)));
     if (block == NULL) {
         return false;
     }
     new_ra->containers = (void **)block;
     new_ra->keys = (uint16_t *)(new_ra->containers + cap);
     new_ra->typecodes = (uint8_t *)(new_ra->keys + cap);
     // Narrowing is safe: cap was checked against INT32_MAX above.
     new_ra->allocation_size = (int32_t)cap;
     return true;
 }

 /*
  * Shrinks the backing storage of `ra` to exactly ra->size entries.
  * Returns the number of bytes saved, or 0 if the reallocation failed.
  */
 int ra_shrink_to_fit(roaring_array_t *ra) {
     const int bytes_saved =
         (ra->allocation_size - ra->size) *
         (sizeof(uint16_t) + sizeof(void *) + sizeof(uint8_t));
     if (realloc_array(ra, ra->size)) {
         ra->allocation_size = ra->size;
         return bytes_saved;
     }
     return 0;
 }

 /*
  * Puts `new_ra` into the empty state: no storage, zero size and capacity,
  * no flags. Does nothing when `new_ra` is NULL. Nothing is freed, so any
  * storage the array previously referenced is not released here.
  */
 void ra_init(roaring_array_t *new_ra) {
     if (new_ra == NULL) {
         return;
     }
     new_ra->flags = 0;
     new_ra->size = 0;
     new_ra->allocation_size = 0;

     new_ra->containers = NULL;
     new_ra->keys = NULL;
     new_ra->typecodes = NULL;
 }

 /*
  * Initializes `dest` as a copy of `source`.
  *
  * With copy_on_write, every source container is passed through
  * get_copy_of_container() (the result is stored back into `source`) and
  * `dest` receives the same container pointers and typecodes. Without it,
  * each container is duplicated with container_clone().
  *
  * Returns false if the storage for `dest` cannot be set up or a clone fails;
  * in the latter case the clones made so far are freed and `dest` is emptied.
  */
 bool ra_copy(const roaring_array_t *source, roaring_array_t *dest,
              bool copy_on_write) {
     if (!ra_init_with_capacity(dest, source->size)) {
         return false;
     }
     dest->size = source->size;
     dest->allocation_size = source->size;
     const bool has_entries = dest->size > 0;
     if (has_entries) {
         memcpy(dest->keys, source->keys, dest->size * sizeof(uint16_t));
     }

     if (!copy_on_write) {
         if (has_entries) {
             memcpy(dest->typecodes, source->typecodes,
                    dest->size * sizeof(uint8_t));
         }
         for (int32_t idx = 0; idx < dest->size; idx++) {
             dest->containers[idx] =
                 container_clone(source->containers[idx], source->typecodes[idx]);
             if (dest->containers[idx] != NULL) {
                 continue;
             }
             // Clone failed: undo the clones made so far and give up.
             for (int32_t done = 0; done < idx; done++) {
                 container_free(dest->containers[done], dest->typecodes[done]);
             }
             ra_clear_without_containers(dest);
             return false;
         }
         return true;
     }

     // we go through the containers, turning them into shared containers...
     for (int32_t idx = 0; idx < dest->size; ++idx) {
         source->containers[idx] = get_copy_of_container(
             source->containers[idx], &source->typecodes[idx], copy_on_write);
     }
     // ...then do a shallow copy to the other bitmap
     if (has_entries) {
         memcpy(dest->containers, source->containers,
                dest->size * sizeof(void *));
         memcpy(dest->typecodes, source->typecodes,
                dest->size * sizeof(uint8_t));
     }
     return true;
 }

 /*
  * Replaces the contents of `dest` with a copy of `source`, reusing the
  * existing storage of `dest` when it is large enough.
  *
  * With copy_on_write, every source container is passed through
  * get_copy_of_container() (the result is stored back into `source`) and
  * `dest` receives the same container pointers and typecodes. Without it,
  * each container is duplicated with container_clone().
  *
  * Returns false if growing `dest` or cloning a container fails. On any
  * failure `dest` is left empty: it never refers to containers that have
  * already been freed.
  */
 bool ra_overwrite(const roaring_array_t *source, roaring_array_t *dest,
                   bool copy_on_write) {
     ra_clear_containers(dest);  // we are going to overwrite them
     // The old containers are gone; do not keep advertising them, so that a
     // failure below cannot leave dangling entries behind.
     dest->size = 0;
     if (source->size == 0) {
         // Nothing to copy. Returning here also avoids calling memcpy() with
         // pointers that may be NULL when no storage was ever allocated.
         return true;
     }
     if (dest->allocation_size < source->size) {
         if (!realloc_array(dest, source->size)) {
             return false;
         }
     }
     dest->size = source->size;
     memcpy(dest->keys, source->keys, dest->size * sizeof(uint16_t));
     // we go through the containers, turning them into shared containers...
     if (copy_on_write) {
         for (int32_t i = 0; i < dest->size; ++i) {
             source->containers[i] = get_copy_of_container(
                 source->containers[i], &source->typecodes[i], copy_on_write);
         }
         // we do a shallow copy to the other bitmap
         memcpy(dest->containers, source->containers,
                dest->size * sizeof(void *));
         memcpy(dest->typecodes, source->typecodes,
                dest->size * sizeof(uint8_t));
     } else {
         memcpy(dest->typecodes, source->typecodes,
                dest->size * sizeof(uint8_t));
         for (int32_t i = 0; i < dest->size; i++) {
             dest->containers[i] =
                 container_clone(source->containers[i], source->typecodes[i]);
             if (dest->containers[i] == NULL) {
                 // Clone failed: free the clones made so far and empty dest.
                 for (int32_t j = 0; j < i; j++) {
                     container_free(dest->containers[j], dest->typecodes[j]);
                 }
                 ra_clear_without_containers(dest);
                 return false;
             }
         }
     }
     return true;
 }

 /*
  * Frees every container stored in `ra`. The array storage itself and
  * ra->size are left untouched, so the slots hold stale pointers afterwards.
  */
 void ra_clear_containers(roaring_array_t *ra) {
     int32_t slot = 0;
     while (slot < ra->size) {
         container_free(ra->containers[slot], ra->typecodes[slot]);
         ++slot;
     }
 }

 /*
  * Empties `ra`: frees all containers, sets the size to 0 and then shrinks
  * the backing storage to fit the (now zero) size.
  */
 void ra_reset(roaring_array_t *ra) {
     ra_clear_containers(ra);
     ra->size = 0;
     (void)ra_shrink_to_fit(ra);  // the byte count it returns is not needed
 }

 /*
  * Releases the array storage of `ra` and resets it to the empty state,
  * WITHOUT freeing the containers the slots pointed to.
  */
 void ra_clear_without_containers(roaring_array_t *ra) {
     // keys and typecodes share the allocation that starts at containers,
     // so a single free() releases all three arrays
     free(ra->containers);
     ra->containers = NULL;
     ra->keys = NULL;
     ra->typecodes = NULL;
     ra->allocation_size = 0;
     ra->size = 0;
 }

 /*
  * Fully tears down `ra`: first frees every container, then releases the
  * array storage and resets the fields to the empty state.
  */
 void ra_clear(roaring_array_t *ra) {
     ra_clear_containers(ra);          // the containers themselves
     ra_clear_without_containers(ra);  // then the arrays that held them
 }

 /*
  * Makes sure `ra` has room for `k` more entries.
  *
  * When growth is needed, capacity doubles the required size while the array
  * holds fewer than 1024 entries and grows by a quarter beyond that, capped
  * at MAX_CONTAINERS. Returns false only if the reallocation fails.
  */
 bool extend_array(roaring_array_t *ra, int32_t k) {
     const int32_t needed = ra->size + k;
     assert(needed <= MAX_CONTAINERS);
     if (needed <= ra->allocation_size) {
         return true;  // already enough room
     }

     int32_t grown;
     if (ra->size < 1024) {
         grown = 2 * needed;
     } else {
         grown = 5 * needed / 4;
     }
     if (grown > MAX_CONTAINERS) {
         grown = MAX_CONTAINERS;
     }
     return realloc_array(ra, grown);
 }

 /*
  * Appends (key, container, typecode) as a new last entry of `ra`. The
  * container pointer is stored as given, not copied.
  * Note: the result of extend_array() is not checked here.
  */
 void ra_append(roaring_array_t *ra, uint16_t key, void *container,
                uint8_t typecode) {
     extend_array(ra, 1);

     const int32_t slot = ra->size;
     ra->typecodes[slot] = typecode;
     ra->containers[slot] = container;
     ra->keys[slot] = key;
     ra->size = slot + 1;
 }

 /*
  * Appends a copy of entry `index` of `sa` to `ra`.
  *
  * With copy_on_write the source container is passed through
  * get_copy_of_container() (the result is stored back into `sa`) and `ra`
  * receives that same pointer; otherwise `ra` gets a container_clone().
  * Note: the result of extend_array() is not checked here.
  */
 void ra_append_copy(roaring_array_t *ra, const roaring_array_t *sa,
                     uint16_t index, bool copy_on_write) {
     extend_array(ra, 1);
     const int32_t slot = ra->size;

     // whatever was in this slot before is junk that needs no freeing
     ra->keys[slot] = sa->keys[index];
     if (!copy_on_write) {
         ra->containers[slot] =
             container_clone(sa->containers[index], sa->typecodes[index]);
     } else {
         // the shared container will be in two bitmaps
         sa->containers[index] = get_copy_of_container(
             sa->containers[index], &sa->typecodes[index], copy_on_write);
         ra->containers[slot] = sa->containers[index];
     }
     // Read the typecode last: the copy-on-write path may have updated it.
     ra->typecodes[slot] = sa->typecodes[index];
     ra->size = slot + 1;
 }

 /*
  * Appends to `ra` copies of the leading entries of `sa`, stopping at the
  * first entry whose key is greater than or equal to `stopping_key`.
  */
 void ra_append_copies_until(roaring_array_t *ra, const roaring_array_t *sa,
                             uint16_t stopping_key, bool copy_on_write) {
     int32_t idx = 0;
     while (idx < sa->size && sa->keys[idx] < stopping_key) {
         ra_append_copy(ra, sa, idx, copy_on_write);
         ++idx;
     }
 }

 /*
  * Append entries [start_index, end_index) of `sa` to `ra`.
  *
  * Room for the whole range is requested up front. Each entry is then
  * either shared (copy_on_write: the source slot is replaced by the result
  * of get_copy_of_container() and that pointer is stored in `ra`) or
  * duplicated with container_clone().
  */
 void ra_append_copy_range(roaring_array_t *ra, const roaring_array_t *sa,
                           int32_t start_index, int32_t end_index,
                           bool copy_on_write) {
     extend_array(ra, end_index - start_index);

     int32_t src = start_index;
     while (src < end_index) {
         const int32_t dst = ra->size;

         ra->keys[dst] = sa->keys[src];
         if (!copy_on_write) {
             ra->containers[dst] =
                 container_clone(sa->containers[src], sa->typecodes[src]);
         } else {
             sa->containers[src] = get_copy_of_container(
                 sa->containers[src], &sa->typecodes[src], copy_on_write);
             ra->containers[dst] = sa->containers[src];
         }
         // read after the branch: the shared path may rewrite the typecode
         ra->typecodes[dst] = sa->typecodes[src];

         ra->size++;
         ++src;
     }
 }

 /*
  * Append to `ra` copies of the entries of `sa` located after the position
  * that ra_get_index() reports for `before_start`.
  *
  * A non-negative lookup result r means copying starts at r + 1; a negative
  * result r is decoded as -r - 1 and copying starts there. Copying always
  * runs to the end of `sa`.
  */
 void ra_append_copies_after(roaring_array_t *ra, const roaring_array_t *sa,
                             uint16_t before_start, bool copy_on_write) {
     const int found = ra_get_index(sa, before_start);
     const int first = (found >= 0) ? found + 1 : -found - 1;

     ra_append_copy_range(ra, sa, first, sa->size, copy_on_write);
 }

 /*
  * Append entries [start_index, end_index) of `sa` to `ra` by copying the
  * key, container pointer and typecode of each entry. Containers are not
  * duplicated and `sa` itself is left unmodified by this function.
  */
 void ra_append_move_range(roaring_array_t *ra, roaring_array_t *sa,
                           int32_t start_index, int32_t end_index) {
     extend_array(ra, end_index - start_index);

     int32_t src = start_index;
     while (src < end_index) {
         const int32_t dst = ra->size;

         ra->typecodes[dst] = sa->typecodes[src];
         ra->containers[dst] = sa->containers[src];
         ra->keys[dst] = sa->keys[src];

         ra->size = dst + 1;
         ++src;
     }
 }

 /*
  * Append entries [start_index, end_index) of `sa` to `ra`.
  *
  * With copy_on_write each source slot is replaced by the result of
  * get_copy_of_container() and the same pointer is stored in `ra`;
  * otherwise `ra` receives the result of container_clone().
  */
 void ra_append_range(roaring_array_t *ra, roaring_array_t *sa,
                      int32_t start_index, int32_t end_index,
                      bool copy_on_write) {
     extend_array(ra, end_index - start_index);

     int32_t src = start_index;
     while (src < end_index) {
         const int32_t dst = ra->size;

         ra->keys[dst] = sa->keys[src];
         if (!copy_on_write) {
             ra->containers[dst] =
                 container_clone(sa->containers[src], sa->typecodes[src]);
         } else {
             sa->containers[src] = get_copy_of_container(
                 sa->containers[src], &sa->typecodes[src], copy_on_write);
             ra->containers[dst] = sa->containers[src];
         }
         // read after the branch: the shared path may rewrite the typecode
         ra->typecodes[dst] = sa->typecodes[src];

         ra->size++;
         ++src;
     }
 }

 /* Return the key stored at position `i` of `ra`; no bounds check is made. */
 uint16_t ra_get_key_at_index(const roaring_array_t *ra, uint16_t i) {
     const uint16_t key = ra->keys[i];
     return key;
 }

 // Starting at `pos`, skip every entry whose key is smaller than `x`,
 // calling container_free() on each skipped container. Returns the first
 // position that was not skipped (possibly ra->size). The array's size and
 // slots are left as they are.
 int32_t ra_advance_until_freeing(roaring_array_t *ra, uint16_t x, int32_t pos) {
     for (; pos < ra->size; ++pos) {
         if (ra->keys[pos] >= x) {
             break;
         }
         container_free(ra->containers[pos], ra->typecodes[pos]);
     }
     return pos;
 }

 /*
  * Insert (key, container, typecode) at position `i`, shifting the entries
  * currently at [i, ra->size) one slot to the right in all three parallel
  * arrays, and increment ra->size.
  */
 void ra_insert_new_key_value_at(roaring_array_t *ra, int32_t i, uint16_t key,
                                 void *container, uint8_t typecode) {
     extend_array(ra, 1);

     // number of entries that have to slide right by one
     const size_t tail = (size_t)(ra->size - i);

     memmove(ra->keys + i + 1, ra->keys + i, tail * sizeof(uint16_t));
     memmove(ra->containers + i + 1, ra->containers + i,
             tail * sizeof(void *));
     memmove(ra->typecodes + i + 1, ra->typecodes + i,
             tail * sizeof(uint8_t));

     ra->typecodes[i] = typecode;
     ra->containers[i] = container;
     ra->keys[i] = key;
     ra->size++;
 }

 // Shrink the logical length of the array to `new_length`.
 //
 // Background: the Java routine zeroed the dropped slots to enable GC and
 // was called "resize", although it was only ever used to downsize.
 // Growing is not allowed here because it would break the convention that
 // every slot below ra->size holds a valid container.
 //
 // Only ra->size is changed; nothing is freed by this function.

 void ra_downsize(roaring_array_t *ra, int32_t new_length) {
     assert(ra->size >= new_length);
     ra->size = new_length;
 }

 /*
  * Remove the entry at position `i` by sliding every later entry one slot
  * to the left in all three parallel arrays, then decrement ra->size.
  * The removed container is not freed here.
  */
 void ra_remove_at_index(roaring_array_t *ra, int32_t i) {
     // number of entries located after position i
     const size_t tail = (size_t)(ra->size - i - 1);

     memmove(ra->containers + i, ra->containers + i + 1,
             tail * sizeof(void *));
     memmove(ra->keys + i, ra->keys + i + 1, tail * sizeof(uint16_t));
     memmove(ra->typecodes + i, ra->typecodes + i + 1,
             tail * sizeof(uint8_t));

     ra->size--;
 }

 /*
  * Free the container stored at position `i` with container_free(), then
  * remove that entry from the array via ra_remove_at_index().
  */
 void ra_remove_at_index_and_free(roaring_array_t *ra, int32_t i) {
     void *doomed = ra->containers[i];
     container_free(doomed, ra->typecodes[i]);
     ra_remove_at_index(ra, i);
 }

 // Slide the entries [begin, end) left so that they start at `new_begin`
 // (which must be smaller than `begin`).
 //
 // Used in inplace andNot only, to move the containers of the mutated
 // RoaringBitmap that come after the largest container of the argument
 // RoaringBitmap. In use it should be followed by a call to downsize:
 // ra->size is not touched here.
 //
 void ra_copy_range(roaring_array_t *ra, uint32_t begin, uint32_t end,
                    uint32_t new_begin) {
     assert(begin <= end);
     assert(new_begin < begin);

     const int moved = end - begin;

     // The caller is expected to have already freed any overwritten
     // container that is not copied elsewhere.

     memmove(ra->containers + new_begin, ra->containers + begin,
             sizeof(void *) * moved);
     memmove(ra->keys + new_begin, ra->keys + begin,
             sizeof(uint16_t) * moved);
     memmove(ra->typecodes + new_begin, ra->typecodes + begin,
             sizeof(uint8_t) * moved);
 }

 /*
  * Move the last `count` entries of `ra` by `distance` slots (to the right
  * when positive, to the left when negative) and adjust ra->size by
  * `distance`. Extra room is requested only when shifting to the right.
  */
 void ra_shift_tail(roaring_array_t *ra, int32_t count, int32_t distance) {
     if (distance > 0) {
         extend_array(ra, distance);
     }

     const int32_t from = ra->size - count;
     const int32_t to = from + distance;

     memmove(ra->keys + to, ra->keys + from, sizeof(uint16_t) * count);
     memmove(ra->containers + to, ra->containers + from,
             sizeof(void *) * count);
     memmove(ra->typecodes + to, ra->typecodes + from,
             sizeof(uint8_t) * count);

     ra->size += distance;
 }


 /*
  * Expand every container of `ra`, in index order, into `ans`.
  *
  * Each container is written by container_to_uint32_array() with its key
  * shifted into the upper 16 bits as the base value; the write position
  * advances by the count that call returns. The caller provides a buffer
  * that is large enough; no bound is checked here.
  */
 void ra_to_uint32_array(const roaring_array_t *ra, uint32_t *ans) {
     size_t written = 0;
     int32_t i = 0;
     while (i < ra->size) {
         const uint32_t base = ((uint32_t)ra->keys[i]) << 16;
         const int added = container_to_uint32_array(
             ans + written, ra->containers[i], ra->typecodes[i], base);
         written += added;
         ++i;
     }
 }

 /*
  * Copy `limit` values into `ans`, starting at the `offset`-th value of the
  * array (values are counted across containers in index order).
  *
  * Containers overlapping the requested window are expanded in full into a
  * zero-initialised scratch buffer, from which `limit` entries are finally
  * copied to `ans`. If no container overlaps the window, `ans` is left
  * untouched.
  *
  * Returns false only when a scratch allocation fails, true otherwise.
  */
 bool ra_range_uint32_array(const roaring_array_t *ra, size_t offset, size_t limit, uint32_t *ans) {
     size_t seen = 0;         // values contained in the containers visited so far
     size_t buffered = 0;     // values already expanded into the scratch buffer
     size_t card = 0;         // cardinality of the current container

     bool started = false;    // has the first overlapping container been met?
     size_t skip = 0;         // leading scratch values that precede `offset`

     uint32_t *scratch = NULL;
     size_t scratch_len = 0;

     for (int i = 0; i < ra->size; ++i) {
         // the typecode slot is handed to the unwrap helper by address and
         // is read back from the array afterwards
         const void *container = container_unwrap_shared(ra->containers[i], &ra->typecodes[i]);
         const uint8_t type = ra->typecodes[i];

         switch (type) {
             case BITSET_CONTAINER_TYPE_CODE:
                 card = ((const bitset_container_t *)container)->cardinality;
                 break;
             case ARRAY_CONTAINER_TYPE_CODE:
                 card = ((const array_container_t *)container)->cardinality;
                 break;
             case RUN_CONTAINER_TYPE_CODE:
                 card = run_container_cardinality((const run_container_t *)container);
                 break;
             case SHARED_CONTAINER_TYPE_CODE:
             default:
                 __builtin_unreachable();
         }

         const bool overlaps =
             (seen + card - 1 >= offset) && (seen < offset + limit);
         if (overlaps) {
             if (!started) {
                 started = true;
                 skip = offset - seen;
                 scratch_len = skip + limit;
                 scratch = (uint32_t *)malloc(sizeof(*scratch) * scratch_len);
                 if (scratch == NULL) {
                     return false;
                 }
                 memset(scratch, 0, sizeof(*scratch) * scratch_len);
             }
             if (buffered + card > scratch_len) {
                 // not enough room for this container: move to a larger,
                 // zeroed buffer and carry over what was expanded so far
                 const size_t bigger_len = scratch_len + card;
                 uint32_t *bigger = (uint32_t *)malloc(sizeof(*bigger) * bigger_len);
                 if (bigger == NULL) {
                     free(scratch);
                     return false;
                 }
                 memset(bigger, 0, sizeof(*bigger) * bigger_len);
                 memcpy(bigger, scratch, buffered * sizeof(uint32_t));
                 free(scratch);
                 scratch = bigger;
                 scratch_len = bigger_len;
             }
             // `type` is one of the three concrete codes accepted above
             container_to_uint32_array(scratch + buffered, container, type,
                                       ((uint32_t)ra->keys[i]) << 16);
             buffered += card;
         }

         seen += card;
         if (buffered - skip >= limit) {
             break;  // the window is fully covered
         }
     }

     if (scratch != NULL) {
         memcpy(ans, scratch + skip, limit * sizeof(uint32_t));
         free(scratch);
     }
     return true;
 }

 /*
  * Report whether any entry of `ra` has RUN_CONTAINER_TYPE_CODE as the type
  * returned by get_container_type().
  */
 bool ra_has_run_container(const roaring_array_t *ra) {
     int32_t k = 0;
     while (k < ra->size) {
         if (get_container_type(ra->containers[k], ra->typecodes[k]) ==
             RUN_CONTAINER_TYPE_CODE) {
             return true;
         }
         ++k;
     }
     return false;
 }

 /*
  * Size in bytes of the portable-format header for `ra`.
  *
  * Without run containers: 4 + 4 + 8 bytes per container.
  * With run containers: 4 bytes (the size is packed with the cookie), a
  * run bitmap of (size + 7) / 8 bytes, then 4 bytes per container when
  * size < NO_OFFSET_THRESHOLD (offsets omitted) or 8 bytes per container
  * otherwise.
  */
 uint32_t ra_portable_header_size(const roaring_array_t *ra) {
     if (!ra_has_run_container(ra)) {
         return 4 + 4 + 8 * ra->size;
     }

     const int32_t run_bitmap_bytes = (ra->size + 7) / 8;
     if (ra->size < NO_OFFSET_THRESHOLD) {
         // for small bitmaps, we omit the offsets
         return 4 + run_bitmap_bytes + 4 * ra->size;
     }
     return 4 + run_bitmap_bytes + 8 * ra->size;
 }

 /*
  * Sum of ra_portable_header_size() and container_size_in_bytes() of every
  * container stored in `ra`.
  */
 size_t ra_portable_size_in_bytes(const roaring_array_t *ra) {
     size_t total = ra_portable_header_size(ra);

     int32_t k = 0;
     while (k < ra->size) {
         total += container_size_in_bytes(ra->containers[k], ra->typecodes[k]);
         ++k;
     }
     return total;
 }

 /*
  * Write `ra` to `buf` in the portable serialized layout and return the
  * number of bytes written. No bound is checked: the caller supplies a
  * buffer that is large enough.
  *
  * Layout produced, in order:
  *   - cookie: when a run container is present, SERIAL_COOKIE with
  *     (size - 1) packed in the upper 16 bits; otherwise
  *     SERIAL_COOKIE_NO_RUNCONTAINER followed by the size;
  *   - only with run containers: a bitmap of (size + 7) / 8 bytes whose
  *     bit i is set when container i is a run container;
  *   - for each container: its 16-bit key, then 16-bit (cardinality - 1);
  *   - the start offset of each container, omitted when run containers are
  *     present and size < NO_OFFSET_THRESHOLD;
  *   - each container as emitted by container_write().
  */
 size_t ra_portable_serialize(const roaring_array_t *ra, char *buf) {
     char *initbuf = buf;
     uint32_t startOffset = 0;
     bool hasrun = ra_has_run_container(ra);
     if (hasrun) {
         // Shift in unsigned arithmetic: with more than 32768 containers a
         // signed (size - 1) << 16 would overflow int.
         uint32_t cookie = SERIAL_COOKIE | ((uint32_t)(ra->size - 1) << 16);
         memcpy(buf, &cookie, sizeof(cookie));
         buf += sizeof(cookie);

         // The run bitmap is assembled directly in the output buffer, so no
         // temporary allocation (and no allocation failure) is involved.
         uint32_t s = (ra->size + 7) / 8;
         uint8_t *bitmapOfRunContainers = (uint8_t *)buf;
         memset(bitmapOfRunContainers, 0, s);
         for (int32_t i = 0; i < ra->size; ++i) {
             if (get_container_type(ra->containers[i], ra->typecodes[i]) ==
                 RUN_CONTAINER_TYPE_CODE) {
                 bitmapOfRunContainers[i / 8] |= (uint8_t)(1 << (i % 8));
             }
         }
         buf += s;

         if (ra->size < NO_OFFSET_THRESHOLD) {
             startOffset = 4 + 4 * ra->size + s;
         } else {
             startOffset = 4 + 8 * ra->size + s;
         }
     } else {  // backwards compatibility
         uint32_t cookie = SERIAL_COOKIE_NO_RUNCONTAINER;

         memcpy(buf, &cookie, sizeof(cookie));
         buf += sizeof(cookie);
         memcpy(buf, &ra->size, sizeof(ra->size));
         buf += sizeof(ra->size);

         startOffset = 4 + 4 + 4 * ra->size + 4 * ra->size;
     }
     for (int32_t k = 0; k < ra->size; ++k) {
         memcpy(buf, &ra->keys[k], sizeof(ra->keys[k]));
         buf += sizeof(ra->keys[k]);
         // get_cardinality returns a value in [1,1<<16], subtracting one
         // we get [0,1<<16 - 1] which fits in 16 bits
         uint16_t card = (uint16_t)(
             container_get_cardinality(ra->containers[k], ra->typecodes[k]) - 1);
         memcpy(buf, &card, sizeof(card));
         buf += sizeof(card);
     }
     if ((!hasrun) || (ra->size >= NO_OFFSET_THRESHOLD)) {
         // writing the containers offsets
         for (int32_t k = 0; k < ra->size; k++) {
             memcpy(buf, &startOffset, sizeof(startOffset));
             buf += sizeof(startOffset);
             startOffset =
                 startOffset +
                 container_size_in_bytes(ra->containers[k], ra->typecodes[k]);
         }
     }
     for (int32_t k = 0; k < ra->size; ++k) {
         buf += container_write(ra->containers[k], ra->typecodes[k], buf);
     }
     return buf - initbuf;
 }

 // Quickly checks whether there is a serialized bitmap at the pointer,
 // not exceeding size "maxbytes" in bytes. This function does not allocate
 // memory dynamically.
 //
 // This function returns 0 if and only if no valid bitmap is found.
 // Otherwise, it returns how many bytes are occupied.
 //
 // The container count read from the buffer is untrusted: a count that is
 // negative or larger than 1<<16 is rejected before it takes part in any
 // size computation. The key/cardinality table is only ever read through
 // memcpy on byte pointers, so the input needs no particular alignment.
 //
 size_t ra_portable_deserialize_size(const char *buf, const size_t maxbytes) {
     size_t bytestotal = sizeof(int32_t);// for cookie
     if(bytestotal > maxbytes) return 0;
     uint32_t cookie;
     memcpy(&cookie, buf, sizeof(int32_t));
     buf += sizeof(uint32_t);
     if ((cookie & 0xFFFF) != SERIAL_COOKIE &&
         cookie != SERIAL_COOKIE_NO_RUNCONTAINER) {
         return 0;
     }
     int32_t size;

     if ((cookie & 0xFFFF) == SERIAL_COOKIE)
         size = (cookie >> 16) + 1;
     else {
         bytestotal += sizeof(int32_t);
         if(bytestotal > maxbytes) return 0;
         memcpy(&size, buf, sizeof(int32_t));
         buf += sizeof(uint32_t);
     }
     if (size < 0) {
        // a negative count would wrap around in the size_t arithmetic below
        return 0;
     }
     if (size > (1<<16)) {
        return 0; // logically impossible
     }
     const char *bitmapOfRunContainers = NULL;
     bool hasrun = (cookie & 0xFFFF) == SERIAL_COOKIE;
     if (hasrun) {
         int32_t s = (size + 7) / 8;
         bytestotal += s;
         if(bytestotal > maxbytes) return 0;
         bitmapOfRunContainers = buf;
         buf += s;
     }
     bytestotal += (size_t)size * 2 * sizeof(uint16_t);
     if(bytestotal > maxbytes) return 0;
     // interleaved (key, cardinality - 1) pairs, 16 bits each
     const char *keyscards = buf;
     buf += (size_t)size * 2 * sizeof(uint16_t);
     if ((!hasrun) || (size >= NO_OFFSET_THRESHOLD)) {
         // skipping the offsets
         bytestotal += (size_t)size * 4;
         if(bytestotal > maxbytes) return 0;
         buf += (size_t)size * 4;
     }
     // Reading the containers
     for (int32_t k = 0; k < size; ++k) {
         uint16_t tmp;
         // second half of pair k: the stored cardinality minus one
         memcpy(&tmp, keyscards + (2 * (size_t)k + 1) * sizeof(uint16_t),
                sizeof(tmp));
         uint32_t thiscard = tmp + 1;
         bool isbitmap = (thiscard > DEFAULT_MAX_SIZE);
         bool isrun = false;
         if(hasrun) {
           if((bitmapOfRunContainers[k / 8] & (1 << (k % 8))) != 0) {
             isbitmap = false;
             isrun = true;
           }
         }
         if (isbitmap) {
             size_t containersize = BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t);
             bytestotal += containersize;
             if(bytestotal > maxbytes) return 0;
             buf += containersize;
         } else if (isrun) {
             bytestotal += sizeof(uint16_t);
             if(bytestotal > maxbytes) return 0;
             uint16_t n_runs;
             memcpy(&n_runs, buf, sizeof(uint16_t));
             buf += sizeof(uint16_t);
             size_t containersize = n_runs * sizeof(rle16_t);
             bytestotal += containersize;
             if(bytestotal > maxbytes) return 0;
             buf += containersize;
         } else {
             size_t containersize = thiscard * sizeof(uint16_t);
             bytestotal += containersize;
             if(bytestotal > maxbytes) return 0;
             buf += containersize;
         }
     }
     return bytestotal;
 }


 // this function populates answer from the content of buf (reading up to maxbytes bytes).
 // The function returns false if a properly serialized bitmap cannot be found.
 // if it returns true, readbytes is populated by how many bytes were read, we have that *readbytes <= maxbytes.
 //
 // The container count read from the buffer is untrusted: a count that is
 // negative or larger than 1<<16 is rejected before anything is allocated.
 // Every failure prints a diagnostic on stderr; failures that happen after
 // the roaring array has been initialized release it with ra_clear().
 bool ra_portable_deserialize(roaring_array_t *answer, const char *buf, const size_t maxbytes, size_t * readbytes) {
     *readbytes = sizeof(int32_t);// for cookie
     if(*readbytes > maxbytes) {
       fprintf(stderr, "Ran out of bytes while reading first 4 bytes.\n");
       return false;
     }
     uint32_t cookie;
     memcpy(&cookie, buf, sizeof(int32_t));
     buf += sizeof(uint32_t);
     if ((cookie & 0xFFFF) != SERIAL_COOKIE &&
         cookie != SERIAL_COOKIE_NO_RUNCONTAINER) {
         fprintf(stderr, "I failed to find one of the right cookies. Found %" PRIu32 "\n",
                 cookie);
         return false;
     }
     int32_t size;

     if ((cookie & 0xFFFF) == SERIAL_COOKIE)
         size = (cookie >> 16) + 1;
     else {
         *readbytes += sizeof(int32_t);
         if(*readbytes > maxbytes) {
           fprintf(stderr, "Ran out of bytes while reading second part of the cookie.\n");
           return false;
         }
         memcpy(&size, buf, sizeof(int32_t));
         buf += sizeof(uint32_t);
     }
     if (size < 0) {
        // a negative count would wrap around in the size_t arithmetic below
        // and be handed to the allocator
        fprintf(stderr, "You cannot have a negative number of containers, the data must be corrupted: %" PRId32 "\n",
                 size);
        return false; // logically impossible
     }
     if (size > (1<<16)) {
        fprintf(stderr, "You cannot have so many containers, the data must be corrupted: %" PRId32 "\n",
                 size);
        return false; // logically impossible
     }
     const char *bitmapOfRunContainers = NULL;
     bool hasrun = (cookie & 0xFFFF) == SERIAL_COOKIE;
     if (hasrun) {
         int32_t s = (size + 7) / 8;
         *readbytes += s;
         if(*readbytes > maxbytes) {// data is corrupted?
           fprintf(stderr, "Ran out of bytes while reading run bitmap.\n");
           return false;
         }
         bitmapOfRunContainers = buf;
         buf += s;
     }
     // interleaved (key, cardinality - 1) pairs, 16 bits each; only read
     // through memcpy so the input needs no particular alignment
     const char *keyscards = buf;

     *readbytes += (size_t)size * 2 * sizeof(uint16_t);
     if(*readbytes > maxbytes) {
       fprintf(stderr, "Ran out of bytes while reading key-cardinality array.\n");
       return false;
     }
     buf += (size_t)size * 2 * sizeof(uint16_t);

     bool is_ok = ra_init_with_capacity(answer, size);
     if (!is_ok) {
         fprintf(stderr, "Failed to allocate memory for roaring array. Bailing out.\n");
         return false;
     }

     for (int32_t k = 0; k < size; ++k) {
         uint16_t tmp;
         // first half of pair k: the key
         memcpy(&tmp, keyscards + (2 * (size_t)k) * sizeof(uint16_t),
                sizeof(tmp));
         answer->keys[k] = tmp;
     }
     if ((!hasrun) || (size >= NO_OFFSET_THRESHOLD)) {
         *readbytes += (size_t)size * 4;
         if(*readbytes > maxbytes) {// data is corrupted?
           fprintf(stderr, "Ran out of bytes while reading offsets.\n");
           ra_clear(answer);// we need to clear the containers already allocated, and the roaring array
           return false;
         }

         // skipping the offsets
         buf += (size_t)size * 4;
     }
     // Reading the containers
     for (int32_t k = 0; k < size; ++k) {
         uint16_t tmp;
         // second half of pair k: the stored cardinality minus one
         memcpy(&tmp, keyscards + (2 * (size_t)k + 1) * sizeof(uint16_t),
                sizeof(tmp));
         uint32_t thiscard = tmp + 1;
         bool isbitmap = (thiscard > DEFAULT_MAX_SIZE);
         bool isrun = false;
         if(hasrun) {
           if((bitmapOfRunContainers[k / 8] & (1 << (k % 8))) != 0) {
             isbitmap = false;
             isrun = true;
           }
         }
         if (isbitmap) {
             // we check that the read is allowed
             size_t containersize = BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t);
             *readbytes += containersize;
             if(*readbytes > maxbytes) {
               fprintf(stderr, "Running out of bytes while reading a bitset container.\n");
               ra_clear(answer);// we need to clear the containers already allocated, and the roaring array
               return false;
             }
             // it is now safe to read
             bitset_container_t *c = bitset_container_create();
             if(c == NULL) {// memory allocation failure
               fprintf(stderr, "Failed to allocate memory for a bitset container.\n");
               ra_clear(answer);// we need to clear the containers already allocated, and the roaring array
               return false;
             }
             answer->size++;
             buf += bitset_container_read(thiscard, c, buf);
             answer->containers[k] = c;
             answer->typecodes[k] = BITSET_CONTAINER_TYPE_CODE;
         } else if (isrun) {
             // we check that the read is allowed
             *readbytes += sizeof(uint16_t);
             if(*readbytes > maxbytes) {
               fprintf(stderr, "Running out of bytes while reading a run container (header).\n");
               ra_clear(answer);// we need to clear the containers already allocated, and the roaring array
               return false;
             }
             // buf is not advanced past the run count here: the count is
             // only peeked at to validate the length before
             // run_container_read() consumes the container
             uint16_t n_runs;
             memcpy(&n_runs, buf, sizeof(uint16_t));
             size_t containersize = n_runs * sizeof(rle16_t);
             *readbytes += containersize;
             if(*readbytes > maxbytes) {// data is corrupted?
               fprintf(stderr, "Running out of bytes while reading a run container.\n");
               ra_clear(answer);// we need to clear the containers already allocated, and the roaring array
               return false;
             }
             // it is now safe to read

             run_container_t *c = run_container_create();
             if(c == NULL) {// memory allocation failure
               fprintf(stderr, "Failed to allocate memory for a run container.\n");
               ra_clear(answer);// we need to clear the containers already allocated, and the roaring array
               return false;
             }
             answer->size++;
             buf += run_container_read(thiscard, c, buf);
             answer->containers[k] = c;
             answer->typecodes[k] = RUN_CONTAINER_TYPE_CODE;
         } else {
             // we check that the read is allowed
             size_t containersize = thiscard * sizeof(uint16_t);
             *readbytes += containersize;
             if(*readbytes > maxbytes) {// data is corrupted?
               fprintf(stderr, "Running out of bytes while reading an array container.\n");
               ra_clear(answer);// we need to clear the containers already allocated, and the roaring array
               return false;
             }
             // it is now safe to read
             array_container_t *c =
                 array_container_create_given_capacity(thiscard);
             if(c == NULL) {// memory allocation failure
               fprintf(stderr, "Failed to allocate memory for an array container.\n");
               ra_clear(answer);// we need to clear the containers already allocated, and the roaring array
               return false;
             }
             answer->size++;
             buf += array_container_read(thiscard, c, buf);
             answer->containers[k] = c;
             answer->typecodes[k] = ARRAY_CONTAINER_TYPE_CODE;
         }
     }
     return true;
 }
 /* end file src/roaring_array.c */
 /* begin file src/roaring_priority_queue.c */

 /* One heap entry: a bitmap together with the size used to order it. */
 typedef struct roaring_pq_element_s {
     uint64_t size;             /* ordering key (smaller sorts first) */
     bool is_temporary;         /* flag set by the code that fills the entry */
     roaring_bitmap_t *bitmap;  /* the bitmap this entry refers to */
 } roaring_pq_element_t;

 /* Binary heap of roaring_pq_element_t stored in a flat array. */
 typedef struct roaring_pq_s {
     roaring_pq_element_t *elements;  /* heap storage */
     uint64_t size;                   /* number of entries currently held */
 } roaring_pq_t;

 /* Heap ordering: true when `t1` has a strictly smaller size than `t2`. */
 static inline bool compare(roaring_pq_element_t *t1, roaring_pq_element_t *t2) {
     return t2->size > t1->size;
 }

 /*
  * Insert a copy of `*t` into the heap.
  *
  * The new entry starts in the first free slot and moves up: each parent
  * that does not sort before it is pulled down one level, and the entry is
  * finally stored in the hole that remains. The storage is not grown here.
  */
 static void pq_add(roaring_pq_t *pq, roaring_pq_element_t *t) {
     uint64_t hole = pq->size;
     pq->elements[hole] = *t;
     pq->size = hole + 1;

     while (hole != 0) {
         const uint64_t parent = (hole - 1) >> 1;
         roaring_pq_element_t above = pq->elements[parent];
         if (!compare(t, &above)) {
             break;  // parent already sorts first: position found
         }
         pq->elements[hole] = above;
         hole = parent;
     }
     pq->elements[hole] = *t;
 }

 /*
  * Release the element storage and the queue object itself. The bitmaps
  * referenced by the elements are not touched.
  */
 static void pq_free(roaring_pq_t *pq) {
     roaring_pq_element_t *storage = pq->elements;
     pq->elements = NULL;  // paranoid
     free(storage);
     free(pq);
 }

 /*
  * Sift the entry at index `i` down the heap: while one of its children
  * sorts strictly before it (per compare()), the better child is moved up
  * and the walk continues from that child's slot.
  */
 static void percolate_down(roaring_pq_t *pq, uint32_t i) {
     const uint32_t count = (uint32_t)pq->size;
     const uint32_t first_leaf = count >> 1;  // nodes from here on have no child
     roaring_pq_element_t moving = pq->elements[i];

     while (i < first_leaf) {
         uint32_t child = 2 * i + 1;  // left child
         const uint32_t right = child + 1;
         if (right < count &&
             compare(&pq->elements[right], &pq->elements[child])) {
             child = right;
         }

         roaring_pq_element_t best = pq->elements[child];
         if (!compare(&best, &moving)) {
             break;
         }
         pq->elements[i] = best;
         i = child;
     }
     pq->elements[i] = moving;
 }

 static roaring_pq_t *create_pq(const roaring_bitmap_t **arr, uint32_t length) {
     roaring_pq_t *answer = (roaring_pq_t *)malloc(sizeof(roaring_pq_t));
     answer->elements =
         (roaring_pq_element_t *)malloc(sizeof(roaring_pq_element_t) * length);
     answer->size = length;
     for (uint32_t i = 0; i < length; i++) {
         answer->elements[i].bitmap = (roaring_bitmap_t *)arr[i];
         answer->elements[i].is_temporary = false;
         answer->elements[i].size =
             roaring_bitmap_portable_size_in_bytes(arr[i]);
     }
     for (int32_t i = (length >> 1); i >= 0; i--) {
         percolate_down(answer, i);
     }
     return answer;
 }

 /*
  * Remove and return the entry at the root of the heap. When other entries
  * remain, the last one is moved to the root and sifted down.
  */
 static roaring_pq_element_t pq_poll(roaring_pq_t *pq) {
     roaring_pq_element_t top = pq->elements[0];

     if (pq->size <= 1) {
         --pq->size;
         return top;
     }

     pq->size--;
     pq->elements[0] = pq->elements[pq->size];
     percolate_down(pq, 0);
     return top;
 }

 // Lazy union of two bitmaps; this function consumes and frees the inputs.
 //
 // When one input has no container it is freed and the other input itself
 // is returned. Otherwise a new bitmap is built by merging the two key
 // sequences: containers under a key present on one side only are moved to
 // the result as they are, containers under a common key are combined with
 // container_lazy_ior(). Finally both input shells are released without
 // freeing the containers that were moved.
 static roaring_bitmap_t *lazy_or_from_lazy_inputs(roaring_bitmap_t *x1,
                                                   roaring_bitmap_t *x2) {
     roaring_array_t *const ra1 = &x1->high_low_container;
     roaring_array_t *const ra2 = &x2->high_low_container;

     const int length1 = ra_get_size(ra1);
     const int length2 = ra_get_size(ra2);
     if (length1 == 0) {
         roaring_bitmap_free(x1);
         return x2;
     }
     if (length2 == 0) {
         roaring_bitmap_free(x2);
         return x1;
     }

     const uint32_t neededcap = (length1 > length2) ? length2 : length1;
     roaring_bitmap_t *answer = roaring_bitmap_create_with_capacity(neededcap);
     roaring_array_t *const out = &answer->high_low_container;

     uint8_t result_type = 0;
     uint8_t type1, type2;
     int pos1 = 0, pos2 = 0;
     uint16_t s1 = ra_get_key_at_index(ra1, pos1);
     uint16_t s2 = ra_get_key_at_index(ra2, pos2);

     for (;;) {
         if (s1 < s2) {
             // key only on the first side: move its container over
             void *c1 = ra_get_container_at_index(ra1, pos1, &type1);
             ra_append(out, s1, c1, type1);
             pos1++;
             if (pos1 == length1) break;
             s1 = ra_get_key_at_index(ra1, pos1);
         } else if (s1 > s2) {
             // key only on the second side: move its container over
             void *c2 = ra_get_container_at_index(ra2, pos2, &type2);
             ra_append(out, s2, c2, type2);
             pos2++;
             if (pos2 == length2) break;
             s2 = ra_get_key_at_index(ra2, pos2);
         } else {
             // todo: unsharing can be inefficient as it may create a clone
             // where none is needed, but it has the benefit of being easy
             // to reason about.
             ra_unshare_container_at_index(ra1, pos1);
             void *c1 = ra_get_container_at_index(ra1, pos1, &type1);
             assert(type1 != SHARED_CONTAINER_TYPE_CODE);
             ra_unshare_container_at_index(ra2, pos2);
             void *c2 = ra_get_container_at_index(ra2, pos2, &type2);
             assert(type2 != SHARED_CONTAINER_TYPE_CODE);

             // Put a bitset on the destination side when only the second
             // container is one; otherwise the first is the destination.
             void *dst = c1;
             void *src = c2;
             uint8_t dst_type = type1;
             uint8_t src_type = type2;
             if ((type2 == BITSET_CONTAINER_TYPE_CODE) &&
                 (type1 != BITSET_CONTAINER_TYPE_CODE)) {
                 dst = c2;
                 src = c1;
                 dst_type = type2;
                 src_type = type1;
             }

             void *merged = container_lazy_ior(dst, dst_type, src, src_type,
                                               &result_type);
             container_free(src, src_type);
             if (merged != dst) {
                 // the result is a separate container: the destination is
                 // no longer needed either
                 container_free(dst, dst_type);
             }

             // since we assume that the initial containers are non-empty,
             // the result here can only be non-empty
             ra_append(out, s1, merged, result_type);
             ++pos1;
             ++pos2;
             if (pos1 == length1 || pos2 == length2) break;
             s1 = ra_get_key_at_index(ra1, pos1);
             s2 = ra_get_key_at_index(ra2, pos2);
         }
     }

     // one side is exhausted: move whatever remains on the other side
     if (pos1 == length1) {
         ra_append_move_range(out, ra2, pos2, length2);
     } else if (pos2 == length2) {
         ra_append_move_range(out, ra1, pos1, length1);
     }

     ra_clear_without_containers(ra1);
     ra_clear_without_containers(ra2);
     free(x1);
     free(x2);
     return answer;
 }

 /**
  * Compute the union of 'number' bitmaps using a heap. This can
  * sometimes be faster than roaring_bitmap_or_many which uses
  * a naive algorithm. Caller is responsible for freeing the
  * result.
  *
  * The two entries at the top of the heap are repeatedly replaced by their
  * lazy union until a single entry remains; that entry is passed through
  * roaring_bitmap_repair_after_lazy() and returned. Entries flagged as
  * temporary are merged in place or consumed, the others are only read.
  */
 roaring_bitmap_t *roaring_bitmap_or_many_heap(uint32_t number,
                                               const roaring_bitmap_t **x) {
     switch (number) {
         case 0:
             return roaring_bitmap_create();
         case 1:
             return roaring_bitmap_copy(x[0]);
         default:
             break;
     }

     roaring_pq_t *pq = create_pq(x, number);
     while (pq->size > 1) {
         roaring_pq_element_t x1 = pq_poll(pq);
         roaring_pq_element_t x2 = pq_poll(pq);
         roaring_pq_element_t merged;

         if (x1.is_temporary && x2.is_temporary) {
             roaring_bitmap_t *newb =
                 lazy_or_from_lazy_inputs(x1.bitmap, x2.bitmap);
             // should normally return a fresh new bitmap *except* that
             // it can return x1.bitmap or x2.bitmap in degenerate cases
             merged.is_temporary =
                 (newb != x1.bitmap) || (newb != x2.bitmap);
             merged.bitmap = newb;
             merged.size = roaring_bitmap_portable_size_in_bytes(newb);
         } else if (x2.is_temporary) {
             // fold the other entry into the temporary x2 and reuse it
             roaring_bitmap_lazy_or_inplace(x2.bitmap, x1.bitmap, false);
             x2.size = roaring_bitmap_portable_size_in_bytes(x2.bitmap);
             merged = x2;
         } else if (x1.is_temporary) {
             // fold the other entry into the temporary x1 and reuse it
             roaring_bitmap_lazy_or_inplace(x1.bitmap, x2.bitmap, false);
             x1.size = roaring_bitmap_portable_size_in_bytes(x1.bitmap);
             merged = x1;
         } else {
             // neither entry is temporary: build a new temporary bitmap
             roaring_bitmap_t *newb =
                 roaring_bitmap_lazy_or(x1.bitmap, x2.bitmap, false);
             merged.is_temporary = true;
             merged.bitmap = newb;
             merged.size = roaring_bitmap_portable_size_in_bytes(newb);
         }
         pq_add(pq, &merged);
     }

     roaring_pq_element_t last = pq_poll(pq);
     roaring_bitmap_t *answer = last.bitmap;
     roaring_bitmap_repair_after_lazy(answer);
     pq_free(pq);
     return answer;
 }
 /* end file src/roaring_priority_queue.c */