add vector search

This commit is contained in:
shzhulin3 2022-06-01 17:54:56 +08:00
parent e960e85e5b
commit 5bf083cee4
801 changed files with 307516 additions and 5232 deletions

1011
LICENSE

File diff suppressed because it is too large Load Diff

View File

@ -1,3 +1,7 @@
### v2.1重大更新!!!
本次更新新增向量检索功能,赋能算法领域,详细介绍请参考[Vector.md](https://gitee.com/jd-platform-opensource/isearch/blob/master/docs/Vector.md)和[向量检索使用说明文档](https://gitee.com/jd-platform-opensource/isearch/blob/master/docs/Vector.doc)
### 项目背景与介绍
本团队开发的检索服务提供分词和建立索引功能可自定义中文词库可满足复杂查询需求支持包括字段检索、字段排序、布尔搜索、范围检索、地理位置POI、AOI查询等功能。
@ -6,6 +10,7 @@
2) C++开发轻量化4核8G内存的机器就能支持大规模存储。
3) 支持存储节点分片数动态增加,方便扩容。
4) 日志即存储。
5) 支持向量检索。
### 体验demo项目
@ -19,6 +24,7 @@
2.支持http方式进行数据导入和查询需部署接入层服务若通过sdk或tcp方式访问则只需部署索引层服务即可
3.采用稳定高效的C++开发,高速搜索响应,架构简洁
4.提供了丰富的功能开发周期更短支持包括字段检索、字段排序、布尔搜索、范围检索、地理位置POI、AOI查询等功能
5.支持向量检索,请参考[Vector.md](https://gitee.com/jd-platform-opensource/isearch/blob/master/docs/Vector.md)
### 接口文档
@ -59,7 +65,7 @@ field_name字段名称由用户自行定义
is_primary_key该字段是否为主键一般需要将文档id字段设置为主键唯一标识一条记录
field_type字段类型1:INT2:SHORT_TEXT3:TEXT4:IP5:GEO_POINT9:DOUBLE10:LONG11:联合索引14:GEO_SHAPE
field_type字段类型1:INT2:SHORT_TEXT3:TEXT4:IP5:GEO_POINT9:DOUBLE10:LONG11:联合索引14:GEO_SHAPE15:VECTOR
index_tag是否需要对该字段建索引

View File

@ -30,6 +30,7 @@ index_read="index_read"
index_storage="index_storage"
search_local="search_local"
search_agent="search_agent"
vector_index="vector_index_helper"
src_common="$srcdir/$common"
src_stat="$srcdir/$common/$stat"
@ -37,6 +38,7 @@ src_index_write="$srcdir/$search_local/$index_write"
src_index_read="$srcdir/$search_local/$index_read"
src_index_storage="$srcdir/$search_local/$index_storage"
src_search_agent="$srcdir/$search_agent"
src_vector_index="$srcdir/$search_local/$vector_index"
cd $src_common
cmake .
@ -64,4 +66,9 @@ cd $localdir
cd $src_search_agent
cmake .
make
cd $localdir
cd $src_vector_index
cmake .
make
cd $localdir

View File

@ -1,4 +1,4 @@
FROM intelligentsearch/isearch_env:2.0
FROM intelligentsearch/isearch_env
COPY install.sh /root/install.sh
COPY start.sh /root/start.sh
RUN /root/install.sh

View File

@ -1,20 +1,40 @@
FROM centos:centos7.2.1511
RUN yum -y update
RUN yum install -y wget pcre pcre-devel gcc gcc-c++ make zlib-devel sudo openssh-server vim lrzsz openssl-devel &&\
yum install -y crypto-policies snappy-devel psmisc git epel-release jq && \
yum clean all && \
useradd --create-home --no-log-init --shell /bin/bash isearch && echo "isearch:isearch" | chpasswd && \
yum install -y autoconf automake libtool gcc-gfortran bzip2 &&\
yum clean all && \
cd /usr/local &&\
wget http://storage.jd.com/lbs-search-acc/gcc-4.9.3.tar.bz2 &&\
tar jxvf gcc-4.9.3.tar.bz2 &&\
cd gcc-4.9.3 &&\
wget http://storage.jd.com/lbs-search-acc/mpfr-2.4.2.tar.bz2 &&\
wget http://storage.jd.com/lbs-search-acc/gmp-4.3.2.tar.bz2 &&\
wget http://storage.jd.com/lbs-search-acc/mpc-0.8.1.tar.gz &&\
wget http://storage.jd.com/lbs-search-acc/isl-0.12.2.tar.bz2 &&\
wget http://storage.jd.com/lbs-search-acc/cloog-0.18.1.tar.gz &&\
wget http://storage.jd.com/lbs-search-acc/download_prerequisites &&\
mv download_prerequisites ./contrib/download_prerequisites &&\
chmod +x ./contrib/download_prerequisites &&\
./contrib/download_prerequisites &&\
cd .. &&\
mkdir build-gcc &&\
cd build-gcc &&\
../gcc-4.9.3/configure --prefix=/usr --mandir=/usr/share/man --infodir=/usr/share/info --enable-bootstrap --enable-shared --enable-threads=posix --enable-checking=release --with-system-zlib --enable-__cxa_atexit --disable-libunwind-exceptions --enable-gnu-unique-object --enable-linker-build-id --with-linker-hash-style=gnu --enable-languages=c,c++ --enable-plugin --enable-initfini-array --disable-libgcj --enable-gnu-indirect-function --with-tune=generic --disable-multilib &&\
make -j8 &&\
make install
RUN useradd --create-home --no-log-init --shell /bin/bash isearch && echo "isearch:isearch" | chpasswd && \
mkdir -p /env/app &&\
cd /env/app/ &&\
wget https://cmake.org/files/v3.20/cmake-3.20.0-rc3.tar.gz &&\
tar xf cmake-3.20.0-rc3.tar.gz &&\
cd /env/app/cmake-3.20.0-rc3 &&\
wget --no-check-certificate https://cmake.org/files/v3.21/cmake-3.21.2.tar.gz &&\
tar xf cmake-3.21.2.tar.gz &&\
cd /env/app/cmake-3.21.2 &&\
./bootstrap &&\
gmake &&\
gmake install &&\
cd /usr/bin &&\
ln -s cmake3 cmake &&\
cd /usr/local &&\
ln -s cmake3 cmake
RUN cd /usr/local &&\
git clone https://github.com/facebook/rocksdb.git &&\
cd rocksdb &&\
git checkout -b 6.6.0 ad528fe5ca08dafff47d79c85abbf3e1fbb21568 &&\
@ -29,4 +49,30 @@ RUN yum install -y wget pcre pcre-devel gcc gcc-c++ make zlib-devel sudo openssh
ldconfig &&\
ln -s /usr/local/lib/libgflags.so.2.2 /lib64
RUN yum install -y unzip gcc-gfortran && cd /usr/local &&\
wget http://storage.jd.com/lbs-search-acc/protobuf-3.12.2.zip &&\
unzip protobuf-3.12.2.zip &&\
cd protobuf-3.12.2 &&\
./autogen.sh &&\
./configure &&\
make -j8 && make install &&\
ln -s /usr/local/lib/libprotobuf.so.23.0.2 /lib64/libprotobuf.so.23 &&\
ln -s /usr/local/lib/libprotoc.so.23.0.2 /lib64/libprotoc.so.23
RUN yum install openblas-devel.x86_64 -y && cd /usr/local &&\
wget http://storage.jd.com/lbs-search-acc/lapack-3.10.0.tar.gz &&\
tar zxvf lapack-3.10.0.tar.gz &&\
cd lapack-3.10.0 &&\
mkdir build &&\
cd build &&\
cmake -DCMAKE_INSTALL_LIBDIR=$HOME/.local/lapack .. &&\
cmake --build . -j --target install &&\
ln -s /usr/local/lapack-3.10.0/build/lib/liblapack.a /lib64/ &&\
cd /usr/local &&\
wget http://storage.jd.com/lbs-search-acc/faiss-main.zip &&\
unzip faiss-main.zip &&\
cd faiss-main &&\
cmake -B build . -DFAISS_ENABLE_GPU=OFF -DFAISS_ENABLE_PYTHON=OFF -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTING=OFF -DBUILD_SHARED_LIBS=ON -DBLA_VENDOR=Generic -DBLAS_LIBRARIES=/lib64/libopenblas.so -DLAPACK_LIBRARIES=/lib64/liblapack.a &&\
cd build/ &&\
make -j8 && make install &&\
ln -s /usr/local/faiss-main/build/faiss/libfaiss.so /lib64/

View File

@ -43,7 +43,7 @@ chmod +x dtcd.sh
ln -s $dstdir/src/search_local/index_storage/api/c_api_cc/libdtc-gcc-4.8-r4646582.so /lib64/libdtc.so.1
cd $dstdir
mkdir index_write index_read search_agent
mkdir index_write index_read search_agent vector_index_helper
cd index_write
mkdir log bin stat conf
cp $srcdir/resource/index_write/conf/{index_gen.json,index_write.conf,localCluster.json} conf
@ -63,6 +63,14 @@ mkdir log bin conf
cp $srcdir/resource/search_agent/conf/sa.conf conf
cp $srcdir/src/search_agent/bin/search_agent bin/
cd $dstdir/vector_index_helper
mkdir log bin conf rocksdb
cp $srcdir/resource/app_field_define.json conf
cp $srcdir/resource/vector_index/conf/vector_index.conf conf
cp $srcdir/resource/vector_index/conf/ivfflat.faissindex /tmp
cp $srcdir/resource/vector_index/conf/trained_index.faissindex /tmp
cp $srcdir/src/search_local/vector_index_helper/bin/vector_index bin/
yum install -y jq
cd /usr/local/isearch
mkdir tools

View File

@ -10,6 +10,9 @@ INIT(){
./dtcd.sh start
cd /usr/local/isearch/index_storage/original_data/bin
./dtcd.sh start
cd /usr/local/isearch/vector_index_helper/bin
./vector_index
sleep 1
cd /usr/local/isearch/index_write/bin
./index_write
cd /usr/local/isearch/index_read/bin

View File

@ -1,10 +1,10 @@
## 源码编译方式(推荐)
建议通过isearch_env镜像进行源码编译获取isearch_env镜像的方式为 `docker pull intelligentsearch/isearch_env:2.0`
建议通过isearch_env镜像进行源码编译获取isearch_env镜像的方式为 `docker pull intelligentsearch/isearch_env`
也可以自行编译isearch_env镜像Dockerfile文件位于dockerfiles\env目录 `docker build -t intelligentsearch/isearch_env:2.0 .`
也可以自行编译isearch_env镜像Dockerfile文件位于dockerfiles\env目录 `docker build -t intelligentsearch/isearch_env .`
然后运行容器: `docker run -itd intelligentsearch/isearch_env:2.0`
然后运行容器: `docker run -itd intelligentsearch/isearch_env`
进入容器: `docker exec -it 容器id /bin/bash`

BIN
docs/Vector.doc Normal file

Binary file not shown.

50
docs/Vector.md Normal file
View File

@ -0,0 +1,50 @@
## 背景
向量检索是指用一组数字(向量)来量化一个事物,用大量向量来表示事物集合,用向量计算的方式寻找相似事物的一种检索方式。
isearch底层采用的向量检索框架为Facebook AI的Faiss项目地址为https://github.com/facebookresearch/faiss
## app_field_define表
在app_field_define表定义时vector字段类型需定义好dim、index_type和metric_type三个属性示例如下
```
{
"id":3,
"appId":10065,
"fieldName":"float_vector",
"fieldType":15,
"fieldId":3,
"IsPrimaryKey":0,
"indexTag":0,
"snapshotTag":1,
"segmentTag":0,
"segmentFeature":0,
"unionField":"",
"createTime":"2021/4/13 15:49:09",
"dim":128, // 维数
    "index_type": ["PCA80,Flat"], // 索引类型格式与faiss对外工厂类设置保持一致不要包含多余空格
"metric_type": "L2" // 距离计算方式可选值InnerProduct、L2
}
```
说明index_type参考https://github.com/facebookresearch/faiss/wiki/The-index-factory
## 向量插入示例
```
curl -X POST \
http://127.0.0.1/insert \
-H 'content-type: application/json' \
-H 'doc_id: 1' \
-d '{"appid":10065,"table_content":{"cmd":"add","fields":{"doc_id":"1","random_value":1488981884,"float_vector":[0.005653954876242762, 0.632130963117687, 0.7519577013172226, 0.8568273368123129, 0.2034335192251041, 0.9786219451736441, 0.5948105950093241, 0.9618089054657426]}}}'
```
## 向量查询示例
```
curl -X POST \
http://127.0.0.1/search \
-H 'content-type: application/json' \
-d '{"appid":10065,"query":{"vector_query":{"float_vector":[0.005653954876242762, 0.632130963117687, 0.7519577013172226, 0.8568273368123129, 0.2034335192251041, 0.9786219451736441, 0.5948105950093241, 0.9618089054657426], "index_type_id":1}} }'
```

View File

@ -50,8 +50,18 @@ chmod +x dtcd.sh
ln -s $dstdir/src/search_local/index_storage/api/c_api_cc/libdtc-gcc-4.8-r4646582.so /lib64/libdtc.so.1
cd $dstdir
mkdir index_write index_read search_agent
cd index_write
mkdir index_write index_read search_agent vector_index_helper
cd $dstdir/vector_index_helper
mkdir log bin conf rocksdb
cp $srcdir/resource/app_field_define.json conf
cp $srcdir/resource/vector_index/conf/vector_index.conf conf
cp $srcdir/resource/vector_index/conf/ivfflat.faissindex /tmp
cp $srcdir/resource/vector_index/conf/trained_index.faissindex /tmp
cp $srcdir/src/search_local/vector_index_helper/bin/vector_index bin/
cd bin
./vector_index
cd $dstdir/index_write
mkdir log bin stat conf
cp $srcdir/resource/index_write/conf/{index_gen.json,index_write.conf,localCluster.json} conf
cp $srcdir/resource/{app_field_define.json,character_map.txt,msr_training.utf8,phonetic_base.txt,phonetic_map.txt,stop_words.dict,words_base.txt} conf

File diff suppressed because it is too large Load Diff

View File

@ -1,66 +1,66 @@
{
"service_name":"index_read",
"pid_file" : "index_read.pid",
"log" : "../log/",
"log_level" : 7,
"daemon": true,
"time_interval": 3600,
"update_interval": 100000,
"cache_max_slot": 100000,
"index_cache_max_slot": 1000,
"cache_expire_time": 3600,
"vague_switch": 1,
"jdq_switch": 0,
"listen_addr": "127.0.0.1",
"listen_port": 12003,
"dtc_config" :
{
"table_name": "keyword_index_data",
"accesskey": "00002153a746aac69b2ca06f79f8154e3a03ad04",
"timeout": 1000,
"keytype": 4,
"route":
[
{
"ip": "127.0.0.1",
"bid": 2153,
"port": 20000,
"weight": 1,
"status": 1
}
]
},
"hanpin_index_config" :
{
"table_name" : "hanpin_index_data",
"accesskey" : "000021140f992fd33e6019ce2c66cd8614925953",
"timeout" : 1000,
"keytype" : 4,
"route":
[
{
"ip": "127.0.0.1",
"bid": 2114,
"port": 20005,
"weight": 1,
"status": 1
}
]
},
"phonetic_file" : "../conf/phonetic_map.txt",
"character_file" : "../conf/character_map.txt",
"intelligent_file" : "../conf/intelligent_match.txt",
"en_intelligent_file" : "../conf/en_intelligent_match.txt",
"words_file" : "../conf/words_base.txt",
"en_words_file" : "../conf/en_words_base.txt",
"phonetic_base_file" : "../conf/phonetic_base.txt",
"suggest_file" : "../conf/suggest_base.txt",
"training_file" : "../conf/msr_training.utf8",
"relate_file" : "../data/relate_data",
"synonym_file" : "../data/synonym_data",
"analyze_file" : "../data/analyze_data",
"sensitive_file" : "../data/sensitive_data",
"app_filed_file" : "../conf/app_field_define.txt",
"app_info" : [{"app_id":10001, "cache_switch":0, "en_query_switch":1}],
"split_mode": "Cache"
"service_name":"index_read",
"pid_file" : "index_read.pid",
"log" : "../log/",
"log_level" : 7,
"daemon": true,
"time_interval": 3600,
"update_interval": 100000,
"cache_max_slot": 100000,
"index_cache_max_slot": 1000,
"cache_expire_time": 3600,
"vague_switch": 1,
"jdq_switch": 0,
"listen_addr": "127.0.0.1",
"listen_port": 12003,
"dtc_config" :
{
"table_name": "keyword_index_data",
"accesskey": "00002153a746aac69b2ca06f79f8154e3a03ad04",
"timeout": 1000,
"keytype": 4,
"route":
[
{
"ip": "127.0.0.1",
"bid": 2153,
"port": 20000,
"weight": 1,
"status": 1
}
]
},
"hanpin_index_config" :
{
"table_name" : "hanpin_index_data",
"accesskey" : "000021140f992fd33e6019ce2c66cd8614925953",
"timeout" : 1000,
"keytype" : 4,
"route":
[
{
"ip": "127.0.0.1",
"bid": 2114,
"port": 20005,
"weight": 1,
"status": 1
}
]
},
"phonetic_file" : "../conf/phonetic_map.txt",
"character_file" : "../conf/character_map.txt",
"intelligent_file" : "../conf/intelligent_match.txt",
"en_intelligent_file" : "../conf/en_intelligent_match.txt",
"words_file" : "../conf/words_base.txt",
"en_words_file" : "../conf/en_words_base.txt",
"phonetic_base_file" : "../conf/phonetic_base.txt",
"suggest_file" : "../conf/suggest_base.txt",
"training_file" : "../conf/msr_training.utf8",
"relate_file" : "../data/relate_data",
"synonym_file" : "../data/synonym_data",
"analyze_file" : "../data/analyze_data",
"sensitive_file" : "../data/sensitive_data",
"app_filed_file" : "../conf/app_field_define.json",
"app_info" : [{"app_id":10001, "cache_switch":0, "en_query_switch":1}],
"split_mode": "Cache"
}

View File

@ -1,70 +1,70 @@
{
"program_name" : "index_write v1.0",
"pid_file" : "index_write.pid",
"log" : "../log/",
"log_level" : 7,
"daemon": true,
"listen_addr": "*:11017/tcp",
"timeout": 6000,
"stop_words_path":"../conf/stop_words.dict",
"training_path":"../conf/msr_training.utf8",
"words_base_path":"../conf/words_base.dict",
"words_file":"../conf/words_base.txt",
"character_path":"../conf/character_map.txt",
"phonetic_path":"../conf/phonetic_map.txt",
"phonetic_base_file" : "../conf/phonetic_base.txt",
"app_filed_file" : "../conf/app_field_define.txt",
"service_type":"index_gen",
"dtc_index_config" :
{
"table_name": "keyword_index_data",
"accesskey": "00002153a746aac69b2ca06f79f8154e3a03ad04",
"timeout": 4000,
"keytype": 4,
"route":
[
{
"ip": "127.0.0.1",
"bid": 2153,
"port": 20000,
"weight": 1,
"status": 1
}
]
},
"dtc_intelligent_config" :
{
"table_name": "hanpin_index_data",
"accesskey": "000021140f992fd33e6019ce2c66cd8614925953",
"timeout": 4000,
"keytype": 4,
"route":
[
{
"ip": "127.0.0.1",
"bid": 2114,
"port": 20005,
"weight": 1,
"status": 1
}
]
},
"dtc_original_config" :
{
"table_name": "index_original_data",
"accesskey": "00002263b51c22d9b7f1a68dd023ff625cbabda0",
"timeout": 4000,
"keytype": 2,
"route":
[
{
"ip": "127.0.0.1",
"bid": 2263,
"port": 20070,
"weight": 1,
"status": 1
}
]
},
"split_mode":"Post"
"program_name" : "index_write v1.0",
"pid_file" : "index_write.pid",
"log" : "../log/",
"log_level" : 7,
"daemon": true,
"listen_addr": "*:11017/tcp",
"timeout": 6000,
"stop_words_path":"../conf/stop_words.dict",
"training_path":"../conf/msr_training.utf8",
"words_base_path":"../conf/words_base.dict",
"words_file":"../conf/words_base.txt",
"character_path":"../conf/character_map.txt",
"phonetic_path":"../conf/phonetic_map.txt",
"phonetic_base_file" : "../conf/phonetic_base.txt",
"app_filed_file" : "../conf/app_field_define.txt",
"service_type":"index_gen",
"dtc_index_config" :
{
"table_name": "keyword_index_data",
"accesskey": "00002153a746aac69b2ca06f79f8154e3a03ad04",
"timeout": 4000,
"keytype": 4,
"route":
[
{
"ip": "127.0.0.1",
"bid": 2153,
"port": 20000,
"weight": 1,
"status": 1
}
]
},
"dtc_intelligent_config" :
{
"table_name": "hanpin_index_data",
"accesskey": "000021140f992fd33e6019ce2c66cd8614925953",
"timeout": 4000,
"keytype": 4,
"route":
[
{
"ip": "127.0.0.1",
"bid": 2114,
"port": 20005,
"weight": 1,
"status": 1
}
]
},
"dtc_original_config" :
{
"table_name": "index_original_data",
"accesskey": "00002263b51c22d9b7f1a68dd023ff625cbabda0",
"timeout": 4000,
"keytype": 2,
"route":
[
{
"ip": "127.0.0.1",
"bid": 2263,
"port": 20070,
"weight": 1,
"status": 1
}
]
},
"split_mode":"Post"
}

View File

@ -10,3 +10,4 @@
{"appid":10010,"query":{"bool":{"must":{"match":{"birthPlace":"上海市"},"term":{"gender":"男"}}}}}
{"appid":10010,"query":{"match":{"dreamPlace":"阿姆斯"}},"fields":"birthPlace,homeAddress,dreamPlace,name","page_index":1,"page_size":3}
{"appid":10010,"query":{"range":{"height":{"gte":174 ,"lte": 180}}},"fields":"birthPlace,height,name","sort_type":"5","sort_field":"height"}
{"appid":10008,"page_size":20,"sort_type":6,"query":{"vector_query":{"vectorTest":[0.1354770042967805,0.8350085899945795,0.96886777112423139,0.2210340429827049,0.30816705050700327,0.54722059636785192,0.1883819760471811,0.99288130191780666,0.9964613255480087,0.96769493701050258,0.72583896321188968,0.98110969177693896,0.10986175084420642,0.79810585674954948,0.29702944955795085,0.0047834844193156683,0.1124645160561803,0.6397633570981528,0.87843064539884386,0.5036626777051697,0.79792861516022606,0.36129400134918088,0.21192433239173361,0.68135953856026599,0.39873851991229114,0.7406472446764214,0.47475868061723475,0.42208768110541323,0.17386517200048032,0.30191312687731969,0.79727991523827568,0.31655044481899425,0.87242882006730027,0.1491139764073704,0.99406849432204192,0.82190326480741094,0.12518276453363444,0.76375001257217945,0.49058903962146072,0.66360552050975297,0.12589663347200125,0.21020907451900617,0.051216425785216686,0.036441251587867714,0.40873116096176038,0.4579891554288949,0.48756892686839826,0.79397497154919272,0.92087479115216175,0.8075310254364011,0.70577425166871988,0.0028184325619839781,0.71070387509071686,0.64396095652194041,0.45603282449743654,0.77391712891365494,0.57375466659659147,0.87675741509077743,0.80817549014121004,0.017773895576552474,0.82124599156697908,0.82084078417511075,0.94007402879790336,0.41266651491147388,0.42316511643373017,0.58095667766390635,0.15805758455470567,0.76173121368946151,0.23015606453392981,0.80973454873485218,0.98852160080352736,0.33244828233827889,0.29983170582314134,0.013539126665220821,0.21723783945880448,0.90736471776617311,0.84846779196443856,0.95501757349145688,0.7788977100551232,0.98745962685749145,0.067595381138767008,0.79359758152412918,0.59450356117106606,0.73279872526175427,0.6952328837749534,0.67981979071298593,0.3923204691980966,0.56155744235816618,0.2080680570519636,0.52737145860861556,0.404208518116701,0.35276240810145393,0.59282387851988838,0.35634516058004628,0.96496637210576108,0.1544384174351362,0.39490821062792641,0.38729590514427764,0.72695472161562691,0.38856980747941944,0.92749284165030865,0.43611756497860993,0.86267818704042931,0.62036001322058332,0.11954718110283109,0.47195680174091409,0.34021969914039862,0.52984198849976039,0.71610070967330008,0.98837939760371851,0.7204934613431202,0.91257749197162252,0.50549850525077833,0.55826875635267637,0.50319002106608313,0.46247420642194731,0.54659196188169124,0.44758440204806038,0.85445098673882325,0.60423148433740215,0.4985441863785155,0.9799256047610887,0.034317313228881506,0.97700203460988366,0.36318646087714079,0.67951969380227262,0.34623339308529066,0.85587513642380542]}}}

View File

@ -2,3 +2,13 @@
{"appid":10010,"table_content":{"cmd":"add","fields":{"doc_id":"2","birthPlace":"中华人民共和国湖北省武汉市","homeAddress":"中华人民共和国上海市长宁区","dreamPlace":"比利时王国","name":"Joy","gender":"男","year":20,"height":174.325,"brithday":19910720,"ip":"192.168.0.35","currentLocation":"39.452, -76.589","preLocation":{"latitude":"-70.154","longitude":"35.247"},"postLocation":["-75.456","40.111"],"currentShape":"POLYGON((121.437271 31.339747, 121.438022 31.337291, 121.435297 31.336814, 121.434524 31.339252, 121.437271 31.339747))"}}}
{"appid":10010,"table_content":{"cmd":"add","fields":{"doc_id":"3","birthPlace":"中华人民共和国江苏省苏州市","homeAddress":"中华人民共和国上海市闵行区","dreamPlace":"东京郊外调布市","name":"Tom","gender":"男","year":30,"height":180.785,"brithday":19900654,"ip":"192.168.0.98","currentLocation":"34.452, -65.589","preLocation":{"latitude":"-68.355","longitude":"45.121"},"postLocation":["-71.456","27.986"],"currentShape":"POLYGON((121.437271 31.339747, 121.438022 31.337291, 121.435297 31.336814, 121.434524 31.339252, 121.437271 31.339747))"}}}
{"appid":10010,"table_content":{"cmd":"add","fields":{"doc_id":"4","birthPlace":"中华人民共和国上海市","homeAddress":"中华人民共和国上海市宝山区","dreamPlace":"梵蒂冈高地","name":"Amy","gender":"女","year":40,"height":176.258,"brithday":19931124,"ip":"192.168.0.18","currentLocation":"39.452, -76.589","preLocation":{"latitude":"-70.154","longitude":"35.247"},"postLocation":["-75.456","40.111"],"currentShape":"POLYGON((121.437271 31.339747, 121.438022 31.337291, 121.435297 31.336814, 121.434524 31.339252, 121.437271 31.339747))"}}}
{"appid":10008,"table_content":{"cmd":"add","fields":{"doc_id":"12346","vectorTest":[0.1354770042967805,0.8350085899945795,0.96886777112423139,0.2210340429827049,0.30816705050700327,0.54722059636785192,0.1883819760471811,0.99288130191780666,0.9964613255480087,0.96769493701050258,0.72583896321188968,0.98110969177693896,0.10986175084420642,0.79810585674954948,0.29702944955795085,0.0047834844193156683,0.1124645160561803,0.6397633570981528,0.87843064539884386,0.5036626777051697,0.79792861516022606,0.36129400134918088,0.21192433239173361,0.68135953856026599,0.39873851991229114,0.7406472446764214,0.47475868061723475,0.42208768110541323,0.17386517200048032,0.30191312687731969,0.79727991523827568,0.31655044481899425,0.87242882006730027,0.1491139764073704,0.99406849432204192,0.82190326480741094,0.12518276453363444,0.76375001257217945,0.49058903962146072,0.66360552050975297,0.12589663347200125,0.21020907451900617,0.051216425785216686,0.036441251587867714,0.40873116096176038,0.4579891554288949,0.48756892686839826,0.79397497154919272,0.92087479115216175,0.8075310254364011,0.70577425166871988,0.0028184325619839781,0.71070387509071686,0.64396095652194041,0.45603282449743654,0.77391712891365494,0.57375466659659147,0.87675741509077743,0.80817549014121004,0.017773895576552474,0.82124599156697908,0.82084078417511075,0.94007402879790336,0.41266651491147388,0.42316511643373017,0.58095667766390635,0.15805758455470567,0.76173121368946151,0.23015606453392981,0.80973454873485218,0.98852160080352736,0.33244828233827889,0.29983170582314134,0.013539126665220821,0.21723783945880448,0.90736471776617311,0.84846779196443856,0.95501757349145688,0.7788977100551232,0.98745962685749145,0.067595381138767008,0.79359758152412918,0.59450356117106606,0.73279872526175427,0.6952328837749534,0.67981979071298593,0.3923204691980966,0.56155744235816618,0.2080680570519636,0.52737145860861556,0.404208518116701,0.35276240810145393,0.59282387851988838,0.35634516058004628,0.96496637210576108,0.1544384174351362,0.394
90821062792641,0.38729590514427764,0.72695472161562691,0.38856980747941944,0.92749284165030865,0.43611756497860993,0.86267818704042931,0.62036001322058332,0.11954718110283109,0.47195680174091409,0.34021969914039862,0.52984198849976039,0.71610070967330008,0.98837939760371851,0.7204934613431202,0.91257749197162252,0.50549850525077833,0.55826875635267637,0.50319002106608313,0.46247420642194731,0.54659196188169124,0.44758440204806038,0.85445098673882325,0.60423148433740215,0.4985441863785155,0.9799256047610887,0.034317313228881506,0.97700203460988366,0.36318646087714079,0.67951969380227262,0.34623339308529066,0.85587513642380542]}}}
{"appid":10008,"table_content":{"cmd":"add","fields":{"doc_id":"12347","vectorTest":[0.045059544094808029,0.66011949437672002,0.74994097310849783,0.13299603129519327,0.9823605637489462,0.095355174332811252,0.28267329073174369,0.80211146924087506,0.077557029404978314,0.62738433796439896,0.0080939118167991288,0.68028705943025525,0.53393310793978888,0.43866683184682742,0.19955121640836643,0.13800132612540608,0.3823329369541742,0.76242128481994542,0.040471111480127123,0.25295597410838094,0.50477099469488207,0.82260495173743753,0.98172312239587001,0.82345541451351123,0.30182730823352844,0.047944285428334774,0.2478476025841348,0.54405610181606667,0.88772613283659318,0.34383953593206912,0.033268568191967621,0.1628721235578886,0.87736391457269247,0.21030186894110842,0.27275303663725298,0.49244198864416966,0.22341994244988414,0.49030149638000253,0.95494337425149289,0.65196867193005381,0.75750363608245008,0.43524568599177982,0.55288120830310039,0.053152570350365379,0.32251074417056375,0.40498138359175145,0.90843361477437978,0.80913724311836766,0.25829589777838335,0.12984652508323136,0.49332685061910792,0.37850042885423724,0.71846994979935908,0.28780498432375512,0.62343557708540298,0.81187398084182172,0.31250797973172167,0.38571052187669819,0.34392975162912265,0.8157690646186514,0.66928513715514626,0.37941883669261023,0.45849690049173913,0.3036139714009391,0.91056498617091952,0.48861771459594799,0.75578989634448279,0.10806191556769178,0.39358985203050745,0.87018676533148898,0.36086071348228205,0.91711791576697288,0.54380550320360765,0.14014383786791224,0.19987287018717742,0.94892508665398356,0.99010995480947361,0.24007594979810565,0.016520584438914156,0.38861517225162973,0.77968936730699501,0.47663807160850724,0.54013800579303461,0.018450843473255246,0.90018319838468996,0.19449538370372918,0.88599811909820625,0.44122346425307085,0.14782900997109752,0.23950246352951696,0.7996531751071253,0.47301506028410484,0.089823156434833601,0.64455053371455973,0.63306366499285383,0.58438223
325293892,0.72665438123626658,0.35463813319330767,0.68040664457689737,0.70732152371460555,0.16232851828869077,0.13373638466203583,0.44955607238106804,0.042054125401706374,0.7973641065421615,0.16755578986164549,0.83121428736991876,0.32496353567517772,0.65577989753723975,0.42018995162071154,0.78190915180001563,0.1131925360550774,0.99353471796156745,0.18157297914439866,0.76293141692900435,0.26592065417888117,0.24413826431353905,0.10073849156739444,0.34432791883326347,0.28162734107545695,0.96864037547197279,0.21377273869208266,0.60592821006735054,0.22655131983263552,0.18990550179060917,0.40157147828293832,0.41329063067419869,0.17015227876232472]}}}
{"appid":10008,"table_content":{"cmd":"add","fields":{"doc_id":"12348","vectorTest":[0.29905181556919952,0.63646856844320787,0.15157415842810568,0.76924317911918783,0.88396860077924533,0.95922160541759061,0.018066270256714624,0.5828463846808728,0.76685408551681611,0.32779776520604992,0.12906616421773956,0.29196078415953741,0.84710964889651008,0.23076439269846946,0.50197438548942142,0.0015979040539258519,0.012538803390733417,0.28658657725412673,0.098699042041561824,0.38475520697043292,0.35534869178452588,0.71997093580061644,0.065445064729718921,0.51385918636755312,0.42397833313572708,0.68980125937240067,0.25394310627598837,0.40151955920269761,0.2644765516521228,0.93425058634250113,0.47252524146290181,0.6109161944732634,0.48840188579761645,0.51451956703716928,0.24501924896215146,0.33300524070857374,0.50236237628537039,0.59605793352906455,0.3101559978653447,0.96299752108005976,0.23240035469147391,0.74690420639531097,0.3225329591660927,0.31823214107916886,0.95247203392433444,0.53472104568080325,0.87180515263286218,0.93460264989907049,0.66711408307814268,0.28968940561095485,0.065524461636820006,0.23188861618227025,0.70472372037415387,0.19510740115071307,0.42221838879855472,0.93605506408926908,0.14221720443409275,0.64318100471258899,0.47273213947199533,0.43693291057222472,0.24431964020067629,0.50591941523023731,0.9069440930810454,0.076503973485243279,0.71390392053252749,0.61720491675449018,0.18908419456132217,0.39698180621568213,0.27766405722602999,0.8580993014386058,0.90893657471213396,0.61727908496204753,0.18081870271285527,0.51502371966558824,0.82463170641380923,0.35932443766660316,0.81611702079313975,0.29338824376092842,0.17211781096811601,0.49278958726353395,0.19532476933030637,0.86430749231905635,0.20419716683441952,0.61627227611362312,0.87805055080718142,0.39246908666573743,0.94657725516070546,0.1666982685536747,0.97187740059316141,0.56396489460552945,0.59836891189866492,0.13122185038932063,0.5667773954145946,0.55438027088135766,0.31048996055066935,0.12064893094382
685,0.73779082011113384,0.24272365252001363,0.82285914726045573,0.1854759712392762,0.31267693852496625,0.88172541468590837,0.67421294609063309,0.73929755673071396,0.062543240383407292,0.13156156161827962,0.089952908661651079,0.0083144453049579896,0.45651041185260594,0.63430332075080753,0.52976299227697909,0.49786942847576876,0.33579789850777131,0.25282334928762235,0.23367544138915011,0.17316303178115566,0.59598103030373428,0.7268931438280859,0.87757309281314311,0.70715972667574223,0.23476560334430716,0.47799574635601316,0.018211762976045518,0.74733119867345932,0.65472358145094622,0.078066942349644536,0.47276910082835027,0.99647506705142574]}}}
{"appid":10008,"table_content":{"cmd":"add","fields":{"doc_id":"12349","vectorTest":[0.53209752212846095,0.89943600631846665,0.77290092728339344,0.20375969594900425,0.90710862227344635,0.37481256134523289,0.84808888796398141,0.3266992459259121,0.32673666016612213,0.65921061981020679,0.58676498234140595,0.048983202321108567,0.76996566551394274,0.49175217103043545,0.48818401507136522,0.42912657074135874,0.3947277153895436,0.77645380145270126,0.9747543864880045,0.74850515939083984,0.6345020038299144,0.072180258738782227,0.63336329689215187,0.52892063256204214,0.29616765120737398,0.32532786911344147,0.63111992425538233,0.99450542257503738,0.516047826830209,0.72445391370448775,0.24557474831080972,0.33233911566637675,0.74860911549332876,0.68296648546241823,0.68127137052491982,0.078842372310105394,0.45893628750434196,0.69814685928250031,0.067884831477613955,0.069140052950585146,0.52704281782583717,0.26204879867867692,0.48337516245434181,0.13080546435135015,0.094440768579352727,0.12038168191802026,0.96983720583795852,0.75628164548976873,0.26817637859790333,0.72490598353503599,0.41576251067237624,0.47471840183160319,0.055441426413381609,0.61374414308737224,0.49061918484399575,0.90447486705798485,0.93521427019843606,0.91827661968396046,0.30382527123853392,0.79010177679462179,0.22622740788954832,0.44895186246850521,0.5101089129865477,0.2306319254000474,0.7118807799653547,0.98445260976133198,0.6552132481979186,0.27266698284101465,0.46901556324736426,0.0030012052562900463,0.44642393301728095,0.97094415029747405,0.30466350012516413,0.21510904463584052,0.24758507205903463,0.8064510265914171,0.48637109859284816,0.40750191563781307,0.21562929415923682,0.54025208697484839,0.72302423751473532,0.28369391607292677,0.54286209731170565,0.7559246848659652,0.33197728643260188,0.80176297340482106,0.8570724318896098,0.014891700298874645,0.88494266527440912,0.59308694076898927,0.08621722904292714,0.92487594242585114,0.91371957205314269,0.98075677513764081,0.90002210937108118,0.9497145839604648
2,0.93452195465742094,0.0083242709932422158,0.77779403110867984,0.025958877742563866,0.58787704962513754,0.64978077069606677,0.32823645491996561,0.11107936201603065,0.12427265769972091,0.97391505899143815,0.53322947059262671,0.66230028184641732,0.7082576325094061,0.068738579831602517,0.20595557840122133,0.43342967667061805,0.011192586468981847,0.069262525726323201,0.34306180995340479,0.31223008050896711,0.992742614956165,0.26906433620973008,0.85706624129552889,0.35561091505567988,0.74676827977366611,0.84485626968536864,0.21713862333908432,0.96495217914822318,0.71219265309149937,0.98266786794162286,0.75989900495501139,0.19326037486254038]}}}
{"appid":10008,"table_content":{"cmd":"add","fields":{"doc_id":"12350","vectorTest":[0.35258951120634702,0.19630034168275226,0.83206792723445577,0.95259476709033009,0.76737005175526174,0.86196880929772701,0.45566373740083016,0.33268383938437129,0.83825996341290421,0.55430878965379038,0.12086557279476341,0.21218372969038427,0.49123925544039726,0.40965479770022917,0.75951148963832615,0.35706422593741233,0.77178605580516824,0.98154040522381525,0.84616172213486074,0.83462162550860552,0.36719448154918022,0.05062594416792137,0.97556160014614923,0.84258929348663736,0.86946038841117557,0.075498432542996277,0.032376068351151872,0.61393180618063836,0.73568028568935406,0.61552780427737097,0.66263007945057284,0.95126442640604114,0.26588751469842486,0.91938572650657346,0.11839654767950655,0.31306570592392291,0.8721153998486223,0.23364369626139017,0.80805849464059343,0.55653176675427418,0.94359231057656368,0.50370346282237377,0.66641682513111866,0.152965732621862,0.9732287748090126,0.63376690736606478,0.17864978296455705,0.074142804182379923,0.49497478032059061,0.78597990736130663,0.46467973894620568,0.88303166101481367,0.6473688994928064,0.90324096949696342,0.0059736147077088485,0.69356180838194603,0.58508964450503598,0.10536129242182218,0.9903198966734128,0.22999202528719415,0.33290783370089844,0.68938285128010068,0.96142074592379967,0.22659909325500499,0.9242414058579258,0.87624619113450253,0.43174452913505501,0.47088740858944883,0.5621024799799601,0.83761541216733892,0.14830887913425128,0.63927818279591209,0.99559368710030405,0.63939410341134173,0.43951832473946401,0.028388349997248458,0.083150901549952805,0.53036718520301496,0.78200151364646486,0.73664522522360798,0.19016096743561928,0.26182726354717567,0.45224476831002447,0.46002688479963155,0.67363714314757628,0.17695307230926255,0.18783083538236289,0.096486812470241948,0.32387460718180816,0.17032649497514579,0.02473407283649055,0.83064298464373387,0.24482838427247269,0.14203500743382547,0.155441967742288,0.303010724715169
43,0.033572317566844256,0.40928938190113723,0.78312364537561463,0.44337191176366764,0.017488211050555126,0.28578436548810532,0.95139670111270336,0.17131502419427105,0.094149831099923309,0.2739117480583611,0.20136393596759561,0.75311635524155063,0.80087942088456832,0.58608418716722799,0.49699451966368002,0.59097324490230907,0.56879152289672819,0.18509800607011648,0.90165664014894153,0.024462146621605348,0.26437647428613903,0.78251526364644597,0.59970211907500148,0.55621543656602657,0.096532746434284625,0.92678704009859136,0.39839131711446285,0.039902940420634012,0.5851167777699211,0.413214935801061,0.12953341445998653,0.60893897507558692]}}}
{"appid":10008,"table_content":{"cmd":"add","fields":{"doc_id":"12351","vectorTest":[0.055406415963613273,0.84829575792650769,0.45206391858681827,0.48501437846913781,0.067446854282731991,0.22100588377787092,0.64436472637801667,0.9758318250981417,0.94934234351922842,0.73647249098995105,0.42085361228010498,0.68573383455841619,0.68621608359800945,0.23126626565009306,0.99477475306062657,0.43559307221756238,0.74379263345505187,0.33805053832454512,0.3629157686456575,0.98988325182469172,0.72507827801721458,0.83750665175969652,0.39803542472724762,0.85169574196309028,0.17160291510408146,0.94739425722706627,0.35089664593454906,0.38487863346010254,0.39973002751754338,0.71836642716219079,0.83591752963790977,0.95087424388203567,0.063981459267835467,0.66452567422344377,0.17631153579951739,0.37095806831981953,0.16839990685081802,0.11320269155597265,0.15257344276905091,0.39551678526416928,0.39647512806994878,0.0279644976983323,0.45003226768701349,0.098543784333021647,0.91601781127611137,0.46631701752984406,0.74912879517897013,0.72932927881213305,0.73630694653317386,0.65654828851078395,0.2158032653699469,0.051492646343747513,0.058698541004942192,0.42140682397906343,0.81041041558970084,0.31681912726092532,0.88409165024456293,0.40910812226314991,0.20156698907381579,0.025499661649159764,0.0664404424743337,0.67639679598018987,0.081075484246918739,0.32941153446352728,0.36957977620284993,0.79685747581001076,0.17648521779636109,0.092710447341862373,0.059614530973715274,0.28690663401925826,0.71598590002125551,0.49354184617753655,0.90512452640266094,0.40455306018690851,0.067376302855054235,0.37502464964392657,0.84857856795770115,0.36470809363852186,0.14261117992415187,0.42169955514842217,0.86896255779501907,0.99704139967440508,0.5336924886928649,0.39456635492897724,0.33226634593118193,0.043031781021428538,0.85651533100962329,0.18069310600523686,0.33331615982469898,0.43136435501940273,0.32746071762222156,0.2525902895964709,0.13156505155971882,0.090521716451334638,0.95214313933669159,0.2795444
1965189625,0.57815892193401497,0.35300998396704369,0.31696236625642582,0.77674011425408573,0.37017951600601467,0.19148647321189122,0.011024580859226772,0.39227447939425575,0.90667332742403306,0.98728317371878849,0.70892344342944946,0.63795277690437113,0.62105451992092897,0.92509477103766213,0.880747919775559,0.019452320901134378,0.72040518768955719,0.59107682396007999,0.056511152139247123,0.63877826581291042,0.29647154856897984,0.46455997904910501,0.9479332292697924,0.55193245708044292,0.50341031319253848,0.13333647653690811,0.71312535160241242,0.92510012718318979,0.90354881759386851,0.63629236154578905,0.34196903535843692,0.66365336925392249]}}}
{"appid":10008,"table_content":{"cmd":"add","fields":{"doc_id":"12352","vectorTest":[0.65961039283734224,0.27043822368000936,0.64228011661821827,0.42925378820168242,0.80221269164431075,0.33218356260748488,0.71050662351852933,0.65427794925079275,0.24065527243765758,0.91911724249535554,0.60383730399920066,0.38324548171087564,0.78012549613894244,0.49500474090610397,0.8850606087121462,0.39798760206009975,0.27793777053714158,0.24252637809551877,0.001491292986987604,0.39498723711381994,0.20199736629259149,0.77207475857061525,0.60845674447471743,0.77905694243619905,0.49939220582378929,0.38451120369112124,0.83073091454206527,0.12795719598683805,0.24966503634883214,0.50315260074299939,0.12821142650170969,0.97868662605597823,0.72686931506817121,0.22339902215918594,0.68257701856147601,0.44870802553145994,0.67205892552563284,0.9093655533252093,0.83876213881392359,0.21188940542140325,0.9510701731194583,0.87789161796290094,0.70907033857424562,0.97071340918420512,0.42290940478792355,0.83727640471667952,0.093319298668515474,0.67342663674760483,0.098370607329168497,0.69749160189395742,0.2159513171494539,0.77783001693849019,0.080736606798452307,0.40530116468846167,0.20749043980753698,0.0850326700803513,0.51918253436962469,0.97848730865066746,0.16609021261992671,0.91847447970690632,0.30475189937952751,0.58088937509850147,0.84387769538082191,0.38807225067909934,0.006013542362938629,0.0099198187469985172,0.19837282209053317,0.39338207855759494,0.19652059267528174,0.83525831696845099,0.77973899510812839,0.077203401155966855,0.29141988305195327,0.27548021073090562,0.62170599174955166,0.92117119198747288,0.22541836601784981,0.28479555655095845,0.084633757289208522,0.32779707474589798,0.052098838713169643,0.61088435902621085,0.54457275013200823,0.34790379110112785,0.52696657837076599,0.99986129656112255,0.85524936481328451,0.90296345448298465,0.12880547991231042,0.61581807243544762,0.83034150681294006,0.98783082795265009,0.64561858090415947,0.75033229995275952,0.75737191723932962,0.66664417
84728917,0.54002750223954288,0.73025470653451319,0.7797417912568716,0.48864703385541985,0.5937586680812138,0.96717315589314123,0.078377669698640781,0.035169148120987301,0.37221134184466426,0.077913408785959051,0.78418740996227654,0.47711991577670193,0.44065240237601522,0.28179312628502173,0.5168050097130471,0.36786059572694435,0.11789148815409142,0.59862499547703829,0.016582696039288143,0.46777718053112016,0.10650807929166588,0.33872911667368116,0.086891982522225783,0.2051579591936995,0.51777902545708288,0.47677920542266761,0.035190217559164842,0.85313907269957323,0.9506138612544236,0.85293546227676487,0.98030031889532698,0.86230742109630143]}}}
{"appid":10008,"table_content":{"cmd":"add","fields":{"doc_id":"12353","vectorTest":[0.82130702248722987,0.9186241625140954,0.59823175057913258,0.82677444923261578,0.53760613430941817,0.91313850580957534,0.16778230533418065,0.018888616561144433,0.85650013645039746,0.84137671503333744,0.78818697282168737,0.72951853437880876,0.73454747567935474,0.58646338610459858,0.56322989895808739,0.92423252863375127,0.60523717737783067,0.10132215259552757,0.61901455485986334,0.73382012763777205,0.29096417890059195,0.31862486346954216,0.57389281935306014,0.64126374312500078,0.055421672900554303,0.97991478334537507,0.79251656408869309,0.63918024288783604,0.32432636209359522,0.9726790432724185,0.50523496106227839,0.92144340084459286,0.56961180884957663,0.44634660576222368,0.9644654826551845,0.12019674014869007,0.98492383730834943,0.22996981635632008,0.59699592964111137,0.57034774100336616,0.76464368773990565,0.41218261105424869,0.055884800907335802,0.50600170827561475,0.17204127152434723,0.52902597369175597,0.53451704251200149,0.68268495125368245,0.47242774760386919,0.3308144240787394,0.37451495910346583,0.13792588610633247,0.72938972986852157,0.32929940199828933,0.6605067691215244,0.063743577493817358,0.90738567591048525,0.10465814744042191,0.099251408526371765,0.3925202309977241,0.66566938882541304,0.49559840269873012,0.46239214033813542,0.21128634923057701,0.40950281704058961,0.10772450495246307,0.48932110241618637,0.11182301270892964,0.35859875528277679,0.29066239379962194,0.32780356458362209,0.25902873683140926,0.54520721365249203,0.069631787542110954,0.017341512027321333,0.13491919052872706,0.89653988570293874,0.74232258366714543,0.60366320404305873,0.95288536858770467,0.91416284856262098,0.24235147367636539,0.90206768606790244,0.44313358172253092,0.56268228188900893,0.3511370644505763,0.036678086748390244,0.56378333591095398,0.34718907779809138,0.77172187033929218,0.6301894492101997,0.89260869808938548,0.8619935455244877,0.33761208011213228,0.66378191190711866,0.68436139747838
953,0.40008104471235384,0.01839210625092124,0.0040859997984107168,0.39029852175984098,0.99994157765779978,0.20006453232932001,0.62804272888694401,0.91948257849243231,0.85278242934290061,0.82784204752069745,0.92361117014591654,0.27586964429918942,0.64500972410953861,0.41818816484999627,0.99204604336694946,0.47074555329563728,0.67533593788396917,0.93333014692671412,0.57110298126565762,0.43834231671080348,0.32004613833743212,0.91546389875969847,0.98210810491215195,0.27970508545893846,0.019777211168980006,0.33629865757128785,0.88614749984473973,0.11126526746074224,0.31207737611228842,0.52482484592440626,0.11937510433781434,0.76087648040617739]}}}
{"appid":10008,"table_content":{"cmd":"add","fields":{"doc_id":"12354","vectorTest":[0.31265323822866586,0.28898696136778479,0.32581034342832244,0.4116571229760157,0.26335448065321354,0.48641836815138084,0.22670860510024859,0.93130090111439345,0.044546970145443919,0.97647496610892903,0.78299561695797548,0.45052378923006187,0.21396262812970818,0.10309799774952215,0.54499044701817789,0.53608718224103835,0.66793490836282021,0.080833325453249033,0.64677732308666191,0.14574249117349108,0.40309229511871225,0.91843952843595855,0.42344651162391339,0.6423154572340809,0.61565416936927408,0.79964706879157843,0.44274479313500803,0.70024695295536565,0.81906407348331955,0.26536088297566807,0.055705387537330064,0.70748686552600393,0.8006655719443545,0.96730533518953232,0.99205181768606532,0.04223770313568824,0.92326281083232464,0.062820452781813585,0.38148959264674714,0.12771899044556073,0.43916055585884994,0.0086025580802102306,0.87400330601717835,0.88701412252952716,0.51849535885514442,0.30577770340823324,0.39639546674340481,0.40264022738594668,0.2249493095996562,0.55129953840906532,0.28589522916281113,0.37123154794387098,0.04945971030838911,0.81414304424438333,0.62129855381329036,0.31081234117221246,0.229096830918359,0.43081385097527836,0.72451673838089226,0.86206817407708924,0.68211163430283606,0.57721462044482363,0.80196840770352162,0.16708968884371983,0.63749432301138764,0.99158095219083042,0.90937719525642091,0.47388863306754708,0.61014860666972792,0.89781440163183579,0.71248061523183592,0.42557317447229925,0.52765755639824363,0.2285892200017475,0.8945813684977858,0.10878499614216956,0.94990810770142642,0.39808010838315533,0.028031065829517844,0.44706663881503356,0.20591714324483928,0.89086477895522354,0.94040956512155471,0.075977005152540786,0.13853079003848592,0.51399432954599922,0.86708658845729125,0.80627047789028261,0.84004596031108669,0.91662608354294028,0.82453455906913531,0.65617604233241755,0.90359382824439638,0.2785574998309937,0.8242417056502126,0.683297620372492
42,0.71683092743807675,0.78088097371678511,0.66226071796824582,0.75080708984512923,0.20214098300627556,0.56878285154138963,0.64860908549723439,0.69780533109888565,0.6375211567052862,0.80913026625581441,0.78933857435172894,0.72537706237728294,0.02710776644268904,0.9530982047343034,0.13636698329591951,0.20891467502954517,0.13072549310254955,0.87139316906722586,0.88893312620717446,0.0063695346598086466,0.69535676681000802,0.81980114929185277,0.65441192709784035,0.4601282251094474,0.73311793326531804,0.56695487603881578,0.44377212229593344,0.52568060145480677,0.56590513790197583,0.64090397672669353,0.59603473118356753,0.76720121424140808]}}}
{"appid":10008,"table_content":{"cmd":"add","fields":{"doc_id":"12355","vectorTest":[0.67810676580837181,0.13421466572583357,0.25083093417919566,0.33135466923252338,0.18346626722404674,0.59522134876650568,0.77430153763813014,0.34432489838214175,0.29537487142608049,0.9039396532630688,0.47633490375062953,0.57388812140075707,0.27272957599472919,0.15895720555689735,0.69658825723907714,0.193626339864134,0.13914813491434547,0.10669623883764742,0.30246918566812414,0.29443325816996935,0.0040217295691217037,0.13815482853161054,0.74397447096132752,0.17255257809047406,0.65089739508791233,0.80380974710730901,0.51908419056722643,0.088518434125616649,0.69736803003621795,0.86502103955261012,0.45535497267523978,0.88917447361735924,0.53201411515255659,0.32341796943462409,0.18112580106769566,0.55913517678826696,0.24954929050322169,0.37730780971257599,0.10234260994825629,0.98691653456915684,0.2533289211060637,0.41803545032885175,0.43667769039463317,0.73228695239795438,0.38384140495643992,0.29525424227775293,0.84881112942615444,0.12672813558177082,0.47693037905352675,0.78511103533431759,0.99804107585853163,0.79446762459192932,0.027299528580495055,0.95931306061417099,0.33549308149197554,0.047020827180929092,0.60593774709599257,0.97405717397740121,0.69139981493038116,0.67448450826925976,0.20081161669479264,0.20889479518408852,0.86334933475850872,0.8655373787086591,0.72112000293188572,0.020168148208130541,0.37465107735510672,0.31326104333420235,0.49830528847023176,0.72918336212682089,0.71494113101354773,0.12601531070141264,0.80061328465390602,0.33078745267173137,0.41369388353731662,0.32917482478582732,0.61541218307897283,0.31390747220587878,0.78384467911208999,0.67044839831046299,0.53743088758722912,0.72163531704395389,0.72978266345708775,0.63948188228408853,0.63619900111688898,0.10201007725815835,0.11472285350170748,0.55607979443268662,0.46696316783867342,0.74360437436173477,0.091721445833211013,0.79706476436000451,0.54563823196146477,0.97046833808482602,0.2058049134403544,0.900786164233
15472,0.73511648455698619,0.78997177858845613,0.39748337429155289,0.90910258669025801,0.11458484944533004,0.14916663586992726,0.21470750841288119,0.13049222050077761,0.069356053478695626,0.089530436685474918,0.59674358746714229,0.95694353937847043,0.12084549653900037,0.85557082799421846,0.82223050117207352,0.47203539361681895,0.67260331159354492,0.16400607474205534,0.76594369341342561,0.24159144418786649,0.53736001734354355,0.72297219679073987,0.20547764609138386,0.58412451775740848,0.14447777213082161,0.45722565332879539,0.41714833443539767,0.39586392191392844,0.45094302827511457,0.62429363673927996,0.56010656724626351,0.78282753096055624]}}}

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,11 @@
{
"service_name":"vector_index",
"pid_file" : "vector_index.pid",
"log" : "../log/",
"log_level" : 3,
"daemon": true,
"listen_addr": "127.0.0.1",
"listen_port": 12004,
"socket_dir" : "/tmp/vector_index",
"rocksdb_storage_dir": "../rocksdb"
}

214
src/3rdlib/faiss/AutoTune.h Normal file
View File

@ -0,0 +1,214 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_AUTO_TUNE_H
#define FAISS_AUTO_TUNE_H
#include <stdint.h>
#include <unordered_map>
#include <vector>
#include <faiss/Index.h>
#include <faiss/IndexBinary.h>
namespace faiss {
/**
* Evaluation criterion. Returns a performance measure in [0,1],
* higher is better.
*/
struct AutoTuneCriterion {
typedef Index::idx_t idx_t;
idx_t nq; ///< nb of queries this criterion is evaluated on
idx_t nnn; ///< nb of NNs that the query should request
idx_t gt_nnn; ///< nb of GT NNs required to evaluate the criterion
std::vector<float> gt_D; ///< Ground-truth distances (size nq * gt_nnn)
std::vector<idx_t> gt_I; ///< Ground-truth indexes (size nq * gt_nnn)
AutoTuneCriterion(idx_t nq, idx_t nnn);
/** Initializes the gt_D and gt_I vectors. Must be called before evaluating.
*
* @param gt_nnn nb of ground-truth neighbors provided per query
* @param gt_D_in size nq * gt_nnn
* @param gt_I_in size nq * gt_nnn
*/
void set_groundtruth(
int gt_nnn,
const float* gt_D_in,
const idx_t* gt_I_in);
/** Evaluate the criterion.
*
* @param D size nq * nnn
* @param I size nq * nnn
* @return the criterion, between 0 and 1. Larger is better.
*/
virtual double evaluate(const float* D, const idx_t* I) const = 0;
virtual ~AutoTuneCriterion() {}
};
/// Criterion: 1-recall@R — fraction of queries for which the single
/// ground-truth nearest neighbor appears among the first R results.
struct OneRecallAtRCriterion : AutoTuneCriterion {
idx_t R; ///< rank cutoff within which the true NN must appear
OneRecallAtRCriterion(idx_t nq, idx_t R);
double evaluate(const float* D, const idx_t* I) const override;
~OneRecallAtRCriterion() override {}
};
/// Criterion: average size of the intersection between the returned
/// top-R results and the ground-truth top-R results (intersection@R).
struct IntersectionCriterion : AutoTuneCriterion {
idx_t R; ///< nb of top results considered on each side
IntersectionCriterion(idx_t nq, idx_t R);
double evaluate(const float* D, const idx_t* I) const override;
~IntersectionCriterion() override {}
};
/**
* Maintains a list of experimental results. Each operating point is a
* (perf, t, key) triplet, where higher perf and lower t is
* better. The key field is an arbitrary identifier for the operating point.
*
* Includes primitives to extract the Pareto-optimal operating points in the
* (perf, t) space.
*/
/// One (performance, time) measurement for a given parameter combination.
struct OperatingPoint {
double perf; ///< performance measure (output of a Criterion)
double t; ///< corresponding execution time (ms)
std::string key; ///< key that identifies this op pt
int64_t cno; ///< integer identifier
};
struct OperatingPoints {
/// all operating points
std::vector<OperatingPoint> all_pts;
/// optimal (Pareto-front) operating points, sorted by perf
std::vector<OperatingPoint> optimal_pts;
// begins with a single operating point: t=0, perf=0
OperatingPoints();
/// add operating points from other to this, with a prefix to the keys
int merge_with(
const OperatingPoints& other,
const std::string& prefix = "");
/// remove all stored operating points
void clear();
/// add a performance measure. Return whether it is an optimal point
bool add(double perf, double t, const std::string& key, size_t cno = 0);
/// get time required to obtain a given performance measure
double t_for_perf(double perf) const;
/// easy-to-read output
void display(bool only_optimal = true) const;
/// output to a format easy to digest by gnuplot
void all_to_gnuplot(const char* fname) const;
/// same as all_to_gnuplot, but only the optimal points
void optimal_to_gnuplot(const char* fname) const;
};
/// possible values of a parameter, sorted from least to most expensive/accurate
struct ParameterRange {
std::string name; ///< name of the tunable parameter
std::vector<double> values; ///< candidate values, cheapest/least accurate first
};
/** Uses a-priori knowledge on the Faiss indexes to extract tunable parameters.
*/
struct ParameterSpace {
/// all tunable parameters
std::vector<ParameterRange> parameter_ranges;
// exploration parameters
/// verbosity during exploration
int verbose;
/// nb of experiments during optimization (0 = try all combinations)
int n_experiments;
/// maximum number of queries to submit at a time.
size_t batchsize;
/// use multithreading over batches (useful to benchmark
/// independent single-searches)
bool thread_over_batches;
/// run tests several times until they reach at least this
/// duration (to avoid jittering in MT mode)
double min_test_duration;
ParameterSpace();
/// nb of combinations, = product of values sizes
size_t n_combinations() const;
/// returns whether combinations c1 >= c2 in the tuple sense
bool combination_ge(size_t c1, size_t c2) const;
/// get string representation of the combination
std::string combination_name(size_t cno) const;
/// print a description on stdout
void display() const;
/// add a new parameter (or return it if it exists)
ParameterRange& add_range(const std::string& name);
/// initialize with reasonable parameters for the index
virtual void initialize(const Index* index);
/// set a combination of parameters on an index
void set_index_parameters(Index* index, size_t cno) const;
/// set a combination of parameters described by a string
void set_index_parameters(Index* index, const char* param_string) const;
/// set one of the parameters (no return value; unknown parameter
/// names are handled internally by the implementation)
virtual void set_index_parameter(
Index* index,
const std::string& name,
double val) const;
/** find an upper bound on the performance and a lower bound on t
* for configuration cno given another operating point op */
void update_bounds(
size_t cno,
const OperatingPoint& op,
double* upper_bound_perf,
double* lower_bound_t) const;
/** explore operating points
* @param index index to run on
* @param nq nb of query vectors
* @param xq query vectors (size nq * index.d)
* @param crit selection criterion
* @param ops resulting operating points
*/
void explore(
Index* index,
size_t nq,
const float* xq,
const AutoTuneCriterion& crit,
OperatingPoints* ops) const;
virtual ~ParameterSpace() {}
};
} // namespace faiss
#endif

View File

@ -0,0 +1,182 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_CLUSTERING_H
#define FAISS_CLUSTERING_H
#include <faiss/Index.h>
#include <vector>
namespace faiss {
/** Class for the clustering parameters. Can be passed to the
* constructor of the Clustering object.
*/
struct ClusteringParameters {
    int niter; ///< clustering iterations
    int nredo; ///< redo clustering this many times and keep best
    bool verbose; ///< verbosity flag
    bool spherical; ///< do we want normalized centroids?
    bool int_centroids; ///< round centroids coordinates to integer
    bool update_index; ///< re-train index after each iteration?
    bool frozen_centroids; ///< use the centroids provided as input and do not
                           ///< change them during iterations
    int min_points_per_centroid; ///< otherwise you get a warning
    int max_points_per_centroid; ///< to limit size of dataset
    int seed; ///< seed for the random number generator
    size_t decode_block_size; ///< how many vectors at a time to decode
    /// sets reasonable defaults
    ClusteringParameters();
};
/// Statistics gathered for one iteration of k-means.
struct ClusteringIterationStats {
    float obj; ///< objective values (sum of distances reported by index)
    double time; ///< seconds for iteration
    double time_search; ///< seconds for just search
    double imbalance_factor; ///< imbalance factor of iteration
    int nsplit; ///< number of cluster splits
};
/** K-means clustering based on assignment - centroid update iterations
*
* The clustering is based on an Index object that assigns training
* points to the centroids. Therefore, at each iteration the centroids
* are added to the index.
*
 * On output, the centroids table is set to the latest version
 * of the centroids and they are also added to the index. If the
 * centroids table is not empty on input, it is also used for
 * initialization.
*
*/
struct Clustering : ClusteringParameters {
    typedef Index::idx_t idx_t;
    size_t d; ///< dimension of the vectors
    size_t k; ///< nb of centroids
    /** centroids (k * d)
     * if centroids are set on input to train, they will be used as
     * initialization
     */
    std::vector<float> centroids;
    /// stats at every iteration of clustering
    std::vector<ClusteringIterationStats> iteration_stats;
    Clustering(int d, int k);
    Clustering(int d, int k, const ClusteringParameters& cp);
    /** run k-means training
     *
     * @param n         nb of training vectors
     * @param x         training vectors, size n * d
     * @param index     index used for assignment
     * @param x_weights weight associated to each vector: NULL or size n
     */
    virtual void train(
            idx_t n,
            const float* x,
            faiss::Index& index,
            const float* x_weights = nullptr);
    /** run with encoded vectors
     *
     * in addition to train()'s parameters takes a codec as parameter
     * to decode the input vectors.
     *
     * @param codec codec used to decode the vectors (nullptr =
     *              vectors are in fact floats)
     */
    void train_encoded(
            idx_t nx,
            const uint8_t* x_in,
            const Index* codec,
            Index& index,
            const float* weights = nullptr);
    /// Post-process the centroids after each centroid update.
    /// includes optional L2 normalization and nearest integer rounding
    void post_process_centroids();
    virtual ~Clustering() {}
};
/// Parameters for ProgressiveDimClustering, extends the plain k-means ones.
struct ProgressiveDimClusteringParameters : ClusteringParameters {
    int progressive_dim_steps; ///< number of incremental steps
    bool apply_pca; ///< apply PCA on input
    ProgressiveDimClusteringParameters();
};
/** generates an index suitable for clustering when called */
struct ProgressiveDimIndexFactory {
    /// ownership of the returned index is transferred to the caller
    virtual Index* operator()(int dim);
    virtual ~ProgressiveDimIndexFactory() {}
};
/** K-means clustering with progressive dimensions used
*
* The clustering first happens in dim 1, then with exponentially increasing
* dimension until d (I steps). This is typically applied after a PCA
* transformation (optional). Reference:
*
* "Improved Residual Vector Quantization for High-dimensional Approximate
* Nearest Neighbor Search"
*
* Shicong Liu, Hongtao Lu, Junru Shao, AAAI'15
*
* https://arxiv.org/abs/1509.05195
*/
struct ProgressiveDimClustering : ProgressiveDimClusteringParameters {
    using idx_t = Index::idx_t;
    size_t d; ///< dimension of the vectors
    size_t k; ///< nb of centroids
    /** centroids (k * d) */
    std::vector<float> centroids;
    /// stats at every iteration of clustering
    std::vector<ClusteringIterationStats> iteration_stats;
    ProgressiveDimClustering(int d, int k);
    ProgressiveDimClustering(
            int d,
            int k,
            const ProgressiveDimClusteringParameters& cp);
    /** run the training
     * @param n       nb of training vectors
     * @param x       training vectors, size n * d
     * @param factory provides the assignment index for each dimension step
     */
    void train(idx_t n, const float* x, ProgressiveDimIndexFactory& factory);
    virtual ~ProgressiveDimClustering() {}
};
/** simplified interface
 *
 * Convenience one-call k-means (presumably wraps the Clustering class
 * above; see the implementation for the exact defaults used).
 *
 * @param d dimension of the data
 * @param n nb of training vectors
 * @param k nb of output centroids
 * @param x training set (size n * d)
 * @param centroids output centroids (size k * d)
 * @return final quantization error
 */
float kmeans_clustering(
        size_t d,
        size_t n,
        size_t k,
        const float* x,
        float* centroids);
} // namespace faiss
#endif

151
src/3rdlib/faiss/IVFlib.h Normal file
View File

@ -0,0 +1,151 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_IVFLIB_H
#define FAISS_IVFLIB_H
/** Since IVF (inverted file) indexes are of so much use for
* large-scale use cases, we group a few functions related to them in
* this small library. Most functions work both on IndexIVFs and
* IndexIVFs embedded within an IndexPreTransform.
*/
#include <faiss/IndexIVF.h>
#include <vector>
namespace faiss {
namespace ivflib {
/** check if two indexes have the same parameters and are trained in
 * the same way, otherwise throw. */
void check_compatible_for_merge(const Index* index1, const Index* index2);
/** get an IndexIVF from an index. The index may be an IndexIVF or
 * some wrapper class that encloses an IndexIVF
 *
 * throws an exception if this is not the case.
 */
const IndexIVF* extract_index_ivf(const Index* index);
IndexIVF* extract_index_ivf(Index* index);
/// same as above but returns nullptr instead of throwing on failure
const IndexIVF* try_extract_index_ivf(const Index* index);
IndexIVF* try_extract_index_ivf(Index* index);
/** Merge index1 into index0. Works on IndexIVF's and IndexIVF's
 * embedded in a IndexPreTransform. On output, the index1 is empty.
 *
 * @param shift_ids: translate the ids from index1 to index0->prev_ntotal
 */
void merge_into(Index* index0, Index* index1, bool shift_ids);
typedef Index::idx_t idx_t;
/* Returns the cluster the embeddings belong to.
 *
 * @param index Index, which should be an IVF index
 *              (otherwise there are no clusters)
 * @param x     object descriptors for which the centroids should be found,
 *              size n * d
 * @param n     number of descriptors
 * @param centroid_ids
 *              cluster id each object belongs to, size n
 */
void search_centroid(Index* index, const float* x, int n, idx_t* centroid_ids);
/* Searches the index and also reports the centroids of queries and results.
 *
 * @param index Index, which should be an IVF index
 *              (otherwise there are no clusters)
 * @param query_centroid_ids
 *              centroid ids corresponding to the query vectors (size n)
 * @param result_centroid_ids
 *              centroid ids corresponding to the results (size n * k)
 * other arguments are the same as the standard search function
 */
void search_and_return_centroids(
        Index* index,
        size_t n,
        const float* xin,
        long k,
        float* distances,
        idx_t* labels,
        idx_t* query_centroid_ids,
        idx_t* result_centroid_ids);
/** A set of IndexIVFs concatenated together in a FIFO fashion.
* at each "step", the oldest index slice is removed and a new index is added.
*/
struct SlidingIndexWindow {
    /// common index that contains the sliding window
    Index* index;
    /// InvertedLists of index
    ArrayInvertedLists* ils;
    /// number of slices currently in index
    int n_slice;
    /// same as index->nlist
    size_t nlist;
    /// cumulative list sizes at each slice
    std::vector<std::vector<size_t>> sizes;
    /// index should be initially empty and trained
    SlidingIndexWindow(Index* index);
    /** Add one index to the current index and remove the oldest one.
     *
     * @param sub_index     slice to swap in (can be NULL)
     * @param remove_oldest if true, remove the oldest slices */
    void step(const Index* sub_index, bool remove_oldest);
};
/// Get a subset of inverted lists [i0, i1)
ArrayInvertedLists* get_invlist_range(const Index* index, long i0, long i1);
/// Set a subset of inverted lists
void set_invlist_range(Index* index, long i0, long i1, ArrayInvertedLists* src);
/** search an IndexIVF, possibly embedded in an IndexPreTransform with
 * given parameters. This is a way to set the nprobe and get
 * statistics in a thread-safe way.
 *
 * Optionally returns (if non-nullptr):
 * - nb_dis: number of distances computed
 * - ms_per_stage: [0]: preprocessing time
 *                 [1]: coarse quantization,
 *                 [2]: list scanning
 */
void search_with_parameters(
        const Index* index,
        idx_t n,
        const float* x,
        idx_t k,
        float* distances,
        idx_t* labels,
        const IVFSearchParameters* params,
        size_t* nb_dis = nullptr,
        double* ms_per_stage = nullptr);
/** same as search_with_parameters but for range search */
void range_search_with_parameters(
        const Index* index,
        idx_t n,
        const float* x,
        float radius,
        RangeSearchResult* result,
        const IVFSearchParameters* params,
        size_t* nb_dis = nullptr,
        double* ms_per_stage = nullptr);
} // namespace ivflib
} // namespace faiss
#endif

248
src/3rdlib/faiss/Index.h Normal file
View File

@ -0,0 +1,248 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_INDEX_H
#define FAISS_INDEX_H
#include <faiss/MetricType.h>
#include <cstdio>
#include <sstream>
#include <string>
#include <typeinfo>
#define FAISS_VERSION_MAJOR 1
#define FAISS_VERSION_MINOR 7
#define FAISS_VERSION_PATCH 1
/**
* @namespace faiss
*
* Throughout the library, vectors are provided as float * pointers.
* Most algorithms can be optimized when several vectors are processed
* (added/searched) together in a batch. In this case, they are passed
* in as a matrix. When n vectors of size d are provided as float * x,
* component j of vector i is
*
* x[ i * d + j ]
*
* where 0 <= i < n and 0 <= j < d. In other words, matrices are
* always compact. When specifying the size of the matrix, we call it
* an n*d matrix, which implies a row-major storage.
*/
namespace faiss {
/// Forward declarations see AuxIndexStructures.h
struct IDSelector;
struct RangeSearchResult;
struct DistanceComputer;
/** Abstract structure for an index, supports adding vectors and searching them.
*
* All vectors provided at add or search time are 32-bit float arrays,
* although the internal representation may vary.
*/
struct Index {
    using idx_t = int64_t; ///< all indices are this type
    using component_t = float;
    using distance_t = float;
    int d; ///< vector dimension
    idx_t ntotal; ///< total nb of indexed vectors
    bool verbose; ///< verbosity level
    /// set if the Index does not require training, or if training is
    /// done already
    bool is_trained;
    /// type of metric this index uses for search
    MetricType metric_type;
    float metric_arg; ///< argument of the metric type
    explicit Index(idx_t d = 0, MetricType metric = METRIC_L2)
            : d(d),
              ntotal(0),
              verbose(false),
              is_trained(true),
              metric_type(metric),
              metric_arg(0) {}
    virtual ~Index();
    /** Perform training on a representative set of vectors
     *
     * @param n nb of training vectors
     * @param x training vectors, size n * d
     */
    virtual void train(idx_t n, const float* x);
    /** Add n vectors of dimension d to the index.
     *
     * Vectors are implicitly assigned labels ntotal .. ntotal + n - 1
     * This function slices the input vectors in chunks smaller than
     * blocksize_add and calls add_core.
     * @param x input matrix, size n * d
     */
    virtual void add(idx_t n, const float* x) = 0;
    /** Same as add, but stores xids instead of sequential ids.
     *
     * The default implementation fails with an assertion, as it is
     * not supported by all indexes.
     *
     * @param xids if non-null, ids to store for the vectors (size n)
     */
    virtual void add_with_ids(idx_t n, const float* x, const idx_t* xids);
    /** query n vectors of dimension d to the index.
     *
     * return at most k vectors. If there are not enough results for a
     * query, the result array is padded with -1s.
     *
     * @param x input vectors to search, size n * d
     * @param labels output labels of the NNs, size n*k
     * @param distances output pairwise distances, size n*k
     */
    virtual void search(
            idx_t n,
            const float* x,
            idx_t k,
            float* distances,
            idx_t* labels) const = 0;
    /** query n vectors of dimension d to the index.
     *
     * return all vectors with distance < radius. Note that many
     * indexes do not implement the range_search (only the k-NN search
     * is mandatory).
     *
     * @param x input vectors to search, size n * d
     * @param radius search radius
     * @param result result table
     */
    virtual void range_search(
            idx_t n,
            const float* x,
            float radius,
            RangeSearchResult* result) const;
    /** return the indexes of the k vectors closest to the query x.
     *
     * This function is identical to search but only returns labels of
     * neighbors.
     * @param x input vectors to search, size n * d
     * @param labels output labels of the NNs, size n*k
     */
    virtual void assign(idx_t n, const float* x, idx_t* labels, idx_t k = 1)
            const;
    /// removes all elements from the database.
    virtual void reset() = 0;
    /** removes IDs from the index. Not supported by all
     * indexes. Returns the number of elements removed.
     */
    virtual size_t remove_ids(const IDSelector& sel);
    /** Reconstruct a stored vector (or an approximation if lossy coding)
     *
     * this function may not be defined for some indexes
     * @param key id of the vector to reconstruct
     * @param recons reconstructed vector (size d)
     */
    virtual void reconstruct(idx_t key, float* recons) const;
    /** Reconstruct vectors i0 to i0 + ni - 1
     *
     * this function may not be defined for some indexes
     * @param recons reconstructed vector (size ni * d)
     */
    virtual void reconstruct_n(idx_t i0, idx_t ni, float* recons) const;
    /** Similar to search, but also reconstructs the stored vectors (or an
     * approximation in the case of lossy coding) for the search results.
     *
     * If there are not enough results for a query, the resulting arrays
     * is padded with -1s.
     *
     * @param recons reconstructed vectors size (n, k, d)
     **/
    virtual void search_and_reconstruct(
            idx_t n,
            const float* x,
            idx_t k,
            float* distances,
            idx_t* labels,
            float* recons) const;
    /** Computes a residual vector after indexing encoding.
     *
     * The residual vector is the difference between a vector and the
     * reconstruction that can be decoded from its representation in
     * the index. The residual can be used for multiple-stage indexing
     * methods, like IndexIVF's methods.
     *
     * @param x input vector, size d
     * @param residual output residual vector, size d
     * @param key encoded index, as returned by search and assign
     */
    virtual void compute_residual(const float* x, float* residual, idx_t key)
            const;
    /** Computes a residual vector after indexing encoding (batch form).
     * Equivalent to calling compute_residual for each vector.
     *
     * The residual vector is the difference between a vector and the
     * reconstruction that can be decoded from its representation in
     * the index. The residual can be used for multiple-stage indexing
     * methods, like IndexIVF's methods.
     *
     * @param n number of vectors
     * @param xs input vectors, size (n x d)
     * @param residuals output residual vectors, size (n x d)
     * @param keys encoded index, as returned by search and assign
     */
    virtual void compute_residual_n(
            idx_t n,
            const float* xs,
            float* residuals,
            const idx_t* keys) const;
    /** Get a DistanceComputer (defined in AuxIndexStructures) object
     * for this kind of index.
     *
     * DistanceComputer is implemented for indexes that support random
     * access of their vectors.
     */
    virtual DistanceComputer* get_distance_computer() const;
    /* The standalone codec interface */
    /** size of the produced codes in bytes */
    virtual size_t sa_code_size() const;
    /** encode a set of vectors
     *
     * @param n number of vectors
     * @param x input vectors, size n * d
     * @param bytes output encoded vectors, size n * sa_code_size()
     */
    virtual void sa_encode(idx_t n, const float* x, uint8_t* bytes) const;
    /** decode a set of vectors
     *
     * @param n number of vectors
     * @param bytes input encoded vectors, size n * sa_code_size()
     * @param x output vectors, size n * d
     */
    virtual void sa_decode(idx_t n, const uint8_t* bytes, float* x) const;
};
} // namespace faiss
#endif

View File

@ -0,0 +1,85 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#pragma once
#include <vector>
#include <faiss/IndexIVF.h>
#include <faiss/IndexPQ.h>
namespace faiss {
struct IndexIVFPQ;
/** Same as an IndexIVFPQ without the inverted lists: codes are stored
* sequentially
*
 * The class is mainly intended to store encoded vectors that can be
 * accessed randomly; the search function is not implemented.
*/
struct Index2Layer : Index {
    /// first level quantizer
    Level1Quantizer q1;
    /// second level quantizer is always a PQ
    ProductQuantizer pq;
    /// Codes. Size ntotal * code_size.
    std::vector<uint8_t> codes;
    /// size of the code for the first level (ceil(log8(q1.nlist)))
    size_t code_size_1;
    /// size of the code for the second level
    size_t code_size_2;
    /// code_size_1 + code_size_2
    size_t code_size;
    Index2Layer(
            Index* quantizer,
            size_t nlist,
            int M,
            int nbit = 8,
            MetricType metric = METRIC_L2);
    Index2Layer();
    ~Index2Layer();
    void train(idx_t n, const float* x) override;
    void add(idx_t n, const float* x) override;
    /// not implemented
    void search(
            idx_t n,
            const float* x,
            idx_t k,
            float* distances,
            idx_t* labels) const override;
    void reconstruct_n(idx_t i0, idx_t ni, float* recons) const override;
    void reconstruct(idx_t key, float* recons) const override;
    void reset() override;
    DistanceComputer* get_distance_computer() const override;
    /// transfer the flat codes to an IVFPQ index
    void transfer_to_IVFPQ(IndexIVFPQ& other) const;
    /* The standalone codec interface */
    size_t sa_code_size() const override;
    void sa_encode(idx_t n, const float* x, uint8_t* bytes) const override;
    void sa_decode(idx_t n, const uint8_t* bytes, float* x) const override;
};
} // namespace faiss

View File

@ -0,0 +1,207 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#ifndef FAISS_INDEX_ADDITIVE_QUANTIZER_H
#define FAISS_INDEX_ADDITIVE_QUANTIZER_H
#include <faiss/impl/AdditiveQuantizer.h>
#include <cstdint>
#include <vector>
#include <faiss/Index.h>
#include <faiss/impl/LocalSearchQuantizer.h>
#include <faiss/impl/ResidualQuantizer.h>
#include <faiss/impl/platform_macros.h>
namespace faiss {
/// Abstract class for additive quantizers. The search functions are in common.
struct IndexAdditiveQuantizer : Index {
    // the quantizer, this points to the relevant field in the inheriting
    // classes
    AdditiveQuantizer* aq;
    using Search_type_t = AdditiveQuantizer::Search_type_t;
    explicit IndexAdditiveQuantizer(
            idx_t d = 0,
            AdditiveQuantizer* aq = nullptr,
            MetricType metric = METRIC_L2);
    /// size of residual quantizer codes + norms
    size_t code_size;
    /// Codes. Size ntotal * rq.code_size
    std::vector<uint8_t> codes;
    void search(
            idx_t n,
            const float* x,
            idx_t k,
            float* distances,
            idx_t* labels) const override;
    void reset() override;
    void add(idx_t n, const float* x) override;
    /* The standalone codec interface */
    size_t sa_code_size() const override;
    void sa_encode(idx_t n, const float* x, uint8_t* bytes) const override;
    void sa_decode(idx_t n, const uint8_t* bytes, float* x) const override;
};
/** Index based on a residual quantizer. Stored vectors are
* approximated by residual quantization codes.
* Can also be used as a codec
*/
struct IndexResidualQuantizer : IndexAdditiveQuantizer {
    /// The residual quantizer used to encode the vectors
    ResidualQuantizer rq;
    /** Constructor.
     *
     * @param d     dimensionality of the input vectors
     * @param M     number of subquantizers
     * @param nbits number of bit per subvector index
     */
    IndexResidualQuantizer(
            int d,        ///< dimensionality of the input vectors
            size_t M,     ///< number of subquantizers
            size_t nbits, ///< number of bit per subvector index
            MetricType metric = METRIC_L2,
            Search_type_t search_type = AdditiveQuantizer::ST_decompress);
    /// same, with one nbits value per quantization step
    IndexResidualQuantizer(
            int d,
            const std::vector<size_t>& nbits,
            MetricType metric = METRIC_L2,
            Search_type_t search_type = AdditiveQuantizer::ST_decompress);
    IndexResidualQuantizer();
    void train(idx_t n, const float* x) override;
};
/// Index based on a local search quantizer (see LocalSearchQuantizer.h).
struct IndexLocalSearchQuantizer : IndexAdditiveQuantizer {
    LocalSearchQuantizer lsq;
    /** Constructor.
     *
     * @param d     dimensionality of the input vectors
     * @param M     number of subquantizers
     * @param nbits number of bit per subvector index
     */
    IndexLocalSearchQuantizer(
            int d,        ///< dimensionality of the input vectors
            size_t M,     ///< number of subquantizers
            size_t nbits, ///< number of bit per subvector index
            MetricType metric = METRIC_L2,
            Search_type_t search_type = AdditiveQuantizer::ST_decompress);
    IndexLocalSearchQuantizer();
    void train(idx_t n, const float* x) override;
};
/** A "virtual" index where the elements are the residual quantizer centroids.
*
* Intended for use as a coarse quantizer in an IndexIVF.
*/
struct AdditiveCoarseQuantizer : Index {
    AdditiveQuantizer* aq;
    explicit AdditiveCoarseQuantizer(
            idx_t d = 0,
            AdditiveQuantizer* aq = nullptr,
            MetricType metric = METRIC_L2);
    /// norms of centroids, useful for knn-search
    std::vector<float> centroid_norms;
    /// N/A (the centroids are virtual; nothing can be added)
    void add(idx_t n, const float* x) override;
    void search(
            idx_t n,
            const float* x,
            idx_t k,
            float* distances,
            idx_t* labels) const override;
    void reconstruct(idx_t key, float* recons) const override;
    void train(idx_t n, const float* x) override;
    /// N/A
    void reset() override;
};
/** The ResidualCoarseQuantizer is a bit specialized compared to the
* default AdditiveCoarseQuantizer because it can use a beam search
* at search time (slow but may be useful for very large vocabularies) */
struct ResidualCoarseQuantizer : AdditiveCoarseQuantizer {
    /// The residual quantizer used to encode the vectors
    ResidualQuantizer rq;
    /// factor between the beam size and the search k
    /// if negative, use exact search-to-centroid
    float beam_factor;
    /// computes centroid norms if required
    void set_beam_factor(float new_beam_factor);
    /** Constructor.
     *
     * @param d     dimensionality of the input vectors
     * @param M     number of subquantizers
     * @param nbits number of bit per subvector index
     */
    ResidualCoarseQuantizer(
            int d,        ///< dimensionality of the input vectors
            size_t M,     ///< number of subquantizers
            size_t nbits, ///< number of bit per subvector index
            MetricType metric = METRIC_L2);
    /// same, with one nbits value per quantization step
    ResidualCoarseQuantizer(
            int d,
            const std::vector<size_t>& nbits,
            MetricType metric = METRIC_L2);
    void search(
            idx_t n,
            const float* x,
            idx_t k,
            float* distances,
            idx_t* labels) const override;
    ResidualCoarseQuantizer();
};
struct LocalSearchCoarseQuantizer : AdditiveCoarseQuantizer {
    /// The local search quantizer used to encode the vectors
    LocalSearchQuantizer lsq;
    /** Constructor.
     *
     * @param d     dimensionality of the input vectors
     * @param M     number of subquantizers
     * @param nbits number of bit per subvector index
     */
    LocalSearchCoarseQuantizer(
            int d,        ///< dimensionality of the input vectors
            size_t M,     ///< number of subquantizers
            size_t nbits, ///< number of bit per subvector index
            MetricType metric = METRIC_L2);
    LocalSearchCoarseQuantizer();
};
} // namespace faiss
#endif

View File

@ -0,0 +1,175 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_INDEX_BINARY_H
#define FAISS_INDEX_BINARY_H
#include <cstdio>
#include <sstream>
#include <string>
#include <typeinfo>
#include <faiss/Index.h>
#include <faiss/impl/FaissAssert.h>
namespace faiss {
/// Forward declarations see AuxIndexStructures.h
struct IDSelector;
struct RangeSearchResult;
/** Abstract structure for a binary index.
*
* Supports adding vertices and searching them.
*
* All queries are symmetric because there is no distinction between codes and
* vectors.
*/
struct IndexBinary {
    using idx_t = Index::idx_t; ///< all indices are this type
    using component_t = uint8_t;
    using distance_t = int32_t;
    int d; ///< vector dimension
    int code_size; ///< number of bytes per vector ( = d / 8 )
    idx_t ntotal; ///< total nb of indexed vectors
    bool verbose; ///< verbosity level
    /// set if the Index does not require training, or if training is done
    /// already
    bool is_trained;
    /// type of metric this index uses for search
    MetricType metric_type;
    explicit IndexBinary(idx_t d = 0, MetricType metric = METRIC_L2)
            : d(d),
              code_size(d / 8),
              ntotal(0),
              verbose(false),
              is_trained(true),
              metric_type(metric) {
        // binary vectors are packed into bytes, so d must be a multiple of 8
        FAISS_THROW_IF_NOT(d % 8 == 0);
    }
    virtual ~IndexBinary();
    /** Perform training on a representative set of vectors.
     *
     * @param n nb of training vectors
     * @param x training vectors, size n * d / 8
     */
    virtual void train(idx_t n, const uint8_t* x);
    /** Add n vectors of dimension d to the index.
     *
     * Vectors are implicitly assigned labels ntotal .. ntotal + n - 1
     * @param x input matrix, size n * d / 8
     */
    virtual void add(idx_t n, const uint8_t* x) = 0;
    /** Same as add, but stores xids instead of sequential ids.
     *
     * The default implementation fails with an assertion, as it is
     * not supported by all indexes.
     *
     * @param xids if non-null, ids to store for the vectors (size n)
     */
    virtual void add_with_ids(idx_t n, const uint8_t* x, const idx_t* xids);
    /** Query n vectors of dimension d to the index.
     *
     * return at most k vectors. If there are not enough results for a
     * query, the result array is padded with -1s.
     *
     * @param x input vectors to search, size n * d / 8
     * @param labels output labels of the NNs, size n*k
     * @param distances output pairwise distances, size n*k
     */
    virtual void search(
            idx_t n,
            const uint8_t* x,
            idx_t k,
            int32_t* distances,
            idx_t* labels) const = 0;
    /** Query n vectors of dimension d to the index.
     *
     * return all vectors with distance < radius. Note that many indexes
     * do not implement the range_search (only the k-NN search is
     * mandatory). The distances are converted to float to reuse the
     * RangeSearchResult structure, but they are integer. By convention,
     * only distances < radius (strict comparison) are returned,
     * ie. radius = 0 does not return any result and 1 returns only
     * exact same vectors.
     *
     * @param x input vectors to search, size n * d / 8
     * @param radius search radius
     * @param result result table
     */
    virtual void range_search(
            idx_t n,
            const uint8_t* x,
            int radius,
            RangeSearchResult* result) const;
    /** Return the indexes of the k vectors closest to the query x.
     *
     * This function is identical to search but only returns labels of
     * neighbors.
     * @param x input vectors to search, size n * d / 8
     * @param labels output labels of the NNs, size n*k
     */
    void assign(idx_t n, const uint8_t* x, idx_t* labels, idx_t k = 1) const;
    /// Removes all elements from the database.
    virtual void reset() = 0;
    /** Removes IDs from the index. Not supported by all indexes.
     */
    virtual size_t remove_ids(const IDSelector& sel);
    /** Reconstruct a stored vector.
     *
     * This function may not be defined for some indexes.
     * @param key id of the vector to reconstruct
     * @param recons reconstructed vector (size d / 8)
     */
    virtual void reconstruct(idx_t key, uint8_t* recons) const;
    /** Reconstruct vectors i0 to i0 + ni - 1.
     *
     * This function may not be defined for some indexes.
     * @param recons reconstructed vectors (size ni * d / 8)
     */
    virtual void reconstruct_n(idx_t i0, idx_t ni, uint8_t* recons) const;
    /** Similar to search, but also reconstructs the stored vectors (or an
     * approximation in the case of lossy coding) for the search results.
     *
     * If there are not enough results for a query, the resulting array
     * is padded with -1s.
     *
     * @param recons reconstructed vectors size (n, k, d)
     **/
    virtual void search_and_reconstruct(
            idx_t n,
            const uint8_t* x,
            idx_t k,
            int32_t* distances,
            idx_t* labels,
            uint8_t* recons) const;
    /** Display the actual class name and some more info. */
    void display() const;
};
} // namespace faiss
#endif // FAISS_INDEX_BINARY_H

View File

@ -0,0 +1,62 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef INDEX_BINARY_FLAT_H
#define INDEX_BINARY_FLAT_H
#include <vector>
#include <faiss/IndexBinary.h>
namespace faiss {
/** Index that stores the full vectors and performs exhaustive search. */
struct IndexBinaryFlat : IndexBinary {
    /// database vectors, size ntotal * d / 8
    std::vector<uint8_t> xb;
    /** Select between using a heap or counting to select the k smallest values
     * when scanning inverted lists.
     */
    bool use_heap = true;
    /// nb of queries processed per batch during search
    size_t query_batch_size = 32;
    explicit IndexBinaryFlat(idx_t d);
    void add(idx_t n, const uint8_t* x) override;
    void reset() override;
    void search(
            idx_t n,
            const uint8_t* x,
            idx_t k,
            int32_t* distances,
            idx_t* labels) const override;
    void range_search(
            idx_t n,
            const uint8_t* x,
            int radius,
            RangeSearchResult* result) const override;
    void reconstruct(idx_t key, uint8_t* recons) const override;
    /** Remove some ids. Note that because of the indexing structure,
     * the semantics of this operation are different from the usual ones:
     * the new ids are shifted. */
    size_t remove_ids(const IDSelector& sel) override;
    IndexBinaryFlat() {}
};
} // namespace faiss
#endif // INDEX_BINARY_FLAT_H

View File

@ -0,0 +1,53 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_INDEX_BINARY_FROM_FLOAT_H
#define FAISS_INDEX_BINARY_FROM_FLOAT_H
#include <faiss/IndexBinary.h>
namespace faiss {
struct Index;
/** IndexBinary backed by a float Index.
*
* Supports adding vectors and searching them.
*
* All queries are symmetric because there is no distinction between codes and
* vectors.
*/
struct IndexBinaryFromFloat : IndexBinary {
/// the underlying float index; binary vectors are converted before delegation
Index* index = nullptr;
bool own_fields = false; ///< Whether object owns the index pointer.
IndexBinaryFromFloat();
explicit IndexBinaryFromFloat(Index* index);
~IndexBinaryFromFloat();
void add(idx_t n, const uint8_t* x) override;
void reset() override;
void search(
idx_t n,
const uint8_t* x,
idx_t k,
int32_t* distances,
idx_t* labels) const override;
/// trains the underlying float index
void train(idx_t n, const uint8_t* x) override;
};
} // namespace faiss
#endif // FAISS_INDEX_BINARY_FROM_FLOAT_H

View File

@ -0,0 +1,57 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#pragma once
#include <faiss/IndexBinaryFlat.h>
#include <faiss/impl/HNSW.h>
#include <faiss/utils/utils.h>
namespace faiss {
/** The HNSW index is a normal random-access index with a HNSW
* link structure built on top */
struct IndexBinaryHNSW : IndexBinary {
typedef HNSW::storage_idx_t storage_idx_t;
// the link structure
HNSW hnsw;
// the sequential storage
bool own_fields; ///< whether object owns the storage pointer
IndexBinary* storage;
explicit IndexBinaryHNSW();
explicit IndexBinaryHNSW(int d, int M = 32);
explicit IndexBinaryHNSW(IndexBinary* storage, int M = 32);
~IndexBinaryHNSW() override;
/// distance computer over the sequential storage
DistanceComputer* get_distance_computer() const;
void add(idx_t n, const uint8_t* x) override;
/// Trains the storage if needed
void train(idx_t n, const uint8_t* x) override;
/// entry point for search
void search(
idx_t n,
const uint8_t* x,
idx_t k,
int32_t* distances,
idx_t* labels) const override;
void reconstruct(idx_t key, uint8_t* recons) const override;
void reset() override;
};
} // namespace faiss

View File

@ -0,0 +1,124 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_BINARY_HASH_H
#define FAISS_BINARY_HASH_H
#include <unordered_map>
#include <vector>
#include <faiss/IndexBinary.h>
#include <faiss/IndexBinaryFlat.h>
#include <faiss/impl/platform_macros.h>
#include <faiss/utils/Heap.h>
namespace faiss {
struct RangeSearchResult;
/** just uses the b first bits of the vectors as a hash value */
struct IndexBinaryHash : IndexBinary {
/// all vectors (id + code) sharing one hash value
struct InvertedList {
std::vector<idx_t> ids;
std::vector<uint8_t> vecs;
/// append one (id, code) entry; code is code_size bytes
void add(idx_t id, size_t code_size, const uint8_t* code);
};
using InvertedListMap = std::unordered_map<idx_t, InvertedList>;
/// maps a hash value to the list of vectors that hash to it
InvertedListMap invlists;
int b, nflip; ///< nb of hash bits / nb of bit flips to use at search time
IndexBinaryHash(int d, int b);
IndexBinaryHash();
void reset() override;
void add(idx_t n, const uint8_t* x) override;
void add_with_ids(idx_t n, const uint8_t* x, const idx_t* xids) override;
void range_search(
idx_t n,
const uint8_t* x,
int radius,
RangeSearchResult* result) const override;
void search(
idx_t n,
const uint8_t* x,
idx_t k,
int32_t* distances,
idx_t* labels) const override;
/// display information about the hash table
void display() const;
/// number of entries in the hash table
size_t hashtable_size() const;
};
/// statistics gathered during IndexBinaryHash searches
struct IndexBinaryHashStats {
size_t nq; // nb of queries run
size_t n0; // nb of empty lists
size_t nlist; // nb of non-empty inverted lists scanned
size_t ndis; // nb of distances computed
IndexBinaryHashStats() {
reset();
}
void reset();
};
FAISS_API extern IndexBinaryHashStats indexBinaryHash_stats;
/** maintains nhash hash maps, each using the b first bits as a hash value */
struct IndexBinaryMultiHash : IndexBinary {
// where the vectors are actually stored
IndexBinaryFlat* storage;
bool own_fields; ///< whether object owns the storage pointer
// maps hash values to the ids that hash to them
using Map = std::unordered_map<idx_t, std::vector<idx_t>>;
// the different hashes, size nhash
std::vector<Map> maps;
int nhash; ///< nb of hash maps
int b; ///< nb bits per hash map
int nflip; ///< nb bit flips to use at search time
IndexBinaryMultiHash(int d, int nhash, int b);
IndexBinaryMultiHash();
~IndexBinaryMultiHash();
void reset() override;
void add(idx_t n, const uint8_t* x) override;
void range_search(
idx_t n,
const uint8_t* x,
int radius,
RangeSearchResult* result) const override;
void search(
idx_t n,
const uint8_t* x,
idx_t k,
int32_t* distances,
idx_t* labels) const override;
/// total number of entries over the hash maps
size_t hashtable_size() const;
};
} // namespace faiss
#endif

View File

@ -0,0 +1,250 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_INDEX_BINARY_IVF_H
#define FAISS_INDEX_BINARY_IVF_H
#include <vector>
#include <faiss/Clustering.h>
#include <faiss/IndexBinary.h>
#include <faiss/IndexIVF.h>
#include <faiss/utils/Heap.h>
namespace faiss {
struct BinaryInvertedListScanner;
/** Index based on an inverted file (IVF)
*
* In the inverted file, the quantizer (an IndexBinary instance) provides a
* quantization index for each vector to be added. The quantization
* index maps to a list (aka inverted list or posting list), where the
* id of the vector is stored.
*
* Otherwise the object is similar to the IndexIVF
*/
struct IndexBinaryIVF : IndexBinary {
/// Access to the actual data
InvertedLists* invlists;
/// whether object owns the invlists
bool own_invlists;
size_t nprobe; ///< number of probes at query time
size_t max_codes; ///< max nb of codes to visit to do a query
/** Select between using a heap or counting to select the k smallest values
* when scanning inverted lists.
*/
bool use_heap = true;
/// map for direct access to the elements. Enables reconstruct().
DirectMap direct_map;
IndexBinary* quantizer; ///< quantizer that maps vectors to inverted lists
size_t nlist; ///< number of possible key values
bool own_fields; ///< whether object owns the quantizer
ClusteringParameters cp; ///< to override default clustering params
Index* clustering_index; ///< to override index used during clustering
/** The Inverted file takes a quantizer (an IndexBinary) on input,
* which implements the function mapping a vector to a list
* identifier. The pointer is borrowed: the quantizer should not
* be deleted while the IndexBinaryIVF is in use.
*/
IndexBinaryIVF(IndexBinary* quantizer, size_t d, size_t nlist);
IndexBinaryIVF();
~IndexBinaryIVF() override;
void reset() override;
/// Trains the quantizer
void train(idx_t n, const uint8_t* x) override;
void add(idx_t n, const uint8_t* x) override;
void add_with_ids(idx_t n, const uint8_t* x, const idx_t* xids) override;
/** Implementation of vector addition where the vector assignments are
* predefined.
*
* @param precomputed_idx quantization indices for the input vectors
* (size n)
*/
void add_core(
idx_t n,
const uint8_t* x,
const idx_t* xids,
const idx_t* precomputed_idx);
/** Search a set of vectors, that are pre-quantized by the IVF
* quantizer. Fill in the corresponding heaps with the query
* results. search() calls this.
*
* @param n nb of vectors to query
* @param x query vectors, size nx * d
* @param assign coarse quantization indices, size nx * nprobe
* @param centroid_dis
* distances to coarse centroids, size nx * nprobe
* @param distance
* output distances, size n * k
* @param labels output labels, size n * k
* @param store_pairs store inv list index + inv list offset
* instead in upper/lower 32 bit of result,
* instead of ids (used for reranking).
* @param params used to override the object's search parameters
*/
void search_preassigned(
idx_t n,
const uint8_t* x,
idx_t k,
const idx_t* assign,
const int32_t* centroid_dis,
int32_t* distances,
idx_t* labels,
bool store_pairs,
const IVFSearchParameters* params = nullptr) const;
virtual BinaryInvertedListScanner* get_InvertedListScanner(
bool store_pairs = false) const;
/** assign the vectors, then call search_preassign */
void search(
idx_t n,
const uint8_t* x,
idx_t k,
int32_t* distances,
idx_t* labels) const override;
void range_search(
idx_t n,
const uint8_t* x,
int radius,
RangeSearchResult* result) const override;
/// range search over vectors whose coarse assignment is already computed
void range_search_preassigned(
idx_t n,
const uint8_t* x,
int radius,
const idx_t* assign,
const int32_t* centroid_dis,
RangeSearchResult* result) const;
void reconstruct(idx_t key, uint8_t* recons) const override;
/** Reconstruct a subset of the indexed vectors.
*
* Overrides default implementation to bypass reconstruct() which requires
* direct_map to be maintained.
*
* @param i0 first vector to reconstruct
* @param ni nb of vectors to reconstruct
* @param recons output array of reconstructed vectors, size ni * d / 8
*/
void reconstruct_n(idx_t i0, idx_t ni, uint8_t* recons) const override;
/** Similar to search, but also reconstructs the stored vectors (or an
* approximation in the case of lossy coding) for the search results.
*
* Overrides default implementation to avoid having to maintain direct_map
* and instead fetch the code offsets through the `store_pairs` flag in
* search_preassigned().
*
* @param recons reconstructed vectors size (n, k, d / 8)
*/
void search_and_reconstruct(
idx_t n,
const uint8_t* x,
idx_t k,
int32_t* distances,
idx_t* labels,
uint8_t* recons) const override;
/** Reconstruct a vector given the location in terms of (inv list index +
* inv list offset) instead of the id.
*
* Useful for reconstructing when the direct_map is not maintained and
* the inv list offset is computed by search_preassigned() with
* `store_pairs` set.
*/
virtual void reconstruct_from_offset(
idx_t list_no,
idx_t offset,
uint8_t* recons) const;
/// Dataset manipulation functions
size_t remove_ids(const IDSelector& sel) override;
/** moves the entries from another dataset to self. On output,
* other is empty. add_id is added to all moved ids (for
* sequential ids, this would be this->ntotal */
virtual void merge_from(IndexBinaryIVF& other, idx_t add_id);
/// nb of vectors stored in inverted list list_no
size_t get_list_size(size_t list_no) const {
return invlists->list_size(list_no);
}
/** initialize a direct map
*
* @param new_maintain_direct_map if true, create a direct map,
* else clear it
*/
void make_direct_map(bool new_maintain_direct_map = true);
void set_direct_map_type(DirectMap::Type type);
/// replace the inverted lists, old one is deallocated if own_invlists
void replace_invlists(InvertedLists* il, bool own = false);
};
/** Scanner used by IndexBinaryIVF (see get_InvertedListScanner) to compute
* query-to-code distances over one inverted list at a time. */
struct BinaryInvertedListScanner {
using idx_t = Index::idx_t;
/// from now on we handle this query.
virtual void set_query(const uint8_t* query_vector) = 0;
/// following codes come from this inverted list
virtual void set_list(idx_t list_no, uint8_t coarse_dis) = 0;
/// compute a single query-to-code distance
virtual uint32_t distance_to_code(const uint8_t* code) const = 0;
/** compute the distances to codes. (distances, labels) should be
* organized as a min- or max-heap
*
* @param n number of codes to scan
* @param codes codes to scan (n * code_size)
* @param ids corresponding ids (ignored if store_pairs)
* @param distances heap distances (size k)
* @param labels heap labels (size k)
* @param k heap size
*/
virtual size_t scan_codes(
size_t n,
const uint8_t* codes,
const idx_t* ids,
int32_t* distances,
idx_t* labels,
size_t k) const = 0;
/// scan codes and add those within `radius` of the query to result
virtual void scan_codes_range(
size_t n,
const uint8_t* codes,
const idx_t* ids,
int radius,
RangeQueryResult& result) const = 0;
virtual ~BinaryInvertedListScanner() {}
};
} // namespace faiss
#endif // FAISS_INDEX_BINARY_IVF_H

View File

@ -0,0 +1,114 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef INDEX_FLAT_H
#define INDEX_FLAT_H
#include <vector>
#include <faiss/Index.h>
namespace faiss {
/** Index that stores the full vectors and performs exhaustive search */
struct IndexFlat : Index {
/// database vectors, size ntotal * d
std::vector<float> xb;
explicit IndexFlat(idx_t d, MetricType metric = METRIC_L2);
/// append n vectors (n * d floats) to the database
void add(idx_t n, const float* x) override;
/// remove all vectors from the index
void reset() override;
/// exhaustive k-NN search over the stored vectors
void search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const override;
/// return all database vectors within distance `radius` of each query
void range_search(
idx_t n,
const float* x,
float radius,
RangeSearchResult* result) const override;
/// copy stored vector `key` into recons (d floats)
void reconstruct(idx_t key, float* recons) const override;
/** compute distance with a subset of vectors
*
* @param x query vectors, size n * d
* @param labels indices of the vectors that should be compared
* for each query vector, size n * k
* @param distances
* corresponding output distances, size n * k
*/
void compute_distance_subset(
idx_t n,
const float* x,
idx_t k,
float* distances,
const idx_t* labels) const;
/** remove some ids. NB that because of the indexing structure,
* the semantics of this operation are different from the usual
* ones: the new ids are shifted */
size_t remove_ids(const IDSelector& sel) override;
IndexFlat() {}
DistanceComputer* get_distance_computer() const override;
/* The standalone codec interface (just memcopies in this case) */
size_t sa_code_size() const override;
void sa_encode(idx_t n, const float* x, uint8_t* bytes) const override;
void sa_decode(idx_t n, const uint8_t* bytes, float* x) const override;
};
/// IndexFlat specialized for the inner-product metric
struct IndexFlatIP : IndexFlat {
explicit IndexFlatIP(idx_t d) : IndexFlat(d, METRIC_INNER_PRODUCT) {}
IndexFlatIP() {}
};
/// IndexFlat specialized for the L2 (Euclidean) metric
struct IndexFlatL2 : IndexFlat {
explicit IndexFlatL2(idx_t d) : IndexFlat(d, METRIC_L2) {}
IndexFlatL2() {}
};
/// optimized version for 1D "vectors"
/// (keeps a sorted permutation of the database values to speed up search)
struct IndexFlat1D : IndexFlatL2 {
bool continuous_update; ///< is the permutation updated continuously?
std::vector<idx_t> perm; ///< sorted database indices
explicit IndexFlat1D(bool continuous_update = true);
/// if not continuous_update, call this between the last add and
/// the first search
void update_permutation();
void add(idx_t n, const float* x) override;
void reset() override;
/// Warn: the distances returned are L1 not L2
void search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const override;
};
} // namespace faiss
#endif

View File

@ -0,0 +1,186 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#pragma once
#include <vector>
#include <faiss/IndexFlat.h>
#include <faiss/IndexPQ.h>
#include <faiss/IndexScalarQuantizer.h>
#include <faiss/impl/HNSW.h>
#include <faiss/utils/utils.h>
namespace faiss {
struct IndexHNSW;
/** Reconstructs approximate vectors for an IndexHNSW from the coordinates
* of their graph neighbors, using a trained codebook. */
struct ReconstructFromNeighbors {
typedef Index::idx_t idx_t;
typedef HNSW::storage_idx_t storage_idx_t;
const IndexHNSW& index;
size_t M; // number of neighbors
size_t k; // number of codebook entries
size_t nsq; // number of subvectors
size_t code_size;
int k_reorder; // nb to reorder. -1 = all
std::vector<float> codebook; // size nsq * k * (M + 1)
std::vector<uint8_t> codes; // size ntotal * code_size
size_t ntotal;
size_t d, dsub; // derived values
explicit ReconstructFromNeighbors(
const IndexHNSW& index,
size_t k = 256,
size_t nsq = 1);
/// codes must be added in the correct order and the IndexHNSW
/// must be populated and sorted
void add_codes(size_t n, const float* x);
/// compute distances from query to the reconstructions of a shortlist of ids
size_t compute_distances(
size_t n,
const idx_t* shortlist,
const float* query,
float* distances) const;
/// called by add_codes
void estimate_code(const float* x, storage_idx_t i, uint8_t* code) const;
/// called by compute_distances
void reconstruct(storage_idx_t i, float* x, float* tmp) const;
/// reconstruct vectors n0 .. n0 + ni - 1 into x
void reconstruct_n(storage_idx_t n0, storage_idx_t ni, float* x) const;
/// get the M+1 -by-d table for neighbor coordinates for vector i
void get_neighbor_table(storage_idx_t i, float* out) const;
};
/** The HNSW index is a normal random-access index with a HNSW
* link structure built on top */
struct IndexHNSW : Index {
typedef HNSW::storage_idx_t storage_idx_t;
// the link structure
HNSW hnsw;
// the sequential storage
bool own_fields; ///< whether object owns the storage pointer
Index* storage;
/// optional helper to reconstruct vectors from their graph neighbors
ReconstructFromNeighbors* reconstruct_from_neighbors;
explicit IndexHNSW(int d = 0, int M = 32, MetricType metric = METRIC_L2);
explicit IndexHNSW(Index* storage, int M = 32);
~IndexHNSW() override;
void add(idx_t n, const float* x) override;
/// Trains the storage if needed
void train(idx_t n, const float* x) override;
/// entry point for search
void search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const override;
void reconstruct(idx_t key, float* recons) const override;
void reset() override;
/// shrink the number of stored level-0 neighbors to `size` per node
void shrink_level_0_neighbors(int size);
/** Perform search only on level 0, given the starting points for
* each vertex.
*
* @param search_type 1:perform one search per nprobe, 2: enqueue
* all entry points
*/
void search_level_0(
idx_t n,
const float* x,
idx_t k,
const storage_idx_t* nearest,
const float* nearest_d,
float* distances,
idx_t* labels,
int nprobe = 1,
int search_type = 1) const;
/// alternative graph building
void init_level_0_from_knngraph(int k, const float* D, const idx_t* I);
/// alternative graph building
void init_level_0_from_entry_points(
int npt,
const storage_idx_t* points,
const storage_idx_t* nearests);
// reorder links from nearest to farthest
void reorder_links();
void link_singletons();
};
/** Flat index topped with a HNSW structure to access elements
* more efficiently.
*/
struct IndexHNSWFlat : IndexHNSW {
IndexHNSWFlat();
IndexHNSWFlat(int d, int M, MetricType metric = METRIC_L2);
};
/** PQ index topped with a HNSW structure to access elements
* more efficiently.
*/
struct IndexHNSWPQ : IndexHNSW {
IndexHNSWPQ();
IndexHNSWPQ(int d, int pq_m, int M);
void train(idx_t n, const float* x) override;
};
/** SQ index topped with a HNSW structure to access elements
* more efficiently.
*/
struct IndexHNSWSQ : IndexHNSW {
IndexHNSWSQ();
IndexHNSWSQ(
int d,
ScalarQuantizer::QuantizerType qtype,
int M,
MetricType metric = METRIC_L2);
};
/** 2-level code structure with fast random access
*/
struct IndexHNSW2Level : IndexHNSW {
IndexHNSW2Level();
IndexHNSW2Level(Index* quantizer, size_t nlist, int m_pq, int M);
// NOTE(review): appears to convert the index to an IVF layout — confirm in .cpp
void flip_to_ivf();
/// entry point for search
void search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const override;
};
} // namespace faiss

434
src/3rdlib/faiss/IndexIVF.h Normal file
View File

@ -0,0 +1,434 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_INDEX_IVF_H
#define FAISS_INDEX_IVF_H
#include <stdint.h>
#include <unordered_map>
#include <vector>
#include <faiss/Clustering.h>
#include <faiss/Index.h>
#include <faiss/impl/platform_macros.h>
#include <faiss/invlists/DirectMap.h>
#include <faiss/invlists/InvertedLists.h>
#include <faiss/utils/Heap.h>
namespace faiss {
/** Encapsulates a quantizer object for the IndexIVF
*
* The class isolates the fields that are independent of the storage
* of the lists (especially training)
*/
struct Level1Quantizer {
Index* quantizer; ///< quantizer that maps vectors to inverted lists
size_t nlist; ///< number of possible key values
/**
* = 0: use the quantizer as index in a kmeans training
* = 1: just pass on the training set to the train() of the quantizer
* = 2: kmeans training on a flat index + add the centroids to the quantizer
*/
char quantizer_trains_alone;
bool own_fields; ///< whether object owns the quantizer (false by default)
ClusteringParameters cp; ///< to override default clustering params
Index* clustering_index; ///< to override index used during clustering
/// Trains the quantizer and calls train_residual to train sub-quantizers
void train_q1(
size_t n,
const float* x,
bool verbose,
MetricType metric_type);
/// compute the number of bytes required to store list ids
size_t coarse_code_size() const;
/// write list_no into code, using coarse_code_size() bytes
void encode_listno(Index::idx_t list_no, uint8_t* code) const;
/// inverse of encode_listno: read a list id back from code
Index::idx_t decode_listno(const uint8_t* code) const;
Level1Quantizer(Index* quantizer, size_t nlist);
Level1Quantizer();
~Level1Quantizer();
};
/// search-time parameters that can override an IndexIVF's defaults per query
struct IVFSearchParameters {
size_t nprobe; ///< number of probes at query time
size_t max_codes; ///< max nb of codes to visit to do a query
IVFSearchParameters() : nprobe(1), max_codes(0) {}
virtual ~IVFSearchParameters() {}
};
struct InvertedListScanner;
struct IndexIVFStats;
/** Index based on an inverted file (IVF)
*
* In the inverted file, the quantizer (an Index instance) provides a
* quantization index for each vector to be added. The quantization
* index maps to a list (aka inverted list or posting list), where the
* id of the vector is stored.
*
* The inverted list object is required only after training. If none is
* set externally, an ArrayInvertedLists is used automatically.
*
* At search time, the vector to be searched is also quantized, and
* only the list corresponding to the quantization index is
* searched. This speeds up the search by making it
* non-exhaustive. This can be relaxed using multi-probe search: a few
* (nprobe) quantization indices are selected and several inverted
* lists are visited.
*
* Sub-classes implement a post-filtering of the index that refines
* the distance estimation from the query to database vectors.
*/
struct IndexIVF : Index, Level1Quantizer {
/// Access to the actual data
InvertedLists* invlists;
/// whether object owns the invlists
bool own_invlists;
size_t code_size; ///< code size per vector in bytes
size_t nprobe; ///< number of probes at query time
size_t max_codes; ///< max nb of codes to visit to do a query
/** Parallel mode determines how queries are parallelized with OpenMP
*
* 0 (default): split over queries
* 1: parallelize over inverted lists
* 2: parallelize over both
* 3: split over queries with a finer granularity
*
* PARALLEL_MODE_NO_HEAP_INIT: bitwise OR'd with the previous values to
* prevent the heap from being initialized and finalized
*/
int parallel_mode;
const int PARALLEL_MODE_NO_HEAP_INIT = 1024;
/** optional map that maps back ids to invlist entries. This
* enables reconstruct() */
DirectMap direct_map;
/** The Inverted file takes a quantizer (an Index) on input,
* which implements the function mapping a vector to a list
* identifier.
*/
IndexIVF(
Index* quantizer,
size_t d,
size_t nlist,
size_t code_size,
MetricType metric = METRIC_L2);
void reset() override;
/// Trains the quantizer and calls train_residual to train sub-quantizers
void train(idx_t n, const float* x) override;
/// Calls add_with_ids with NULL ids
void add(idx_t n, const float* x) override;
/// default implementation that calls encode_vectors
void add_with_ids(idx_t n, const float* x, const idx_t* xids) override;
/** Implementation of vector addition where the vector assignments are
* predefined. The default implementation hands over the code extraction to
* encode_vectors.
*
* @param precomputed_idx quantization indices for the input vectors
* (size n)
*/
virtual void add_core(
idx_t n,
const float* x,
const idx_t* xids,
const idx_t* precomputed_idx);
/** Encodes a set of vectors as they would appear in the inverted lists
*
* @param list_nos inverted list ids as returned by the
* quantizer (size n). -1s are ignored.
* @param codes output codes, size n * code_size
* @param include_listno
* include the list ids in the code (in this case add
* ceil(log8(nlist)) to the code size)
*/
virtual void encode_vectors(
idx_t n,
const float* x,
const idx_t* list_nos,
uint8_t* codes,
bool include_listno = false) const = 0;
/** Add vectors that are computed with the standalone codec
*
* @param codes codes to add size n * sa_code_size()
* @param xids corresponding ids, size n
*/
void add_sa_codes(idx_t n, const uint8_t* codes, const idx_t* xids);
/// Sub-classes that encode the residuals can train their encoders here
/// does nothing by default
virtual void train_residual(idx_t n, const float* x);
/** search a set of vectors, that are pre-quantized by the IVF
* quantizer. Fill in the corresponding heaps with the query
* results. The default implementation uses InvertedListScanners
* to do the search.
*
* @param n nb of vectors to query
* @param x query vectors, size nx * d
* @param assign coarse quantization indices, size nx * nprobe
* @param centroid_dis
* distances to coarse centroids, size nx * nprobe
* @param distance
* output distances, size n * k
* @param labels output labels, size n * k
* @param store_pairs store inv list index + inv list offset
* instead in upper/lower 32 bit of result,
* instead of ids (used for reranking).
* @param params used to override the object's search parameters
* @param stats search stats to be updated (can be null)
*/
virtual void search_preassigned(
idx_t n,
const float* x,
idx_t k,
const idx_t* assign,
const float* centroid_dis,
float* distances,
idx_t* labels,
bool store_pairs,
const IVFSearchParameters* params = nullptr,
IndexIVFStats* stats = nullptr) const;
/** assign the vectors, then call search_preassign */
void search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const override;
void range_search(
idx_t n,
const float* x,
float radius,
RangeSearchResult* result) const override;
/// range search over vectors whose coarse assignment is already computed
void range_search_preassigned(
idx_t nx,
const float* x,
float radius,
const idx_t* keys,
const float* coarse_dis,
RangeSearchResult* result,
bool store_pairs = false,
const IVFSearchParameters* params = nullptr,
IndexIVFStats* stats = nullptr) const;
/** Get a scanner for this index (store_pairs means ignore labels)
*
* The default search implementation uses this to compute the distances
*/
virtual InvertedListScanner* get_InvertedListScanner(
bool store_pairs = false) const;
/** reconstruct a vector. Works only if maintain_direct_map is set to 1 or 2
*/
void reconstruct(idx_t key, float* recons) const override;
/** Update a subset of vectors.
*
* The index must have a direct_map
*
* @param nv nb of vectors to update
* @param idx vector indices to update, size nv
* @param v vectors of new values, size nv*d
*/
virtual void update_vectors(int nv, const idx_t* idx, const float* v);
/** Reconstruct a subset of the indexed vectors.
*
* Overrides default implementation to bypass reconstruct() which requires
* direct_map to be maintained.
*
* @param i0 first vector to reconstruct
* @param ni nb of vectors to reconstruct
* @param recons output array of reconstructed vectors, size ni * d
*/
void reconstruct_n(idx_t i0, idx_t ni, float* recons) const override;
/** Similar to search, but also reconstructs the stored vectors (or an
* approximation in the case of lossy coding) for the search results.
*
* Overrides default implementation to avoid having to maintain direct_map
* and instead fetch the code offsets through the `store_pairs` flag in
* search_preassigned().
*
* @param recons reconstructed vectors size (n, k, d)
*/
void search_and_reconstruct(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels,
float* recons) const override;
/** Reconstruct a vector given the location in terms of (inv list index +
* inv list offset) instead of the id.
*
* Useful for reconstructing when the direct_map is not maintained and
* the inv list offset is computed by search_preassigned() with
* `store_pairs` set.
*/
virtual void reconstruct_from_offset(
int64_t list_no,
int64_t offset,
float* recons) const;
/// Dataset manipulation functions
size_t remove_ids(const IDSelector& sel) override;
/** check that the two indexes are compatible (ie, they are
* trained in the same way and have the same
* parameters). Otherwise throw. */
void check_compatible_for_merge(const IndexIVF& other) const;
/** moves the entries from another dataset to self. On output,
* other is empty. add_id is added to all moved ids (for
* sequential ids, this would be this->ntotal */
virtual void merge_from(IndexIVF& other, idx_t add_id);
/** copy a subset of the entries index to the other index
*
* if subset_type == 0: copies ids in [a1, a2)
* if subset_type == 1: copies ids if id % a1 == a2
* if subset_type == 2: copies inverted lists such that a1
* elements are left before and a2 elements are after
*/
virtual void copy_subset_to(
IndexIVF& other,
int subset_type,
idx_t a1,
idx_t a2) const;
~IndexIVF() override;
/// nb of vectors stored in inverted list list_no
size_t get_list_size(size_t list_no) const {
return invlists->list_size(list_no);
}
/** initialize a direct map
*
* @param new_maintain_direct_map if true, create a direct map,
* else clear it
*/
void make_direct_map(bool new_maintain_direct_map = true);
void set_direct_map_type(DirectMap::Type type);
/// replace the inverted lists, old one is deallocated if own_invlists
void replace_invlists(InvertedLists* il, bool own = false);
/* The standalone codec interface (except sa_decode that is specific) */
size_t sa_code_size() const override;
void sa_encode(idx_t n, const float* x, uint8_t* bytes) const override;
IndexIVF();
};
struct RangeQueryResult;
/** Object that handles a query. The inverted lists to scan are
* provided externally. The object has a lot of state, but
* distance_to_code and scan_codes can be called in multiple
* threads */
struct InvertedListScanner {
using idx_t = Index::idx_t;
idx_t list_no = -1; ///< remember current list
bool keep_max = false; ///< keep maximum instead of minimum
/// store positions in invlists rather than labels
bool store_pairs = false;
/// used in default implementation of scan_codes
size_t code_size = 0;
/// from now on we handle this query.
virtual void set_query(const float* query_vector) = 0;
/// following codes come from this inverted list
virtual void set_list(idx_t list_no, float coarse_dis) = 0;
/// compute a single query-to-code distance
virtual float distance_to_code(const uint8_t* code) const = 0;
/** scan a set of codes, compute distances to current query and
* update heap of results if necessary. Default implementation
* calls distance_to_code.
*
* @param n number of codes to scan
* @param codes codes to scan (n * code_size)
* @param ids corresponding ids (ignored if store_pairs)
* @param distances heap distances (size k)
* @param labels heap labels (size k)
* @param k heap size
* @return number of heap updates performed
*/
virtual size_t scan_codes(
size_t n,
const uint8_t* codes,
const idx_t* ids,
float* distances,
idx_t* labels,
size_t k) const;
/** scan a set of codes, compute distances to current query and
* update results if distances are below radius
*
* (default implementation fails) */
virtual void scan_codes_range(
size_t n,
const uint8_t* codes,
const idx_t* ids,
float radius,
RangeQueryResult& result) const;
virtual ~InvertedListScanner() {}
};
/// statistics collected during IndexIVF searches (see search_preassigned)
struct IndexIVFStats {
size_t nq; // nb of queries run
size_t nlist; // nb of inverted lists scanned
size_t ndis; // nb of distances computed
size_t nheap_updates; // nb of times the heap was updated
double quantization_time; // time spent quantizing vectors (in ms)
double search_time; // time spent searching lists (in ms)
IndexIVFStats() {
reset();
}
void reset();
/// accumulate another stats object into this one
void add(const IndexIVFStats& other);
};
// global var that collects them all
FAISS_API extern IndexIVFStats indexIVF_stats;
} // namespace faiss
#endif

View File

@ -0,0 +1,121 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#ifndef FAISS_INDEX_IVF_ADDITIVE_QUANTIZER_H
#define FAISS_INDEX_IVF_ADDITIVE_QUANTIZER_H
#include <faiss/impl/AdditiveQuantizer.h>
#include <cstdint>
#include <vector>
#include <faiss/IndexIVF.h>
#include <faiss/impl/LocalSearchQuantizer.h>
#include <faiss/impl/ResidualQuantizer.h>
#include <faiss/impl/platform_macros.h>
namespace faiss {
/// Abstract class for IVF additive quantizers.
/// The search functions are in common.
struct IndexIVFAdditiveQuantizer : IndexIVF {
// the quantizer
AdditiveQuantizer* aq;
// NOTE(review): appears to select residual encoding w.r.t. the coarse
// quantizer — confirm in the implementation
bool by_residual = true;
int use_precomputed_table = 0; // for future use
using Search_type_t = AdditiveQuantizer::Search_type_t;
IndexIVFAdditiveQuantizer(
AdditiveQuantizer* aq,
Index* quantizer,
size_t d,
size_t nlist,
MetricType metric = METRIC_L2);
explicit IndexIVFAdditiveQuantizer(AdditiveQuantizer* aq);
void train_residual(idx_t n, const float* x) override;
void encode_vectors(
idx_t n,
const float* x,
const idx_t* list_nos,
uint8_t* codes,
bool include_listnos = false) const override;
InvertedListScanner* get_InvertedListScanner(
bool store_pairs) const override;
~IndexIVFAdditiveQuantizer() override;
};
/** IndexIVF based on a residual quantizer. Stored vectors are
 * approximated by residual quantization codes.
 */
struct IndexIVFResidualQuantizer : IndexIVFAdditiveQuantizer {
    /// The residual quantizer used to encode the vectors
    ResidualQuantizer rq;
    /** Constructor.
     *
     * @param quantizer   coarse quantizer mapping vectors to inverted lists
     * @param d           dimensionality of the input vectors
     * @param nlist       number of inverted lists
     * @param nbits       number of bits per subvector index, one entry per
     *                    quantization level
     * @param search_type how the stored codes are compared at search time
     */
    IndexIVFResidualQuantizer(
            Index* quantizer,
            size_t d,
            size_t nlist,
            const std::vector<size_t>& nbits,
            MetricType metric = METRIC_L2,
            Search_type_t search_type = AdditiveQuantizer::ST_decompress);
    /** Same as above with M subquantizers of nbits bits each. */
    IndexIVFResidualQuantizer(
            Index* quantizer,
            size_t d,
            size_t nlist,
            size_t M,     /* number of subquantizers */
            size_t nbits, /* number of bit per subvector index */
            MetricType metric = METRIC_L2,
            Search_type_t search_type = AdditiveQuantizer::ST_decompress);
    IndexIVFResidualQuantizer();
    virtual ~IndexIVFResidualQuantizer();
};
/** IndexIVF based on a local search quantizer (LSQ). Stored vectors are
 * approximated by local search quantization codes.
 */
struct IndexIVFLocalSearchQuantizer : IndexIVFAdditiveQuantizer {
    /// The LSQ quantizer used to encode the vectors
    LocalSearchQuantizer lsq;
    /** Constructor.
     *
     * @param quantizer coarse quantizer mapping vectors to inverted lists
     * @param d         dimensionality of the input vectors
     * @param nlist     number of inverted lists
     * @param M         number of subquantizers
     * @param nbits     number of bit per subvector index
     */
    IndexIVFLocalSearchQuantizer(
            Index* quantizer,
            size_t d,
            size_t nlist,
            size_t M,     /* number of subquantizers */
            size_t nbits, /* number of bit per subvector index */
            MetricType metric = METRIC_L2,
            Search_type_t search_type = AdditiveQuantizer::ST_decompress);
    IndexIVFLocalSearchQuantizer();
    virtual ~IndexIVFLocalSearchQuantizer();
};
} // namespace faiss
#endif

View File

@ -0,0 +1,106 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_INDEX_IVF_FLAT_H
#define FAISS_INDEX_IVF_FLAT_H
#include <stdint.h>
#include <unordered_map>
#include <faiss/IndexIVF.h>
namespace faiss {
/** Inverted file with stored vectors. Here the inverted file
 * pre-selects the vectors to be searched, but they are not otherwise
 * encoded, the code array just contains the raw float entries.
 */
struct IndexIVFFlat : IndexIVF {
    IndexIVFFlat(
            Index* quantizer,
            size_t d,
            size_t nlist_,
            MetricType = METRIC_L2);
    /// add vectors; precomputed_idx optionally carries their coarse
    /// assignments
    void add_core(
            idx_t n,
            const float* x,
            const idx_t* xids,
            const idx_t* precomputed_idx) override;
    /// the "codes" are just the raw float entries of the vectors
    void encode_vectors(
            idx_t n,
            const float* x,
            const idx_t* list_nos,
            uint8_t* codes,
            bool include_listnos = false) const override;
    InvertedListScanner* get_InvertedListScanner(
            bool store_pairs) const override;
    void reconstruct_from_offset(int64_t list_no, int64_t offset, float* recons)
            const override;
    void sa_decode(idx_t n, const uint8_t* bytes, float* x) const override;
    IndexIVFFlat() {}
};
/// IndexIVFFlat variant that stores each distinct vector only once and
/// remembers which ids map to the same vector.
struct IndexIVFFlatDedup : IndexIVFFlat {
    /** Maps ids stored in the index to the ids of vectors that are
     * the same. When a vector is unique, it does not appear in the
     * instances map */
    std::unordered_multimap<idx_t, idx_t> instances;
    IndexIVFFlatDedup(
            Index* quantizer,
            size_t d,
            size_t nlist_,
            MetricType = METRIC_L2);
    /// also dedups the training set
    void train(idx_t n, const float* x) override;
    /// implemented for all IndexIVF* classes
    void add_with_ids(idx_t n, const float* x, const idx_t* xids) override;
    void search_preassigned(
            idx_t n,
            const float* x,
            idx_t k,
            const idx_t* assign,
            const float* centroid_dis,
            float* distances,
            idx_t* labels,
            bool store_pairs,
            const IVFSearchParameters* params = nullptr,
            IndexIVFStats* stats = nullptr) const override;
    size_t remove_ids(const IDSelector& sel) override;
    /// not implemented
    void range_search(
            idx_t n,
            const float* x,
            float radius,
            RangeSearchResult* result) const override;
    /// not implemented
    void update_vectors(int nv, const idx_t* idx, const float* v) override;
    /// not implemented
    void reconstruct_from_offset(int64_t list_no, int64_t offset, float* recons)
            const override;
    IndexIVFFlatDedup() {}
};
} // namespace faiss
#endif

View File

@ -0,0 +1,186 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_INDEX_IVFPQ_H
#define FAISS_INDEX_IVFPQ_H
#include <vector>
#include <faiss/IndexIVF.h>
#include <faiss/IndexPQ.h>
#include <faiss/impl/platform_macros.h>
#include <faiss/utils/AlignedTable.h>
namespace faiss {
/// Search-time parameters specific to IndexIVFPQ, in addition to the
/// generic IVF search parameters.
struct IVFPQSearchParameters : IVFSearchParameters {
    size_t scan_table_threshold; ///< use table computation or on-the-fly?
    int polysemous_ht;           ///< Hamming thresh for polysemous filtering
    IVFPQSearchParameters() : scan_table_threshold(0), polysemous_ht(0) {}
    ~IVFPQSearchParameters() {}
};
/// global cap on the size of the precomputed tables (see
/// initialize_IVFPQ_precomputed_table)
FAISS_API extern size_t precomputed_table_max_bytes;
/** Inverted file with Product Quantizer encoding. Each residual
 * vector is encoded as a product quantizer code.
 */
struct IndexIVFPQ : IndexIVF {
    bool by_residual;    ///< Encode residual or plain vector?
    ProductQuantizer pq; ///< produces the codes
    bool do_polysemous_training; ///< reorder PQ centroids after training?
    PolysemousTraining* polysemous_training; ///< if NULL, use default
    // search-time parameters
    size_t scan_table_threshold; ///< use table computation or on-the-fly?
    int polysemous_ht;           ///< Hamming thresh for polysemous filtering
    /** Precompute table that speed up query preprocessing at some
     * memory cost (used only for by_residual with L2 metric)
     */
    int use_precomputed_table;
    /// if use_precompute_table
    /// size nlist * pq.M * pq.ksub
    AlignedTable<float> precomputed_table;
    IndexIVFPQ(
            Index* quantizer,
            size_t d,
            size_t nlist,
            size_t M,
            size_t nbits_per_idx,
            MetricType metric = METRIC_L2);
    void encode_vectors(
            idx_t n,
            const float* x,
            const idx_t* list_nos,
            uint8_t* codes,
            bool include_listnos = false) const override;
    void sa_decode(idx_t n, const uint8_t* bytes, float* x) const override;
    void add_core(
            idx_t n,
            const float* x,
            const idx_t* xids,
            const idx_t* precomputed_idx) override;
    /// same as add_core, also:
    /// - output 2nd level residuals if residuals_2 != NULL
    /// - accepts precomputed_idx = nullptr
    void add_core_o(
            idx_t n,
            const float* x,
            const idx_t* xids,
            float* residuals_2,
            const idx_t* precomputed_idx = nullptr);
    /// trains the product quantizer
    void train_residual(idx_t n, const float* x) override;
    /// same as train_residual, also output 2nd level residuals
    void train_residual_o(idx_t n, const float* x, float* residuals_2);
    void reconstruct_from_offset(int64_t list_no, int64_t offset, float* recons)
            const override;
    /** Find exact duplicates in the dataset.
     *
     * the duplicates are returned in pre-allocated arrays (see the
     * max sizes).
     *
     * @param lims limits between groups of duplicates
     *             (max size ntotal / 2 + 1)
     * @param ids  ids[lims[i]] : ids[lims[i+1]-1] is a group of
     *             duplicates (max size ntotal)
     * @return n   number of groups found
     */
    size_t find_duplicates(idx_t* ids, size_t* lims) const;
    // map a vector to a binary code knowing the index
    void encode(idx_t key, const float* x, uint8_t* code) const;
    /** Encode multiple vectors
     *
     * @param n            nb vectors to encode
     * @param keys         posting list ids for those vectors (size n)
     * @param x            vectors (size n * d)
     * @param codes        output codes (size n * code_size)
     * @param compute_keys if false, assume keys are precomputed,
     *                     otherwise compute them
     */
    void encode_multiple(
            size_t n,
            idx_t* keys,
            const float* x,
            uint8_t* codes,
            bool compute_keys = false) const;
    /// inverse of encode_multiple
    void decode_multiple(
            size_t n,
            const idx_t* keys,
            const uint8_t* xcodes,
            float* x) const;
    InvertedListScanner* get_InvertedListScanner(
            bool store_pairs) const override;
    /// build precomputed table
    void precompute_table();
    IndexIVFPQ();
};
/** Pre-compute distance tables for IVFPQ with by-residual and METRIC_L2
 *
 * @param use_precomputed_table (I/O)
 *        =-1: force disable
 *        =0:  decide heuristically (default: use tables only if they are
 *             smaller than precomputed_table_max_bytes); set
 *             use_precomputed_table on output
 *        =1:  tables that work for all quantizers (size 256 * nlist * M)
 *        =2:  specific version for MultiIndexQuantizer (much more compact)
 * @param precomputed_table precomputed table to initialize
 */
void initialize_IVFPQ_precomputed_table(
        int& use_precomputed_table,
        const Index* quantizer,
        const ProductQuantizer& pq,
        AlignedTable<float>& precomputed_table,
        bool verbose);
/// statistics are robust to internal threading, but not if
/// IndexIVFPQ::search_preassigned is called by multiple threads
struct IndexIVFPQStats {
    size_t nrefine; ///< nb of refines (IVFPQR)
    size_t n_hamming_pass;
    ///< nb of passed Hamming distance tests (for polysemous)
    // timings measured with the CPU RTC on all threads
    size_t search_cycles;
    size_t refine_cycles; ///< only for IVFPQR
    IndexIVFPQStats() {
        reset();
    }
    /// set all counters to 0
    void reset();
};
// global var that collects them all
FAISS_API extern IndexIVFPQStats indexIVFPQ_stats;
} // namespace faiss
#endif

View File

@ -0,0 +1,191 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <memory>
#include <faiss/IndexIVFPQ.h>
#include <faiss/impl/ProductQuantizer.h>
#include <faiss/utils/AlignedTable.h>
namespace faiss {
/** Fast scan version of IVFPQ. Works for 4-bit PQ for now.
 *
 * The codes in the inverted lists are not stored sequentially but
 * grouped in blocks of size bbs. This makes it possible to very quickly
 * compute distances with SIMD instructions.
 *
 * Implementations (implem):
 * 0: auto-select implementation (default)
 * 1: orig's search, re-implemented
 * 2: orig's search, re-ordered by invlist
 * 10: optimizer int16 search, collect results in heap, no qbs
 * 11: idem, collect results in reservoir
 * 12: optimizer int16 search, collect results in heap, uses qbs
 * 13: idem, collect results in reservoir
 */
struct IndexIVFPQFastScan : IndexIVF {
    bool by_residual;    ///< Encode residual or plain vector?
    ProductQuantizer pq; ///< produces the codes
    // size of the kernel
    int bbs; // set at build time
    // M rounded up to a multiple of 2
    size_t M2;
    /// precomputed tables management
    int use_precomputed_table = 0;
    /// if use_precompute_table size (nlist, pq.M, pq.ksub)
    AlignedTable<float> precomputed_table;
    // search-time implementation (see class comment for the values)
    int implem = 0;
    // skip some parts of the computation (for timing)
    int skip = 0;
    // batching factors at search time (0 = default)
    int qbs = 0;
    size_t qbs2 = 0;
    IndexIVFPQFastScan(
            Index* quantizer,
            size_t d,
            size_t nlist,
            size_t M,
            size_t nbits_per_idx,
            MetricType metric = METRIC_L2,
            int bbs = 32);
    IndexIVFPQFastScan();
    // built from an IndexIVFPQ
    explicit IndexIVFPQFastScan(const IndexIVFPQ& orig, int bbs = 32);
    /// orig's inverted lists (for debugging)
    InvertedLists* orig_invlists = nullptr;
    void train_residual(idx_t n, const float* x) override;
    /// build precomputed table, possibly updating use_precomputed_table
    void precompute_table();
    /// same as the regular IVFPQ encoder. The codes are not reorganized by
    /// blocks at that point
    void encode_vectors(
            idx_t n,
            const float* x,
            const idx_t* list_nos,
            uint8_t* codes,
            bool include_listno = false) const override;
    void add_with_ids(idx_t n, const float* x, const idx_t* xids) override;
    void search(
            idx_t n,
            const float* x,
            idx_t k,
            float* distances,
            idx_t* labels) const override;
    // prepare look-up tables
    void compute_LUT(
            size_t n,
            const float* x,
            const idx_t* coarse_ids,
            const float* coarse_dis,
            AlignedTable<float>& dis_tables,
            AlignedTable<float>& biases) const;
    // quantized (uint8) version of the look-up tables
    void compute_LUT_uint8(
            size_t n,
            const float* x,
            const idx_t* coarse_ids,
            const float* coarse_dis,
            AlignedTable<uint8_t>& dis_tables,
            AlignedTable<uint16_t>& biases,
            float* normalizers) const;
    // internal search funcs
    // dispatch to the implementation selected by implem
    template <bool is_max>
    void search_dispatch_implem(
            idx_t n,
            const float* x,
            idx_t k,
            float* distances,
            idx_t* labels) const;
    template <class C>
    void search_implem_1(
            idx_t n,
            const float* x,
            idx_t k,
            float* distances,
            idx_t* labels) const;
    template <class C>
    void search_implem_2(
            idx_t n,
            const float* x,
            idx_t k,
            float* distances,
            idx_t* labels) const;
    // implem 10 and 12 are not multithreaded internally, so
    // export search stats
    template <class C>
    void search_implem_10(
            idx_t n,
            const float* x,
            idx_t k,
            float* distances,
            idx_t* labels,
            int impl,
            size_t* ndis_out,
            size_t* nlist_out) const;
    template <class C>
    void search_implem_12(
            idx_t n,
            const float* x,
            idx_t k,
            float* distances,
            idx_t* labels,
            int impl,
            size_t* ndis_out,
            size_t* nlist_out) const;
};
/// Timing counters for the IVF fast-scan search path.
struct IVFFastScanStats {
    uint64_t times[10];
    uint64_t t_compute_distance_tables, t_round;
    uint64_t t_copy_pack, t_scan, t_to_flat;
    uint64_t reservoir_times[4];
    /// counter i expressed in mega-units
    double Mcy_at(int i) {
        return times[i] / (1000 * 1000.0);
    }
    double Mcy_reservoir_at(int i) {
        return reservoir_times[i] / (1000 * 1000.0);
    }
    IVFFastScanStats() {
        reset();
    }
    /// zero all counters (struct holds only trivially-copyable members,
    /// so memset is safe)
    void reset() {
        memset(this, 0, sizeof(*this));
    }
};
// global var that collects them all
FAISS_API extern IVFFastScanStats IVFFastScan_stats;
} // namespace faiss

View File

@ -0,0 +1,71 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#pragma once
#include <vector>
#include <faiss/IndexIVFPQ.h>
namespace faiss {
/** Index with an additional level of PQ refinement */
struct IndexIVFPQR : IndexIVFPQ {
    ProductQuantizer refine_pq;        ///< 3rd level quantizer
    std::vector<uint8_t> refine_codes; ///< corresponding codes
    /// factor between k requested in search and the k requested from the IVFPQ
    float k_factor;
    /// @param M_refine              nb of subquantizers of the refinement PQ
    /// @param nbits_per_idx_refine  bits per subquantizer of the refinement PQ
    IndexIVFPQR(
            Index* quantizer,
            size_t d,
            size_t nlist,
            size_t M,
            size_t nbits_per_idx,
            size_t M_refine,
            size_t nbits_per_idx_refine);
    void reset() override;
    size_t remove_ids(const IDSelector& sel) override;
    /// trains the two product quantizers
    void train_residual(idx_t n, const float* x) override;
    void add_with_ids(idx_t n, const float* x, const idx_t* xids) override;
    /// same as add_with_ids, but optionally use the precomputed list ids
    void add_core(
            idx_t n,
            const float* x,
            const idx_t* xids,
            const idx_t* precomputed_idx) override;
    void reconstruct_from_offset(int64_t list_no, int64_t offset, float* recons)
            const override;
    void merge_from(IndexIVF& other, idx_t add_id) override;
    void search_preassigned(
            idx_t n,
            const float* x,
            idx_t k,
            const idx_t* assign,
            const float* centroid_dis,
            float* distances,
            idx_t* labels,
            bool store_pairs,
            const IVFSearchParameters* params = nullptr,
            IndexIVFStats* stats = nullptr) const override;
    IndexIVFPQR();
};
} // namespace faiss

View File

@ -0,0 +1,73 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_INDEX_IVFSH_H
#define FAISS_INDEX_IVFSH_H
#include <vector>
#include <faiss/IndexIVF.h>
namespace faiss {
struct VectorTransform;
/** Inverted list that stores binary codes of size nbit. Before the
 * binary conversion, the dimension of the vectors is transformed from
 * dim d into dim nbit by vt (a random rotation by default).
 *
 * Each coordinate is subtracted from a value determined by
 * threshold_type, and split into intervals of size period. Half of
 * the interval is a 0 bit, the other half a 1.
 */
struct IndexIVFSpectralHash : IndexIVF {
    VectorTransform* vt; // transformation from d to nbit dim
    // whether vt is owned by this index (NOTE(review): confirm
    // deallocation in the destructor)
    bool own_fields;
    int nbit;     // nb of bits of the binary signature
    float period; // interval size used for binarization
    /// how the binarization thresholds are chosen
    enum ThresholdType {
        Thresh_global,
        Thresh_centroid,
        Thresh_centroid_half,
        Thresh_median
    };
    ThresholdType threshold_type;
    // size nlist * nbit or 0 if Thresh_global
    std::vector<float> trained;
    IndexIVFSpectralHash(
            Index* quantizer,
            size_t d,
            size_t nlist,
            int nbit,
            float period);
    IndexIVFSpectralHash();
    void train_residual(idx_t n, const float* x) override;
    void encode_vectors(
            idx_t n,
            const float* x,
            const idx_t* list_nos,
            uint8_t* codes,
            bool include_listnos = false) const override;
    InvertedListScanner* get_InvertedListScanner(
            bool store_pairs) const override;
    ~IndexIVFSpectralHash() override;
};
} // namespace faiss
#endif

View File

@ -0,0 +1,85 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef INDEX_LSH_H
#define INDEX_LSH_H
#include <vector>
#include <faiss/Index.h>
#include <faiss/VectorTransform.h>
namespace faiss {
/** The sign of each vector component is put in a binary signature */
struct IndexLSH : Index {
    typedef unsigned char uint8_t;
    int nbits;         ///< nb of bits per vector
    int bytes_per_vec; ///< nb of 8-bits per encoded vector
    bool rotate_data;  ///< whether to apply a random rotation to input
    bool train_thresholds; ///< whether we train thresholds or use 0
    RandomRotationMatrix rrot; ///< optional random rotation
    std::vector<float> thresholds; ///< thresholds to compare with
    /// encoded dataset
    std::vector<uint8_t> codes;
    IndexLSH(
            idx_t d,
            int nbits,
            bool rotate_data = true,
            bool train_thresholds = false);
    /** Preprocesses and resizes the input to the size required to
     * binarize the data
     *
     * @param x input vectors, size n * d
     * @return output vectors, size n * bits. May be the same pointer
     *         as x, otherwise it should be deleted by the caller
     */
    const float* apply_preprocess(idx_t n, const float* x) const;
    void train(idx_t n, const float* x) override;
    void add(idx_t n, const float* x) override;
    void search(
            idx_t n,
            const float* x,
            idx_t k,
            float* distances,
            idx_t* labels) const override;
    void reset() override;
    /// transfer the thresholds to a pre-processing stage (and unset
    /// train_thresholds)
    void transfer_thresholds(LinearTransform* vt);
    ~IndexLSH() override {}
    IndexLSH();
    /* standalone codec interface.
     *
     * The vectors are decoded to +/- 1 (not 0, 1) */
    size_t sa_code_size() const override;
    void sa_encode(idx_t n, const float* x, uint8_t* bytes) const override;
    void sa_decode(idx_t n, const uint8_t* bytes, float* x) const override;
};
} // namespace faiss
#endif

View File

@ -0,0 +1,63 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_INDEX_LATTICE_H
#define FAISS_INDEX_LATTICE_H
#include <vector>
#include <faiss/IndexIVF.h>
#include <faiss/impl/lattice_Zn.h>
namespace faiss {
/** Index that encodes a vector with a series of Zn lattice quantizers
 */
struct IndexLattice : Index {
    /// number of sub-vectors
    int nsq;
    /// dimension of sub-vectors
    size_t dsq;
    /// the lattice quantizer
    ZnSphereCodecAlt zn_sphere_codec;
    /// nb bits used to encode the scale, per subvector
    int scale_nbit, lattice_nbit;
    /// total, in bytes
    size_t code_size;
    /// mins and maxes of the vector norms, per subquantizer
    std::vector<float> trained;
    /// @param r2 squared radius of the Zn sphere codec
    IndexLattice(idx_t d, int nsq, int scale_nbit, int r2);
    /// estimates the per-subquantizer norm ranges (fills `trained`)
    void train(idx_t n, const float* x) override;
    /* The standalone codec interface */
    size_t sa_code_size() const override;
    void sa_encode(idx_t n, const float* x, uint8_t* bytes) const override;
    void sa_decode(idx_t n, const uint8_t* bytes, float* x) const override;
    /// not implemented
    void add(idx_t n, const float* x) override;
    /// not implemented
    void search(
            idx_t n,
            const float* x,
            idx_t k,
            float* distances,
            idx_t* labels) const override;
    /// not implemented
    void reset() override;
};
} // namespace faiss
#endif

View File

@ -0,0 +1,72 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#pragma once
#include <vector>
#include <faiss/IndexFlat.h>
#include <faiss/impl/NNDescent.h>
#include <faiss/utils/utils.h>
namespace faiss {
/** The NNDescent index is a normal random-access index with an NNDescent
 * link structure built on top */
struct IndexNNDescent : Index {
    // internal storage of vectors (32 bits)
    using storage_idx_t = NNDescent::storage_idx_t;
    /// Faiss results are 64-bit
    using idx_t = Index::idx_t;
    // the link structure
    NNDescent nndescent;
    // the sequential storage
    bool own_fields; // whether `storage` is deleted with this index
    Index* storage;
    /// @param K number of neighbors in the KNN graph
    explicit IndexNNDescent(
            int d = 0,
            int K = 32,
            MetricType metric = METRIC_L2);
    explicit IndexNNDescent(Index* storage, int K = 32);
    ~IndexNNDescent() override;
    void add(idx_t n, const float* x) override;
    /// Trains the storage if needed
    void train(idx_t n, const float* x) override;
    /// entry point for search
    void search(
            idx_t n,
            const float* x,
            idx_t k,
            float* distances,
            idx_t* labels) const override;
    void reconstruct(idx_t key, float* recons) const override;
    void reset() override;
};
/** Flat index topped with a NNDescent structure to access elements
 * more efficiently.
 */
struct IndexNNDescentFlat : IndexNNDescent {
    IndexNNDescentFlat();
    IndexNNDescentFlat(int d, int K, MetricType metric = METRIC_L2);
};
} // namespace faiss

View File

@ -0,0 +1,85 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#pragma once
#include <vector>
#include <faiss/IndexFlat.h>
#include <faiss/IndexNNDescent.h>
#include <faiss/impl/NSG.h>
#include <faiss/utils/utils.h>
namespace faiss {
/** The NSG index is a normal random-access index with a NSG
 * link structure built on top */
struct IndexNSG : Index {
    /// the link structure
    NSG nsg;
    /// the sequential storage
    bool own_fields; // whether `storage` is deleted with this index
    Index* storage;
    /// the index is built or not
    bool is_built;
    /// K of KNN graph for building
    int GK;
    /// indicate how to build a knn graph
    /// - 0: build NSG with brute force search
    /// - 1: build NSG with NNDescent
    char build_type;
    /// parameters for nndescent
    int nndescent_S;
    int nndescent_R;
    int nndescent_L;
    int nndescent_iter;
    /// @param R degree bound of the NSG graph
    explicit IndexNSG(int d = 0, int R = 32, MetricType metric = METRIC_L2);
    explicit IndexNSG(Index* storage, int R = 32);
    ~IndexNSG() override;
    /// build the NSG from a precomputed KNN graph of degree GK
    void build(idx_t n, const float* x, idx_t* knn_graph, int GK);
    void add(idx_t n, const float* x) override;
    /// Trains the storage if needed
    void train(idx_t n, const float* x) override;
    /// entry point for search
    void search(
            idx_t n,
            const float* x,
            idx_t k,
            float* distances,
            idx_t* labels) const override;
    void reconstruct(idx_t key, float* recons) const override;
    void reset() override;
    /// sanity-check a user-supplied KNN graph before building
    void check_knn_graph(const idx_t* knn_graph, idx_t n, int K) const;
};
/** Flat index topped with a NSG structure to access elements
 * more efficiently.
 */
struct IndexNSGFlat : IndexNSG {
    IndexNSGFlat();
    IndexNSGFlat(int d, int R, MetricType metric = METRIC_L2);
};
} // namespace faiss

198
src/3rdlib/faiss/IndexPQ.h Normal file
View File

@ -0,0 +1,198 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#ifndef FAISS_INDEX_PQ_H
#define FAISS_INDEX_PQ_H
#include <stdint.h>
#include <vector>
#include <faiss/Index.h>
#include <faiss/impl/PolysemousTraining.h>
#include <faiss/impl/ProductQuantizer.h>
#include <faiss/impl/platform_macros.h>
namespace faiss {
/** Index based on a product quantizer. Stored vectors are
 * approximated by PQ codes. */
struct IndexPQ : Index {
    /// The product quantizer used to encode the vectors
    ProductQuantizer pq;
    /// Codes. Size ntotal * pq.code_size
    std::vector<uint8_t> codes;
    /** Constructor.
     *
     * @param d dimensionality of the input vectors
     * @param M number of subquantizers
     * @param nbits number of bit per subvector index
     */
    IndexPQ(int d,        ///< dimensionality of the input vectors
            size_t M,     ///< number of subquantizers
            size_t nbits, ///< number of bit per subvector index
            MetricType metric = METRIC_L2);
    IndexPQ();
    void train(idx_t n, const float* x) override;
    void add(idx_t n, const float* x) override;
    void search(
            idx_t n,
            const float* x,
            idx_t k,
            float* distances,
            idx_t* labels) const override;
    void reset() override;
    void reconstruct_n(idx_t i0, idx_t ni, float* recons) const override;
    void reconstruct(idx_t key, float* recons) const override;
    size_t remove_ids(const IDSelector& sel) override;
    /* The standalone codec interface */
    size_t sa_code_size() const override;
    void sa_encode(idx_t n, const float* x, uint8_t* bytes) const override;
    void sa_decode(idx_t n, const uint8_t* bytes, float* x) const override;
    DistanceComputer* get_distance_computer() const override;
    /******************************************************
     * Polysemous codes implementation
     ******************************************************/
    bool do_polysemous_training; ///< false = standard PQ
    /// parameters used for the polysemous training
    PolysemousTraining polysemous_training;
    /// how to perform the search in search_core
    enum Search_type_t {
        ST_PQ,                    ///< asymmetric product quantizer (default)
        ST_HE,                    ///< Hamming distance on codes
        ST_generalized_HE,        ///< nb of same codes
        ST_SDC,                   ///< symmetric product quantizer (SDC)
        ST_polysemous,            ///< HE filter (using ht) + PQ combination
        ST_polysemous_generalize, ///< Filter on generalized Hamming
    };
    Search_type_t search_type;
    // just encode the sign of the components, instead of using the PQ encoder
    // used only for the queries
    bool encode_signs;
    /// Hamming threshold used for polysemy
    int polysemous_ht;
    // actual polysemous search
    void search_core_polysemous(
            idx_t n,
            const float* x,
            idx_t k,
            float* distances,
            idx_t* labels) const;
    /// prepare query for a polysemous search, but instead of
    /// computing the result, just get the histogram of Hamming
    /// distances. May be computed on a provided dataset if xb != NULL
    /// @param dist_histogram (M * nbits + 1)
    void hamming_distance_histogram(
            idx_t n,
            const float* x,
            idx_t nb,
            const float* xb,
            int64_t* dist_histogram);
    /** compute pairwise distances between queries and database
     *
     * @param n    nb of query vectors
     * @param x    query vector, size n * d
     * @param dis  output distances, size n * ntotal
     */
    void hamming_distance_table(idx_t n, const float* x, int32_t* dis) const;
};
/// statistics are robust to internal threading, but not if
/// IndexPQ::search is called by multiple threads
struct IndexPQStats {
    size_t nq;    // nb of queries run
    size_t ncode; // nb of codes visited
    size_t n_hamming_pass; // nb of passed Hamming distance tests (for polysemy)
    IndexPQStats() {
        reset();
    }
    /// set all counters to 0
    void reset();
};
// global var that collects them all
FAISS_API extern IndexPQStats indexPQ_stats;
/** Quantizer where centroids are virtual: they are the Cartesian
 * product of sub-centroids. */
struct MultiIndexQuantizer : Index {
    ProductQuantizer pq;
    MultiIndexQuantizer(
            int d,         ///< dimension of the input vectors
            size_t M,      ///< number of subquantizers
            size_t nbits); ///< number of bit per subvector index
    void train(idx_t n, const float* x) override;
    void search(
            idx_t n,
            const float* x,
            idx_t k,
            float* distances,
            idx_t* labels) const override;
    /// add and reset will crash at runtime
    void add(idx_t n, const float* x) override;
    void reset() override;
    MultiIndexQuantizer() {}
    void reconstruct(idx_t key, float* recons) const override;
};
/** MultiIndexQuantizer where the PQ assignment is performed by sub-indexes
 */
struct MultiIndexQuantizer2 : MultiIndexQuantizer {
    /// M Indexes on d / M dimensions
    std::vector<Index*> assign_indexes;
    bool own_fields; // whether the sub-indexes are deleted with this index
    MultiIndexQuantizer2(int d, size_t M, size_t nbits, Index** indexes);
    /// special case with M = 2 assignment sub-indexes
    MultiIndexQuantizer2(
            int d,
            size_t nbits,
            Index* assign_index_0,
            Index* assign_index_1);
    void train(idx_t n, const float* x) override;
    void search(
            idx_t n,
            const float* x,
            idx_t k,
            float* distances,
            idx_t* labels) const override;
};
} // namespace faiss
#endif

View File

@ -0,0 +1,125 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <faiss/IndexPQ.h>
#include <faiss/impl/ProductQuantizer.h>
#include <faiss/utils/AlignedTable.h>
namespace faiss {
/** Fast scan version of IndexPQ. Works for 4-bit PQ for now.
 *
 * The codes are not stored sequentially but grouped in blocks of size bbs.
 * This makes it possible to compute distances quickly with SIMD instructions.
 *
 * Implementations:
 * 12: blocked loop with internal loop on Q with qbs
 * 13: same with reservoir accumulator to store results
 * 14: no qbs with heap accumulator
 * 15: no qbs with reservoir accumulator
 */
struct IndexPQFastScan : Index {
    ProductQuantizer pq;
    // implementation to select (see class comment for the values)
    int implem = 0;
    // skip some parts of the computation (for timing)
    int skip = 0;
    // size of the kernel
    int bbs;     // set at build time
    int qbs = 0; // query block size 0 = use default
    // packed version of the codes
    size_t ntotal2;
    size_t M2;
    AlignedTable<uint8_t> codes;
    // this is for testing purposes only (set when initialized by IndexPQ)
    const uint8_t* orig_codes = nullptr;
    IndexPQFastScan(
            int d,
            size_t M,
            size_t nbits,
            MetricType metric = METRIC_L2,
            int bbs = 32);
    IndexPQFastScan();
    /// build from an existing IndexPQ
    explicit IndexPQFastScan(const IndexPQ& orig, int bbs = 32);
    void train(idx_t n, const float* x) override;
    void add(idx_t n, const float* x) override;
    void reset() override;
    void search(
            idx_t n,
            const float* x,
            idx_t k,
            float* distances,
            idx_t* labels) const override;
    // called by search function
    void compute_quantized_LUT(
            idx_t n,
            const float* x,
            uint8_t* lut,
            float* normalizers) const;
    // dispatch to the implementation selected by implem
    template <bool is_max>
    void search_dispatch_implem(
            idx_t n,
            const float* x,
            idx_t k,
            float* distances,
            idx_t* labels) const;
    template <class C>
    void search_implem_2(
            idx_t n,
            const float* x,
            idx_t k,
            float* distances,
            idx_t* labels) const;
    template <class C>
    void search_implem_12(
            idx_t n,
            const float* x,
            idx_t k,
            float* distances,
            idx_t* labels,
            int impl) const;
    template <class C>
    void search_implem_14(
            idx_t n,
            const float* x,
            idx_t k,
            float* distances,
            idx_t* labels,
            int impl) const;
};
/// Timing counters for the flat PQ fast-scan search path.
struct FastScanStats {
    uint64_t t0, t1, t2, t3;
    FastScanStats() {
        reset();
    }
    /// zero all counters (struct holds only PODs, so memset is safe)
    void reset() {
        memset(this, 0, sizeof(*this));
    }
};
// global var that collects them all
FAISS_API extern FastScanStats FastScan_stats;
} // namespace faiss

View File

@ -0,0 +1,90 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#pragma once
#include <faiss/Index.h>
#include <faiss/VectorTransform.h>
namespace faiss {
/** Index that applies a LinearTransform transform on vectors before
 * handing them over to a sub-index */
struct IndexPreTransform : Index {
    std::vector<VectorTransform*> chain; ///! chain of transforms
    Index* index;                        ///! the sub-index
    bool own_fields; ///! whether pointers are deleted in destructor
    explicit IndexPreTransform(Index* index);
    IndexPreTransform();
    /// ltrans is the last transform before the index
    IndexPreTransform(VectorTransform* ltrans, Index* index);
    /// insert a transform at the head of the chain
    void prepend_transform(VectorTransform* ltrans);
    void train(idx_t n, const float* x) override;
    void add(idx_t n, const float* x) override;
    void add_with_ids(idx_t n, const float* x, const idx_t* xids) override;
    void reset() override;
    /** removes IDs from the index. Not supported by all indexes.
     */
    size_t remove_ids(const IDSelector& sel) override;
    void search(
            idx_t n,
            const float* x,
            idx_t k,
            float* distances,
            idx_t* labels) const override;
    /* range search, no attempt is done to change the radius */
    void range_search(
            idx_t n,
            const float* x,
            float radius,
            RangeSearchResult* result) const override;
    void reconstruct(idx_t key, float* recons) const override;
    void reconstruct_n(idx_t i0, idx_t ni, float* recons) const override;
    void search_and_reconstruct(
            idx_t n,
            const float* x,
            idx_t k,
            float* distances,
            idx_t* labels,
            float* recons) const override;
    /// apply the transforms in the chain. The returned float * may be
    /// equal to x, otherwise it should be deallocated.
    const float* apply_chain(idx_t n, const float* x) const;
    /// Reverse the transforms in the chain. May not be implemented for
    /// all transforms in the chain or may return approximate results.
    void reverse_chain(idx_t n, const float* xt, float* x) const;
    DistanceComputer* get_distance_computer() const override;
    /* standalone codec interface */
    size_t sa_code_size() const override;
    void sa_encode(idx_t n, const float* x, uint8_t* bytes) const override;
    void sa_decode(idx_t n, const uint8_t* bytes, float* x) const override;
    ~IndexPreTransform() override;
};
} // namespace faiss

View File

@ -0,0 +1,72 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <faiss/Index.h>
namespace faiss {
/** Index that queries in a base_index (a fast one) and refines the
 * results with an exact search, hopefully improving the results.
 */
struct IndexRefine : Index {
/// faster index to pre-select the vectors that should be filtered
Index* base_index;
/// refinement index
Index* refine_index;
bool own_fields; ///< should the base index be deallocated?
bool own_refine_index; ///< same with the refinement index
/// factor between k requested in search and the k requested from
/// the base_index (should be >= 1)
float k_factor = 1;
/// initialize from empty index
IndexRefine(Index* base_index, Index* refine_index);
IndexRefine();
void train(idx_t n, const float* x) override;
void add(idx_t n, const float* x) override;
void reset() override;
/// search: the top k_factor * k candidates from base_index are
/// re-ranked by refine_index before the top k are returned
void search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const override;
// reconstruct is routed to the refine_index
void reconstruct(idx_t key, float* recons) const override;
~IndexRefine() override;
};
/** Version where the refinement index is an IndexFlat. It has one additional
 * constructor that takes a table of elements to add to the flat refinement
 * index */
struct IndexRefineFlat : IndexRefine {
explicit IndexRefineFlat(Index* base_index);
/// second constructor: pre-fills the flat refinement index with the
/// table of elements xb
IndexRefineFlat(Index* base_index, const float* xb);
IndexRefineFlat();
void search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const override;
};
} // namespace faiss

View File

@ -0,0 +1,88 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <faiss/Index.h>
#include <faiss/IndexBinary.h>
#include <faiss/impl/ThreadedIndex.h>
namespace faiss {
/// Takes individual faiss::Index instances, and splits queries for
/// sending to each Index instance, and joins the results together
/// when done.
/// Each index is managed by a separate CPU thread.
template <typename IndexT>
class IndexReplicasTemplate : public ThreadedIndex<IndexT> {
public:
using idx_t = typename IndexT::idx_t;
using component_t = typename IndexT::component_t;
using distance_t = typename IndexT::distance_t;
/// The dimension that all sub-indices must share will be the dimension of
/// the first sub-index added
/// @param threaded do we use one thread per sub-index or do queries
/// sequentially?
explicit IndexReplicasTemplate(bool threaded = true);
/// @param d the dimension that all sub-indices must share
/// @param threaded do we use one thread per sub index or do queries
/// sequentially?
explicit IndexReplicasTemplate(idx_t d, bool threaded = true);
/// int version due to the implicit bool conversion ambiguity of int as
/// dimension
explicit IndexReplicasTemplate(int d, bool threaded = true);
/// Alias for addIndex()
void add_replica(IndexT* index) {
this->addIndex(index);
}
/// Alias for removeIndex()
void remove_replica(IndexT* index) {
this->removeIndex(index);
}
/// faiss::Index API
/// All indices receive the same call
void train(idx_t n, const component_t* x) override;
/// faiss::Index API
/// All indices receive the same call
void add(idx_t n, const component_t* x) override;
/// faiss::Index API
/// Query is partitioned into a slice for each sub-index
/// split by ceil(n / #indices) for our sub-indices
void search(
idx_t n,
const component_t* x,
idx_t k,
distance_t* distances,
idx_t* labels) const override;
/// reconstructs from the first index
void reconstruct(idx_t, component_t* v) const override;
/// Synchronize the top-level index (IndexReplicas) with data in the
/// sub-indices
void syncWithSubIndexes();
protected:
/// Called just after an index is added
void onAfterAddIndex(IndexT* index) override;
/// Called just after an index is removed
void onAfterRemoveIndex(IndexT* index) override;
};
using IndexReplicas = IndexReplicasTemplate<Index>;
using IndexBinaryReplicas = IndexReplicasTemplate<IndexBinary>;
} // namespace faiss

View File

@ -0,0 +1,123 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_INDEX_SCALAR_QUANTIZER_H
#define FAISS_INDEX_SCALAR_QUANTIZER_H
#include <stdint.h>
#include <vector>
#include <faiss/IndexIVF.h>
#include <faiss/impl/ScalarQuantizer.h>
namespace faiss {
/**
 * The uniform quantizer has a range [vmin, vmax]. The range can be
 * the same for all dimensions (uniform) or specific per dimension
 * (default).
 */
struct IndexScalarQuantizer : Index {
/// Used to encode the vectors
ScalarQuantizer sq;
/// Codes. Size ntotal * code_size
std::vector<uint8_t> codes;
size_t code_size;
/** Constructor.
 *
 * @param d dimensionality of the input vectors
 * @param qtype type of scalar quantizer to use
 * @param metric distance metric (L2 by default)
 */
IndexScalarQuantizer(
int d,
ScalarQuantizer::QuantizerType qtype,
MetricType metric = METRIC_L2);
IndexScalarQuantizer();
void train(idx_t n, const float* x) override;
void add(idx_t n, const float* x) override;
void search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const override;
void reset() override;
void reconstruct_n(idx_t i0, idx_t ni, float* recons) const override;
void reconstruct(idx_t key, float* recons) const override;
DistanceComputer* get_distance_computer() const override;
/* standalone codec interface */
size_t sa_code_size() const override;
void sa_encode(idx_t n, const float* x, uint8_t* bytes) const override;
void sa_decode(idx_t n, const uint8_t* bytes, float* x) const override;
};
/** An IVF implementation where the components of the residuals are
 * encoded with a scalar quantizer. All distance computations
 * are asymmetric, so the encoded vectors are decoded and approximate
 * distances are computed.
 */
struct IndexIVFScalarQuantizer : IndexIVF {
ScalarQuantizer sq;
bool by_residual; ///< encode relative to the coarse centroid (see encode_residual ctor arg)
IndexIVFScalarQuantizer(
Index* quantizer,
size_t d,
size_t nlist,
ScalarQuantizer::QuantizerType qtype,
MetricType metric = METRIC_L2,
bool encode_residual = true);
IndexIVFScalarQuantizer();
void train_residual(idx_t n, const float* x) override;
void encode_vectors(
idx_t n,
const float* x,
const idx_t* list_nos,
uint8_t* codes,
bool include_listnos = false) const override;
void add_core(
idx_t n,
const float* x,
const idx_t* xids,
const idx_t* precomputed_idx) override;
InvertedListScanner* get_InvertedListScanner(
bool store_pairs) const override;
void reconstruct_from_offset(int64_t list_no, int64_t offset, float* recons)
const override;
/* standalone codec interface */
void sa_decode(idx_t n, const uint8_t* bytes, float* x) const override;
};
} // namespace faiss
#endif

View File

@ -0,0 +1,111 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <faiss/Index.h>
#include <faiss/IndexBinary.h>
#include <faiss/impl/ThreadedIndex.h>
namespace faiss {
/**
 * Index that concatenates the results from several sub-indexes
 */
template <typename IndexT>
struct IndexShardsTemplate : public ThreadedIndex<IndexT> {
using idx_t = typename IndexT::idx_t;
using component_t = typename IndexT::component_t;
using distance_t = typename IndexT::distance_t;
/**
 * The dimension that all sub-indices must share will be the dimension of
 * the first sub-index added
 *
 * @param threaded do we use one thread per sub_index or do
 * queries sequentially?
 * @param successive_ids should we shift the returned ids by
 * the size of each sub-index or return them
 * as they are?
 */
explicit IndexShardsTemplate(
bool threaded = false,
bool successive_ids = true);
/**
 * @param threaded do we use one thread per sub_index or do
 * queries sequentially?
 * @param successive_ids should we shift the returned ids by
 * the size of each sub-index or return them
 * as they are?
 */
explicit IndexShardsTemplate(
idx_t d,
bool threaded = false,
bool successive_ids = true);
/// int version due to the implicit bool conversion ambiguity of int as
/// dimension
explicit IndexShardsTemplate(
int d,
bool threaded = false,
bool successive_ids = true);
/// Alias for addIndex()
void add_shard(IndexT* index) {
this->addIndex(index);
}
/// Alias for removeIndex()
void remove_shard(IndexT* index) {
this->removeIndex(index);
}
/// supported only for sub-indices that implement add_with_ids
void add(idx_t n, const component_t* x) override;
/**
 * Cases (successive_ids, xids):
 * - true, non-NULL ERROR: it makes no sense to pass in ids and
 * request them to be shifted
 * - true, NULL OK, but should be called only once (calls add()
 * on sub-indexes).
 * - false, non-NULL OK: will call add_with_ids with passed in xids
 * distributed evenly over shards
 * - false, NULL OK: will call add_with_ids on each sub-index,
 * starting at ntotal
 */
void add_with_ids(idx_t n, const component_t* x, const idx_t* xids)
override;
void search(
idx_t n,
const component_t* x,
idx_t k,
distance_t* distances,
idx_t* labels) const override;
void train(idx_t n, const component_t* x) override;
/// if true, returned ids are shifted by the size of each sub-index
/// (see constructor doc)
bool successive_ids;
/// Synchronize the top-level index (IndexShards) with data in the
/// sub-indices
void syncWithSubIndexes();
protected:
/// Called just after an index is added
void onAfterAddIndex(IndexT* index) override;
/// Called just after an index is removed
void onAfterRemoveIndex(IndexT* index) override;
};
using IndexShards = IndexShardsTemplate<Index>;
using IndexBinaryShards = IndexShardsTemplate<IndexBinary>;
} // namespace faiss

View File

@ -0,0 +1,59 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#pragma once
#include <stdint.h>
#include <string>
#include <unordered_map>
#include <vector>
namespace faiss {
/** Reports some statistics on a dataset and comments on them.
 *
 * It is a class rather than a function so that all stats can also be
 * accessed from code */
struct MatrixStats {
MatrixStats(size_t n, size_t d, const float* x);
std::string comments;
// raw statistics
size_t n, d; // nb of vectors, dimension (as passed to the constructor)
size_t n_collision, n_valid, n0;
double min_norm2, max_norm2;
// statistics gathered independently for each dimension
struct PerDimStats {
size_t n, n_nan, n_inf, n0;
float min, max;
double sum, sum2;
size_t n_valid;
double mean, stddev;
PerDimStats();
void add(float x);
void compute_mean_std();
};
std::vector<PerDimStats> per_dim_stats;
struct Occurrence {
size_t first;
size_t count;
};
std::unordered_map<uint64_t, Occurrence> occurrences;
// internal scratch buffer used by do_comment (printf-style formatting)
char* buf;
size_t nbuf;
void do_comment(const char* fmt, ...);
};
} // namespace faiss

View File

@ -0,0 +1,132 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef META_INDEXES_H
#define META_INDEXES_H
#include <faiss/Index.h>
#include <faiss/IndexReplicas.h>
#include <faiss/IndexShards.h>
#include <unordered_map>
#include <vector>
namespace faiss {
/** Index that translates search results to ids */
template <typename IndexT>
struct IndexIDMapTemplate : IndexT {
using idx_t = typename IndexT::idx_t;
using component_t = typename IndexT::component_t;
using distance_t = typename IndexT::distance_t;
IndexT* index; ///! the sub-index
bool own_fields; ///! whether pointers are deleted in destructor
std::vector<idx_t> id_map;
explicit IndexIDMapTemplate(IndexT* index);
/// @param xids if non-null, ids to store for the vectors (size n)
void add_with_ids(idx_t n, const component_t* x, const idx_t* xids)
override;
/// this will fail. Use add_with_ids
void add(idx_t n, const component_t* x) override;
void search(
idx_t n,
const component_t* x,
idx_t k,
distance_t* distances,
idx_t* labels) const override;
void train(idx_t n, const component_t* x) override;
void reset() override;
/// remove ids adapted to IndexFlat
size_t remove_ids(const IDSelector& sel) override;
void range_search(
idx_t n,
const component_t* x,
distance_t radius,
RangeSearchResult* result) const override;
~IndexIDMapTemplate() override;
IndexIDMapTemplate() {
own_fields = false;
index = nullptr;
}
};
using IndexIDMap = IndexIDMapTemplate<Index>;
using IndexBinaryIDMap = IndexIDMapTemplate<IndexBinary>;
/** same as IndexIDMap but also provides an efficient reconstruction
 * implementation via a 2-way index */
template <typename IndexT>
struct IndexIDMap2Template : IndexIDMapTemplate<IndexT> {
using idx_t = typename IndexT::idx_t;
using component_t = typename IndexT::component_t;
using distance_t = typename IndexT::distance_t;
// inverse of id_map: user id -> internal sequential id
std::unordered_map<idx_t, idx_t> rev_map;
explicit IndexIDMap2Template(IndexT* index);
/// make the rev_map from scratch
void construct_rev_map();
void add_with_ids(idx_t n, const component_t* x, const idx_t* xids)
override;
size_t remove_ids(const IDSelector& sel) override;
void reconstruct(idx_t key, component_t* recons) const override;
~IndexIDMap2Template() override {}
IndexIDMap2Template() {}
};
using IndexIDMap2 = IndexIDMap2Template<Index>;
using IndexBinaryIDMap2 = IndexIDMap2Template<IndexBinary>;
/** splits input vectors in segments and assigns each segment to a sub-index
 * used to distribute a MultiIndexQuantizer
 */
struct IndexSplitVectors : Index {
bool own_fields;
bool threaded;
std::vector<Index*> sub_indexes;
idx_t sum_d; ///< sum of dimensions seen so far
explicit IndexSplitVectors(idx_t d, bool threaded = false);
void add_sub_index(Index*);
void sync_with_sub_indexes();
void add(idx_t n, const float* x) override;
void search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const override;
void train(idx_t n, const float* x) override;
void reset() override;
~IndexSplitVectors() override;
};
} // namespace faiss
#endif

View File

@ -0,0 +1,36 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_METRIC_TYPE_H
#define FAISS_METRIC_TYPE_H
namespace faiss {
/// The metric space for vector comparison for Faiss indices and algorithms.
///
/// Most algorithms support both inner product and L2, with the flat
/// (brute-force) indices supporting additional metric types for vector
/// comparison.
enum MetricType {
METRIC_INNER_PRODUCT = 0, ///< maximum inner product search
METRIC_L2 = 1, ///< squared L2 search
METRIC_L1, ///< L1 (aka cityblock)
METRIC_Linf, ///< infinity distance
METRIC_Lp, ///< L_p distance, p is given by a faiss::Index
/// metric_arg
/// some additional metrics defined in scipy.spatial.distance
// NOTE(review): the explicit = 20 presumably leaves a gap for more
// basic metrics — confirm before inserting values in between
METRIC_Canberra = 20,
METRIC_BrayCurtis,
METRIC_JensenShannon,
};
} // namespace faiss
#endif

View File

@ -0,0 +1,294 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_VECTOR_TRANSFORM_H
#define FAISS_VECTOR_TRANSFORM_H
/** Defines a few objects that apply transformations to a set of
* vectors Often these are pre-processing steps.
*/
#include <stdint.h>
#include <vector>
#include <faiss/Index.h>
namespace faiss {
/** Any transformation applied on a set of vectors */
struct VectorTransform {
typedef Index::idx_t idx_t;
int d_in; ///! input dimension
int d_out; ///! output dimension
explicit VectorTransform(int d_in = 0, int d_out = 0)
: d_in(d_in), d_out(d_out), is_trained(true) {}
/// set if the VectorTransform does not require training, or if
/// training is done already
bool is_trained;
/** Perform training on a representative set of vectors. Does
 * nothing by default.
 *
 * @param n nb of training vectors
 * @param x training vectors, size n * d
 */
virtual void train(idx_t n, const float* x);
/** apply the transformation, return new allocated matrix
 * @param x size n * d_in
 * @return size n * d_out
 */
float* apply(idx_t n, const float* x) const;
/// same as apply, but result is pre-allocated
virtual void apply_noalloc(idx_t n, const float* x, float* xt) const = 0;
/// reverse transformation. May not be implemented or may return
/// approximate result
virtual void reverse_transform(idx_t n, const float* xt, float* x) const;
virtual ~VectorTransform() {}
};
/** Generic linear transformation, with bias term applied on output
 * y = A * x + b
 */
struct LinearTransform : VectorTransform {
bool have_bias; ///! whether to use the bias term
/// check if matrix A is orthonormal (enables reverse_transform)
bool is_orthonormal;
/// Transformation matrix, size d_out * d_in
std::vector<float> A;
/// bias vector, size d_out
std::vector<float> b;
/// both d_in > d_out and d_in < d_out are supported
explicit LinearTransform(
int d_in = 0,
int d_out = 0,
bool have_bias = false);
/// same as apply, but result is pre-allocated
void apply_noalloc(idx_t n, const float* x, float* xt) const override;
/// compute x = A^T * (x - b)
/// is reverse transform if A has orthonormal lines
void transform_transpose(idx_t n, const float* y, float* x) const;
/// works only if is_orthonormal
void reverse_transform(idx_t n, const float* xt, float* x) const override;
/// compute A^T * A to set the is_orthonormal flag
void set_is_orthonormal();
bool verbose;
void print_if_verbose(
const char* name,
const std::vector<double>& mat,
int n,
int d) const;
~LinearTransform() override {}
};
/// Randomly rotate a set of vectors
struct RandomRotationMatrix : LinearTransform {
/// both d_in > d_out and d_in < d_out are supported
RandomRotationMatrix(int d_in, int d_out)
: LinearTransform(d_in, d_out, false) {}
/// must be called before the transform is used
void init(int seed);
// initializes with an arbitrary seed
void train(idx_t n, const float* x) override;
RandomRotationMatrix() {}
};
/** Applies a principal component analysis on a set of vectors,
 * with optionally whitening and random rotation. */
struct PCAMatrix : LinearTransform {
/** after transformation the components are multiplied by
 * eigenvalues^eigen_power
 *
 * =0: no whitening
 * =-0.5: full whitening
 */
float eigen_power;
/// random rotation after PCA
bool random_rotation;
/// ratio between # training vectors and dimension
size_t max_points_per_d;
/// try to distribute output eigenvectors in this many bins
int balanced_bins;
/// Mean, size d_in
std::vector<float> mean;
/// eigenvalues of covariance matrix (= squared singular values)
std::vector<float> eigenvalues;
/// PCA matrix, size d_in * d_in
std::vector<float> PCAMat;
// the final matrix (A, b of LinearTransform) is computed after random
// rotation and/or whitening
explicit PCAMatrix(
int d_in = 0,
int d_out = 0,
float eigen_power = 0,
bool random_rotation = false);
/// train on n vectors. If n < d_in then the eigenvector matrix
/// will be completed with 0s
void train(idx_t n, const float* x) override;
/// copy pre-trained PCA matrix
void copy_from(const PCAMatrix& other);
/// called after mean, PCAMat and eigenvalues are computed
void prepare_Ab();
};
/** ITQ implementation from
 *
 * Iterative quantization: A procrustean approach to learning binary codes
 * for large-scale image retrieval,
 *
 * Yunchao Gong, Svetlana Lazebnik, Albert Gordo, Florent Perronnin,
 * PAMI'12.
 */
struct ITQMatrix : LinearTransform {
int max_iter; ///< max nb of training iterations
int seed; ///< random seed used for initialization
// force initialization of the rotation (for debugging)
std::vector<double> init_rotation;
explicit ITQMatrix(int d = 0);
void train(idx_t n, const float* x) override;
};
/** The full ITQ transform, including normalizations and PCA transformation
 */
struct ITQTransform : VectorTransform {
std::vector<float> mean;
bool do_pca; ///< apply a PCA transformation before ITQ?
ITQMatrix itq;
/// max training points per dimension
int max_train_per_dim;
// concatenation of PCA + ITQ transformation
LinearTransform pca_then_itq;
explicit ITQTransform(int d_in = 0, int d_out = 0, bool do_pca = false);
void train(idx_t n, const float* x) override;
void apply_noalloc(idx_t n, const float* x, float* xt) const override;
};
struct ProductQuantizer;
/** Applies a rotation to align the dimensions with a PQ to minimize
 * the reconstruction error. Can be used before an IndexPQ or an
 * IndexIVFPQ. The method is the non-parametric version described in:
 *
 * "Optimized Product Quantization for Approximate Nearest Neighbor Search"
 * Tiezheng Ge, Kaiming He, Qifa Ke, Jian Sun, CVPR'13
 *
 */
struct OPQMatrix : LinearTransform {
int M; ///< nb of subquantizers
int niter; ///< Number of outer training iterations
int niter_pq; ///< Number of training iterations for the PQ
int niter_pq_0; ///< same, for the first outer iteration
/// if there are too many training points, resample
size_t max_train_points;
bool verbose;
/// if non-NULL, use this product quantizer for training
/// should be constructed with (d_out, M, _)
ProductQuantizer* pq;
/** @param d input dimension
 *  @param M nb of subquantizers
 *  @param d2 if d2 != -1, output vectors of this dimension
 */
explicit OPQMatrix(int d = 0, int M = 1, int d2 = -1);
void train(idx_t n, const float* x) override;
};
/** remap dimensions for input vectors, possibly inserting 0s
 * strictly speaking this is also a linear transform but we don't want
 * to compute it with matrix multiplies */
struct RemapDimensionsTransform : VectorTransform {
/// map from output dimension to input, size d_out
/// -1 -> set output to 0
std::vector<int> map;
RemapDimensionsTransform(int d_in, int d_out, const int* map);
/// remap input to output, skipping or inserting dimensions as needed
/// if uniform: distribute dimensions uniformly
/// otherwise just take the d_out first ones.
RemapDimensionsTransform(int d_in, int d_out, bool uniform = true);
void apply_noalloc(idx_t n, const float* x, float* xt) const override;
/// reverse transform correct only when the mapping is a permutation
void reverse_transform(idx_t n, const float* xt, float* x) const override;
RemapDimensionsTransform() {}
};
/** per-vector normalization */
struct NormalizationTransform : VectorTransform {
float norm; ///< order of the norm (2.0 = L2 normalization by default)
explicit NormalizationTransform(int d, float norm = 2.0);
NormalizationTransform();
void apply_noalloc(idx_t n, const float* x, float* xt) const override;
/// Identity transform since norm is not revertible
void reverse_transform(idx_t n, const float* xt, float* x) const override;
};
/** Subtract the mean of each component from the vectors. */
struct CenteringTransform : VectorTransform {
/// Mean, size d_in = d_out
std::vector<float> mean;
explicit CenteringTransform(int d = 0);
/// train on n vectors (computes the per-component mean).
void train(idx_t n, const float* x) override;
/// subtract the mean
void apply_noalloc(idx_t n, const float* x, float* xt) const override;
/// add the mean
void reverse_transform(idx_t n, const float* xt, float* x) const override;
};
} // namespace faiss
#endif

View File

@ -0,0 +1,33 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
// I/O code for indexes
#pragma once
namespace faiss {
struct Index;
struct IndexIVF;
struct VectorTransform;
/* cloning functions */
Index* clone_index(const Index*);
/** Cloner class, useful to override classes with other cloning
 * functions. The cloning function above just calls
 * Cloner::clone_Index. */
struct Cloner {
virtual VectorTransform* clone_VectorTransform(const VectorTransform*);
virtual Index* clone_Index(const Index*);
virtual IndexIVF* clone_IndexIVF(const IndexIVF*);
virtual ~Cloner() {}
};
} // namespace faiss

View File

@ -0,0 +1,164 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <cstdint>
#include <vector>
#include <faiss/Index.h>
namespace faiss {
/** Abstract structure for additive quantizers
 *
 * Different from the product quantizer in which the decoded vector is the
 * concatenation of M sub-vectors, additive quantizers sum M sub-vectors
 * to get the decoded vector.
 */
struct AdditiveQuantizer {
size_t d; ///< size of the input vectors
size_t M; ///< number of codebooks
std::vector<size_t> nbits; ///< bits for each step
std::vector<float> codebooks; ///< codebooks
// derived values
std::vector<uint64_t> codebook_offsets;
size_t code_size; ///< code size in bytes
size_t tot_bits; ///< total number of bits
size_t total_codebook_size; ///< size of the codebook in vectors
bool only_8bit; ///< are all nbits = 8 (use faster decoder)
bool verbose; ///< verbose during training?
bool is_trained; ///< is trained or not
/// Encodes how search is performed and how vectors are encoded
enum Search_type_t {
ST_decompress, ///< decompress database vector
ST_LUT_nonorm, ///< use a LUT, don't include norms (OK for IP or
///< normalized vectors)
ST_norm_from_LUT, ///< compute the norms from the look-up tables (cost
///< is in O(M^2))
ST_norm_float, ///< use a LUT, and store float32 norm with the vectors
ST_norm_qint8, ///< use a LUT, and store 8bit-quantized norm
ST_norm_qint4, ///< use a LUT, and store 4bit-quantized norm
};
AdditiveQuantizer(
size_t d,
const std::vector<size_t>& nbits,
Search_type_t search_type = ST_decompress);
AdditiveQuantizer();
/// compute derived values when d, M and nbits have been set
void set_derived_values();
/// Train the additive quantizer
virtual void train(size_t n, const float* x) = 0;
/** Encode a set of vectors
 *
 * @param x vectors to encode, size n * d
 * @param codes output codes, size n * code_size
 */
virtual void compute_codes(const float* x, uint8_t* codes, size_t n)
const = 0;
/** pack a series of code to bit-compact format
 *
 * @param codes codes to be packed, size n * code_size
 * @param packed_codes output bit-compact codes
 * @param ld_codes leading dimension of codes
 * @param norms norms of the vectors (size n). Will be computed if
 * needed but not provided
 */
void pack_codes(
size_t n,
const int32_t* codes,
uint8_t* packed_codes,
int64_t ld_codes = -1,
const float* norms = nullptr) const;
/** Decode a set of vectors
 *
 * @param codes codes to decode, size n * code_size
 * @param x output vectors, size n * d
 */
void decode(const uint8_t* codes, float* x, size_t n) const;
/** Decode a set of vectors in non-packed format
 *
 * @param codes codes to decode, size n * ld_codes
 * @param x output vectors, size n * d
 */
void decode_unpacked(
const int32_t* codes,
float* x,
size_t n,
int64_t ld_codes = -1) const;
/****************************************************************************
 * Search functions in an external set of codes.
 ****************************************************************************/
/// Also determines what's in the codes
Search_type_t search_type;
/// min/max for quantization of norms
float norm_min, norm_max;
template <bool is_IP, Search_type_t effective_search_type>
float compute_1_distance_LUT(const uint8_t* codes, const float* LUT) const;
/*
float compute_1_L2sqr(const uint8_t* codes, const float* LUT);
*/
/****************************************************************************
 * Support for exhaustive distance computations with all the centroids.
 * Hence, the number of these centroids should not be too large.
 ****************************************************************************/
using idx_t = Index::idx_t;
/// decoding function for a code in a 64-bit word
void decode_64bit(idx_t n, float* x) const;
/** Compute inner-product look-up tables. Used in the centroid search
 * functions.
 *
 * @param xq query vector, size (n, d)
 * @param LUT look-up table, size (n, total_codebook_size)
 */
void compute_LUT(size_t n, const float* xq, float* LUT) const;
/// exact IP search
void knn_centroids_inner_product(
idx_t n,
const float* xq,
idx_t k,
float* distances,
idx_t* labels) const;
/** For L2 search we need the L2 norms of the centroids
 *
 * @param norms output norms table, size total_codebook_size
 */
void compute_centroid_norms(float* norms) const;
/** Exact L2 search, with precomputed norms */
void knn_centroids_L2(
idx_t n,
const float* xq,
idx_t k,
float* distances,
idx_t* labels,
const float* centroid_norms) const;
virtual ~AdditiveQuantizer();
};
}; // namespace faiss

View File

@ -0,0 +1,276 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
// Auxiliary index structures, that are used in indexes but that can
// be forward-declared
#ifndef FAISS_AUX_INDEX_STRUCTURES_H
#define FAISS_AUX_INDEX_STRUCTURES_H
#include <stdint.h>
#include <cstring>
#include <memory>
#include <mutex>
#include <unordered_set>
#include <vector>
#include <faiss/Index.h>
#include <faiss/impl/platform_macros.h>
namespace faiss {
/** The objective is to have a simple result structure while
 * minimizing the number of mem copies in the result. The method
 * do_allocation can be overloaded to allocate the result tables in
 * the matrix type of a scripting language like Lua or Python. */
struct RangeSearchResult {
size_t nq; ///< nb of queries
size_t* lims; ///< size (nq + 1)
typedef Index::idx_t idx_t;
idx_t* labels; ///< result for query i is labels[lims[i]:lims[i+1]]
float* distances; ///< corresponding distances (not sorted)
size_t buffer_size; ///< size of the result buffers used
/// lims must be allocated on input to range_search.
explicit RangeSearchResult(idx_t nq, bool alloc_lims = true);
/// called when lims contains the nb of elements result entries
/// for each query; allocates labels and distances
virtual void do_allocation();
virtual ~RangeSearchResult();
};
/** Encapsulates a set of ids to remove. */
struct IDSelector {
typedef Index::idx_t idx_t;
/// return true if id belongs to the set to be removed
virtual bool is_member(idx_t id) const = 0;
virtual ~IDSelector() {}
};
/** remove ids between [imin, imax) */
struct IDSelectorRange : IDSelector {
idx_t imin, imax;
IDSelectorRange(idx_t imin, idx_t imax);
bool is_member(idx_t id) const override;
~IDSelectorRange() override {}
};
/** simple list of elements to remove
 *
 * this is inefficient in most cases, except for IndexIVF with
 * maintain_direct_map
 */
struct IDSelectorArray : IDSelector {
size_t n; ///< nb of ids in the array
const idx_t* ids; ///< array of ids to remove, size n
IDSelectorArray(size_t n, const idx_t* ids);
bool is_member(idx_t id) const override;
~IDSelectorArray() override {}
};
/** Remove ids from a set. Repetitions of ids in the indices set
 * passed to the constructor does not hurt performance. The hash
 * function used for the bloom filter and GCC's implementation of
 * unordered_set are just the least significant bits of the id. This
 * works fine for random ids or ids in sequences but will produce many
 * hash collisions if lsb's are always the same */
struct IDSelectorBatch : IDSelector {
std::unordered_set<idx_t> set;
typedef unsigned char uint8_t;
std::vector<uint8_t> bloom; // assumes low bits of id are a good hash value
int nbits; ///< nb of bits used to index the bloom filter
idx_t mask; ///< 2^nbits - 1, applied to ids before the bloom lookup
IDSelectorBatch(size_t n, const idx_t* indices);
bool is_member(idx_t id) const override;
~IDSelectorBatch() override {}
};
/****************************************************************
* Result structures for range search.
*
* The main constraint here is that we want to support parallel
* queries from different threads in various ways: 1 thread per query,
* several threads per query. We store the actual results in blocks of
* fixed size rather than exponentially increasing memory. At the end,
* we copy the block content to a linear result array.
*****************************************************************/
/** List of temporary buffers used to store results before they are
 * copied to the RangeSearchResult object. */
struct BufferList {
    typedef Index::idx_t idx_t;

    // buffer sizes in # entries
    size_t buffer_size;

    /// one fixed-size block of (id, distance) result pairs
    struct Buffer {
        idx_t* ids;
        float* dis;
    };

    std::vector<Buffer> buffers;
    size_t wp; ///< write pointer in the last buffer.

    explicit BufferList(size_t buffer_size);

    ~BufferList();

    /// create a new buffer
    void append_buffer();

    /// add one result, possibly appending a new buffer if needed
    void add(idx_t id, float dis);

    /// copy elements ofs:ofs+n-1 seen as linear data in the buffers to
    /// tables dest_ids, dest_dis
    void copy_range(size_t ofs, size_t n, idx_t* dest_ids, float* dest_dis);
};
struct RangeSearchPartialResult;
/// result structure for a single query
struct RangeQueryResult {
    using idx_t = Index::idx_t;
    idx_t qno;   //< id of the query
    size_t nres; //< nb of results for this query
    RangeSearchPartialResult* pres; //< back-pointer to the owning partial result

    /// called by search function to report a new result
    void add(float dis, idx_t id);
};
/// the entries in the buffers are split per query
struct RangeSearchPartialResult : BufferList {
    RangeSearchResult* res;

    /// eventually the result will be stored in res_in
    explicit RangeSearchPartialResult(RangeSearchResult* res_in);

    /// query ids + nb of results per query.
    std::vector<RangeQueryResult> queries;

    /// begin a new result
    RangeQueryResult& new_result(idx_t qno);

    /*****************************************
     * functions used at the end of the search to merge the result
     * lists */
    void finalize();

    /// called by range_search before do_allocation
    void set_lims();

    /// called by range_search after do_allocation
    void copy_result(bool incremental = false);

    /// merge a set of PartialResult's into one RangeSearchResult
    /// on output the partialresults are empty!
    static void merge(
            std::vector<RangeSearchPartialResult*>& partial_results,
            bool do_delete = true);
};
/***********************************************************
* The distance computer maintains a current query and computes
* distances to elements in an index that supports random access.
*
* The DistanceComputer is not intended to be thread-safe (eg. because
* it maintains counters) so the distance functions are not const,
* instantiate one from each thread if needed.
***********************************************************/
struct DistanceComputer {
    using idx_t = Index::idx_t;

    /// called before computing distances. Pointer x should remain valid
    /// while operator () is called
    virtual void set_query(const float* x) = 0;

    /// compute distance of vector i to current query
    virtual float operator()(idx_t i) = 0;

    /// compute distance between two stored vectors
    virtual float symmetric_dis(idx_t i, idx_t j) = 0;

    /// virtual destructor so subclasses are destroyed through base pointers
    virtual ~DistanceComputer() {}
};
/***********************************************************
* Interrupt callback
***********************************************************/
/// Process-wide hook that lets a host application interrupt long searches.
struct FAISS_API InterruptCallback {
    virtual bool want_interrupt() = 0;
    virtual ~InterruptCallback() {}

    // lock that protects concurrent calls to is_interrupted
    static std::mutex lock;
    static std::unique_ptr<InterruptCallback> instance;

    static void clear_instance();

    /** check if:
     * - an interrupt callback is set
     * - the callback returns true
     * if this is the case, then throw an exception. Should not be called
     * from multiple threads.
     */
    static void check();

    /// same as check() but return true if is interrupted instead of
    /// throwing. Can be called from multiple threads.
    static bool is_interrupted();

    /** assuming each iteration takes a certain number of flops, what
     * is a reasonable interval to check for interrupts?
     */
    static size_t get_period_hint(size_t flops);
};
/// set implementation optimized for fast access.
/// A flag is "true" when its cell holds the current epoch number `visno`;
/// bumping `visno` clears every flag in O(1) (amortized).
struct VisitedTable {
    std::vector<uint8_t> visited;
    int visno;

    explicit VisitedTable(int size) : visited(size), visno(1) {}

    /// set flag #no to true
    void set(int no) {
        visited[no] = (uint8_t)visno;
    }

    /// get flag #no
    bool get(int no) const {
        return visited[no] == visno;
    }

    /// reset all flags to false
    void advance() {
        ++visno;
        if (visno == 250) {
            // 250 rather than 255 because sometimes we use visno and visno+1
            visited.assign(visited.size(), 0);
            visno = 1;
        }
    }
};
} // namespace faiss
#endif

View File

@ -0,0 +1,111 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_ASSERT_INCLUDED
#define FAISS_ASSERT_INCLUDED
#include <faiss/impl/FaissException.h>
#include <faiss/impl/platform_macros.h>
#include <cstdio>
#include <cstdlib>
#include <string>
///
/// Assertions
///

/// Hard assertion: prints the failed expression with its location and
/// aborts. Unlike <cassert>'s assert, there is no NDEBUG guard here, so
/// these checks fire in all build types.
#define FAISS_ASSERT(X)                                  \
    do {                                                 \
        if (!(X)) {                                      \
            fprintf(stderr,                              \
                    "Faiss assertion '%s' failed in %s " \
                    "at %s:%d\n",                        \
                    #X,                                  \
                    __PRETTY_FUNCTION__,                 \
                    __FILE__,                            \
                    __LINE__);                           \
            abort();                                     \
        }                                                \
    } while (false)

/// Same as FAISS_ASSERT with a fixed message appended (MSG must be a
/// string literal: it is pasted into the format string).
#define FAISS_ASSERT_MSG(X, MSG)                         \
    do {                                                 \
        if (!(X)) {                                      \
            fprintf(stderr,                              \
                    "Faiss assertion '%s' failed in %s " \
                    "at %s:%d; details: " MSG "\n",      \
                    #X,                                  \
                    __PRETTY_FUNCTION__,                 \
                    __FILE__,                            \
                    __LINE__);                           \
            abort();                                     \
        }                                                \
    } while (false)

/// Same as FAISS_ASSERT with a printf-style formatted message.
#define FAISS_ASSERT_FMT(X, FMT, ...)                    \
    do {                                                 \
        if (!(X)) {                                      \
            fprintf(stderr,                              \
                    "Faiss assertion '%s' failed in %s " \
                    "at %s:%d; details: " FMT "\n",      \
                    #X,                                  \
                    __PRETTY_FUNCTION__,                 \
                    __FILE__,                            \
                    __LINE__,                            \
                    __VA_ARGS__);                        \
            abort();                                     \
        }                                                \
    } while (false)

///
/// Exceptions for returning user errors
///

#define FAISS_THROW_MSG(MSG)                                   \
    do {                                                       \
        throw faiss::FaissException(                           \
                MSG, __PRETTY_FUNCTION__, __FILE__, __LINE__); \
    } while (false)

/// printf-style formatted throw; the message is pre-sized with a first
/// snprintf pass, then rendered into a std::string.
#define FAISS_THROW_FMT(FMT, ...)                              \
    do {                                                       \
        std::string __s;                                       \
        int __size = snprintf(nullptr, 0, FMT, __VA_ARGS__);   \
        __s.resize(__size + 1);                                \
        snprintf(&__s[0], __s.size(), FMT, __VA_ARGS__);       \
        throw faiss::FaissException(                           \
                __s, __PRETTY_FUNCTION__, __FILE__, __LINE__); \
    } while (false)

///
/// Exceptions thrown upon a conditional failure
///

#define FAISS_THROW_IF_NOT(X)                          \
    do {                                               \
        if (!(X)) {                                    \
            FAISS_THROW_FMT("Error: '%s' failed", #X); \
        }                                              \
    } while (false)

#define FAISS_THROW_IF_NOT_MSG(X, MSG)                       \
    do {                                                     \
        if (!(X)) {                                          \
            FAISS_THROW_FMT("Error: '%s' failed: " MSG, #X); \
        }                                                    \
    } while (false)

#define FAISS_THROW_IF_NOT_FMT(X, FMT, ...)                               \
    do {                                                                  \
        if (!(X)) {                                                       \
            FAISS_THROW_FMT("Error: '%s' failed: " FMT, #X, __VA_ARGS__); \
        }                                                                 \
    } while (false)
#endif

View File

@ -0,0 +1,87 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_EXCEPTION_INCLUDED
#define FAISS_EXCEPTION_INCLUDED
#include <exception>
#include <string>
#include <utility>
#include <vector>
namespace faiss {
/// Base class for Faiss exceptions
/// Base class for Faiss exceptions
class FaissException : public std::exception {
   public:
    explicit FaissException(const std::string& msg);

    /// variant that records the throw site (function, file, line)
    FaissException(
            const std::string& msg,
            const char* funcName,
            const char* file,
            int line);

    /// from std::exception
    const char* what() const noexcept override;

    std::string msg; ///< stored message text
};
/// Handle multiple exceptions from worker threads, throwing an appropriate
/// exception that aggregates the information
/// The pair int is the thread that generated the exception
void handleExceptions(
std::vector<std::pair<int, std::exception_ptr>>& exceptions);
/** bare-bones unique_ptr
 * this one deletes with delete [] */
template <class T>
struct ScopeDeleter {
    const T* ptr; ///< pointee, freed with delete[] at scope exit

    explicit ScopeDeleter(const T* ptr = nullptr) : ptr(ptr) {}

    /// give up ownership without deleting
    void release() {
        ptr = nullptr;
    }

    /// take ownership of ptr_in (NOTE: does not delete the previous pointee)
    void set(const T* ptr_in) {
        ptr = ptr_in;
    }

    /// exchange pointees with another deleter
    void swap(ScopeDeleter<T>& other) {
        std::swap(ptr, other.ptr);
    }

    ~ScopeDeleter() {
        delete[] ptr;
    }
};
/** same but deletes with the simple delete (least common case) */
template <class T>
struct ScopeDeleter1 {
    const T* ptr;

    explicit ScopeDeleter1(const T* ptr = nullptr) : ptr(ptr) {}

    /// give up ownership without deleting
    void release() {
        ptr = nullptr;
    }

    /// take ownership of ptr_in; the previous pointee is NOT deleted
    void set(const T* ptr_in) {
        ptr = ptr_in;
    }

    /// exchange pointees with another deleter
    void swap(ScopeDeleter1<T>& other) {
        const T* held = ptr;
        ptr = other.ptr;
        other.ptr = held;
    }

    ~ScopeDeleter1() {
        delete ptr;
    }
};
/// make typeids more readable
std::string demangle_cpp_symbol(const char* name);
} // namespace faiss
#endif

View File

@ -0,0 +1,262 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#pragma once
#include <queue>
#include <unordered_set>
#include <vector>
#include <omp.h>
#include <faiss/Index.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/impl/platform_macros.h>
#include <faiss/utils/Heap.h>
#include <faiss/utils/random.h>
namespace faiss {
/** Implementation of the Hierarchical Navigable Small World
* datastructure.
*
* Efficient and robust approximate nearest neighbor search using
* Hierarchical Navigable Small World graphs
*
* Yu. A. Malkov, D. A. Yashunin, arXiv 2017
*
* This implementation is heavily influenced by the NMSlib
 * implementation by Yury Malkov and Leonid Boytsov
* (https://github.com/searchivarius/nmslib)
*
* The HNSW object stores only the neighbor link structure, see
* IndexHNSW.h for the full index object.
*/
struct VisitedTable;
struct DistanceComputer; // from AuxIndexStructures
struct HNSWStats;
struct HNSW {
    /// internal storage of vectors (32 bits: this is expensive)
    typedef int storage_idx_t;

    /// Faiss results are 64-bit
    typedef Index::idx_t idx_t;

    /// (distance, id) pair used during graph traversal
    typedef std::pair<float, storage_idx_t> Node;

    /** Heap structure that allows fast
     */
    struct MinimaxHeap {
        int n;      ///< allocated capacity (size of ids / dis)
        int k;      ///< current number of stored elements (starts at 0)
        int nvalid; ///< bookkeeping counter — semantics in the .cpp

        std::vector<storage_idx_t> ids;
        std::vector<float> dis;
        typedef faiss::CMax<float, storage_idx_t> HC;

        explicit MinimaxHeap(int n) : n(n), k(0), nvalid(0), ids(n), dis(n) {}

        void push(storage_idx_t i, float v);

        float max() const;

        int size() const;

        void clear();

        int pop_min(float* vmin_out = nullptr);

        int count_below(float thresh);
    };

    /// to sort pairs of (id, distance) from nearest to farthest or the reverse
    struct NodeDistCloser {
        float d;
        int id;
        NodeDistCloser(float d, int id) : d(d), id(id) {}
        bool operator<(const NodeDistCloser& obj1) const {
            return d < obj1.d;
        }
    };

    struct NodeDistFarther {
        float d;
        int id;
        NodeDistFarther(float d, int id) : d(d), id(id) {}
        bool operator<(const NodeDistFarther& obj1) const {
            return d > obj1.d;
        }
    };

    /// assignment probability to each layer (sum=1)
    std::vector<double> assign_probas;

    /// number of neighbors stored per layer (cumulative), should not
    /// be changed after first add
    std::vector<int> cum_nneighbor_per_level;

    /// level of each vector (base level = 1), size = ntotal
    std::vector<int> levels;

    /// offsets[i] is the offset in the neighbors array where vector i is stored
    /// size ntotal + 1
    std::vector<size_t> offsets;

    /// neighbors[offsets[i]:offsets[i+1]] is the list of neighbors of vector i
    /// for all levels. this is where all storage goes.
    std::vector<storage_idx_t> neighbors;

    /// entry point in the search structure (one of the points with maximum
    /// level
    storage_idx_t entry_point;

    faiss::RandomGenerator rng;

    /// maximum level
    int max_level;

    /// expansion factor at construction time
    int efConstruction;

    /// expansion factor at search time
    int efSearch;

    /// during search: do we check whether the next best distance is good
    /// enough?
    bool check_relative_distance = true;

    /// number of entry points in levels > 0.
    int upper_beam;

    /// use bounded queue during exploration
    bool search_bounded_queue = true;

    // methods that initialize the tree sizes

    /// initialize the assign_probas and cum_nneighbor_per_level to
    /// have 2*M links on level 0 and M links on levels > 0
    void set_default_probas(int M, float levelMult);

    /// set nb of neighbors for this level (before adding anything)
    void set_nb_neighbors(int level_no, int n);

    // methods that access the tree sizes

    /// nb of neighbors for this level
    int nb_neighbors(int layer_no) const;

    /// cumulative nb up to (and excluding) this level
    int cum_nb_neighbors(int layer_no) const;

    /// range of entries in the neighbors table of vertex no at layer_no
    void neighbor_range(idx_t no, int layer_no, size_t* begin, size_t* end)
            const;

    /// only mandatory parameter: nb of neighbors
    explicit HNSW(int M = 32);

    /// pick a random level for a new point
    int random_level();

    /// add n random levels to table (for debugging...)
    void fill_with_random_links(size_t n);

    void add_links_starting_from(
            DistanceComputer& ptdis,
            storage_idx_t pt_id,
            storage_idx_t nearest,
            float d_nearest,
            int level,
            omp_lock_t* locks,
            VisitedTable& vt);

    /** add point pt_id on all levels <= pt_level and build the link
     * structure for them. */
    void add_with_locks(
            DistanceComputer& ptdis,
            int pt_level,
            int pt_id,
            std::vector<omp_lock_t>& locks,
            VisitedTable& vt);

    int search_from_candidates(
            DistanceComputer& qdis,
            int k,
            idx_t* I,
            float* D,
            MinimaxHeap& candidates,
            VisitedTable& vt,
            HNSWStats& stats,
            int level,
            int nres_in = 0) const;

    std::priority_queue<Node> search_from_candidate_unbounded(
            const Node& node,
            DistanceComputer& qdis,
            int ef,
            VisitedTable* vt,
            HNSWStats& stats) const;

    /// search interface
    HNSWStats search(
            DistanceComputer& qdis,
            int k,
            idx_t* I,
            float* D,
            VisitedTable& vt) const;

    void reset();

    void clear_neighbor_tables(int level);
    void print_neighbor_stats(int level) const;

    int prepare_level_tab(size_t n, bool preset_levels = false);

    static void shrink_neighbor_list(
            DistanceComputer& qdis,
            std::priority_queue<NodeDistFarther>& input,
            std::vector<NodeDistFarther>& output,
            int max_size);
};
/// Counters accumulated while building / searching an HNSW graph.
struct HNSWStats {
    size_t n1, n2, n3;
    size_t ndis;     ///< number of distance computations
    size_t nreorder;

    HNSWStats(
            size_t n1 = 0,
            size_t n2 = 0,
            size_t n3 = 0,
            size_t ndis = 0,
            size_t nreorder = 0)
            : n1(n1), n2(n2), n3(n3), ndis(ndis), nreorder(nreorder) {}

    /// zero every counter
    void reset() {
        *this = HNSWStats();
    }

    /// accumulate the counters of another stats object into this one
    void combine(const HNSWStats& other) {
        n1 += other.n1;
        n2 += other.n2;
        n3 += other.n3;
        ndis += other.ndis;
        nreorder += other.nreorder;
    }
};
// global var that collects them all
FAISS_API extern HNSWStats hnsw_stats;
} // namespace faiss

View File

@ -0,0 +1,180 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <stdint.h>
#include <random>
#include <string>
#include <unordered_map>
#include <vector>
#include <faiss/impl/AdditiveQuantizer.h>
#include <faiss/impl/platform_macros.h>
#include <faiss/utils/utils.h>
namespace faiss {
/** Implementation of LSQ/LSQ++ described in the following two papers:
*
* Revisiting additive quantization
* Julieta Martinez, et al. ECCV 2016
*
* LSQ++: Lower running time and higher recall in multi-codebook quantization
* Julieta Martinez, et al. ECCV 2018
*
* This implementation is mostly translated from the Julia implementations
* by Julieta Martinez:
* (https://github.com/una-dinosauria/local-search-quantization,
* https://github.com/una-dinosauria/Rayuela.jl)
*
* The trained codes are stored in `codebooks` which is called
* `centroids` in PQ and RQ.
*/
struct LocalSearchQuantizer : AdditiveQuantizer {
    size_t K; ///< number of codes per codebook

    size_t train_iters;      ///< number of iterations in training
    size_t encode_ils_iters; ///< iterations of local search in encoding
    size_t train_ils_iters;  ///< iterations of local search in training
    size_t icm_iters;        ///< number of iterations in icm

    float p;     ///< temperature factor
    float lambd; ///< regularization factor

    size_t chunk_size; ///< nb of vectors to encode at a time

    int random_seed; ///< seed for random generator
    size_t nperts;   ///< number of perturbation in each code

    // use double precision when updating codebooks (more accurate)
    bool update_codebooks_with_double = true;

    LocalSearchQuantizer(
            size_t d,      /* dimensionality of the input vectors */
            size_t M,      /* number of subquantizers */
            size_t nbits,  /* number of bit per subvector index */
            Search_type_t search_type =
                    ST_decompress /* determines the storage type */
    );

    LocalSearchQuantizer();

    // Train the local search quantizer
    void train(size_t n, const float* x) override;

    /** Encode a set of vectors
     *
     * @param x      vectors to encode, size n * d
     * @param codes  output codes, size n * code_size
     */
    void compute_codes(const float* x, uint8_t* codes, size_t n) const override;

    /** Update codebooks given encodings
     *
     * @param x      training vectors, size n * d
     * @param codes  encoded training vectors, size n * M
     */
    void update_codebooks(const float* x, const int32_t* codes, size_t n);

    /** Encode vectors given codebooks using iterative conditional mode (icm).
     *
     * @param x         vectors to encode, size n * d
     * @param codes     output codes, size n * M
     * @param ils_iters number of iterations of iterative local search
     */
    void icm_encode(
            const float* x,
            int32_t* codes,
            size_t n,
            size_t ils_iters,
            std::mt19937& gen) const;

    void icm_encode_partial(
            size_t index,
            const float* x,
            int32_t* codes,
            size_t n,
            const float* binaries,
            size_t ils_iters,
            std::mt19937& gen) const;

    void icm_encode_step(
            const float* unaries,
            const float* binaries,
            int32_t* codes,
            size_t n) const;

    /** Add some perturbation to codebooks
     *
     * @param T      temperature of simulated annealing
     * @param stddev standard derivations of each dimension in training data
     */
    void perturb_codebooks(
            float T,
            const std::vector<float>& stddev,
            std::mt19937& gen);

    /** Add some perturbation to codes
     *
     * @param codes codes to be perturbed, size n * M
     */
    void perturb_codes(int32_t* codes, size_t n, std::mt19937& gen) const;

    /** Compute binary terms
     *
     * @param binaries binary terms, size M * M * K * K
     */
    void compute_binary_terms(float* binaries) const;

    /** Compute unary terms
     *
     * @param x       vectors to encode, size n * d
     * @param unaries unary terms, size n * M * K
     */
    void compute_unary_terms(const float* x, float* unaries, size_t n) const;

    /** Helper function to compute reconstruction error
     *
     * @param x     vectors to encode, size n * d
     * @param codes encoded codes, size n * M
     * @param objs  if it is not null, store reconstruction
                    error of each vector into it, size n
     */
    float evaluate(
            const int32_t* codes,
            const float* x,
            size_t n,
            float* objs = nullptr) const;
};
/** A helper struct to count consuming time during training.
 * It is NOT thread-safe.
 */
struct LSQTimer {
    std::unordered_map<std::string, double> duration; ///< total time per name
    std::unordered_map<std::string, double> t0;       ///< start timestamp per name
    std::unordered_map<std::string, bool> started;    ///< is the timer running?

    LSQTimer() {
        reset();
    }

    /// accumulated duration recorded under `name`
    double get(const std::string& name);

    /// start the timer registered under `name`
    void start(const std::string& name);

    /// stop the timer `name` and accumulate the elapsed time
    void end(const std::string& name);

    /// drop all timers
    void reset();
};
FAISS_API extern LSQTimer lsq_timer; ///< timer to count consuming time
} // namespace faiss

View File

@ -0,0 +1,154 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#pragma once
#include <algorithm>
#include <mutex>
#include <queue>
#include <random>
#include <unordered_set>
#include <vector>
#include <omp.h>
#include <faiss/Index.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/impl/platform_macros.h>
#include <faiss/utils/Heap.h>
#include <faiss/utils/random.h>
namespace faiss {
/** Implementation of NNDescent which is one of the most popular
* KNN graph building algorithms
*
* Efficient K-Nearest Neighbor Graph Construction for Generic
* Similarity Measures
*
* Dong, Wei, Charikar Moses, and Kai Li, WWW 2011
*
 * This implementation is heavily influenced by the efanna
* implementation by Cong Fu and the KGraph library by Wei Dong
* (https://github.com/ZJULearning/efanna_graph)
* (https://github.com/aaalgo/kgraph)
*
* The NNDescent object stores only the neighbor link structure,
* see IndexNNDescent.h for the full index object.
*/
struct VisitedTable;
struct DistanceComputer;
namespace nndescent {
/// One (id, distance) entry of a candidate pool; `flag` marks entries
/// that still need to be processed by the local join.
struct Neighbor {
    int id;
    float distance;
    bool flag;

    Neighbor() = default;

    Neighbor(int id, float distance, bool f)
            : id(id), distance(distance), flag(f) {}

    /// order by ascending distance
    inline bool operator<(const Neighbor& rhs) const {
        return distance < rhs.distance;
    }
};
/// Per-node neighborhood state used while running NNDescent.
struct Nhood {
    std::mutex lock;            ///< guards concurrent updates to this node
    std::vector<Neighbor> pool; // candidate pool (a max heap)
    int M;                      // number of new neighbors to be operated

    std::vector<int> nn_old;  // old neighbors
    std::vector<int> nn_new;  // new neighbors
    std::vector<int> rnn_old; // reverse old neighbors
    std::vector<int> rnn_new; // reverse new neighbors

    Nhood() = default;

    Nhood(int l, int s, std::mt19937& rng, int N);

    Nhood& operator=(const Nhood& other);

    Nhood(const Nhood& other);

    /// insert a candidate (id, dist) into the pool
    void insert(int id, float dist);

    /// local join helper: invokes `callback` on neighbor pairs — see the .cpp
    template <typename C>
    void join(C callback) const;
};
} // namespace nndescent
struct NNDescent {
    using storage_idx_t = int;
    using idx_t = Index::idx_t;

    using KNNGraph = std::vector<nndescent::Nhood>;

    explicit NNDescent(const int d, const int K);

    ~NNDescent();

    void build(DistanceComputer& qdis, const int n, bool verbose);

    void search(
            DistanceComputer& qdis,
            const int topk,
            idx_t* indices,
            float* dists,
            VisitedTable& vt) const;

    void reset();

    /// Initialize the KNN graph randomly
    void init_graph(DistanceComputer& qdis);

    /// Perform NNDescent algorithm
    void nndescent(DistanceComputer& qdis, bool verbose);

    /// Perform local join on each node
    void join(DistanceComputer& qdis);

    /// Sample new neighbors for each node to perform local join later
    void update();

    /// Sample a small number of points to evaluate the quality of KNNG built
    void generate_eval_set(
            DistanceComputer& qdis,
            std::vector<int>& c,
            std::vector<std::vector<int>>& v,
            int N);

    /// Evaluate the quality of KNNG built
    float eval_recall(
            std::vector<int>& ctrl_points,
            std::vector<std::vector<int>>& acc_eval_set);

    bool has_built; ///< graph construction state flag

    int K; // K in KNN graph
    int S; // number of sample neighbors to be updated for each node
    int R; // size of reverse links, 0 means the reverse links will not be used
    int L; // size of the candidate pool in building
    int iter;        // number of iterations to iterate over
    int search_L;    // size of candidate pool in searching
    int random_seed; // random seed for generators

    int d; // dimensions
    int ntotal;

    KNNGraph graph;
    std::vector<int> final_graph; ///< flattened result graph, K per node
};
} // namespace faiss

199
src/3rdlib/faiss/impl/NSG.h Normal file
View File

@ -0,0 +1,199 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#pragma once
#include <memory>
#include <mutex>
#include <vector>
#include <omp.h>
#include <faiss/Index.h>
#include <faiss/impl/AuxIndexStructures.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/utils/Heap.h>
#include <faiss/utils/random.h>
namespace faiss {
/** Implementation of the Navigating Spreading-out Graph (NSG)
* datastructure.
*
* Fast Approximate Nearest Neighbor Search With The
* Navigating Spreading-out Graph
*
* Cong Fu, Chao Xiang, Changxu Wang, Deng Cai, VLDB 2019
*
* This implementation is heavily influenced by the NSG
* implementation by ZJULearning Group
* (https://github.com/zjulearning/nsg)
*
* The NSG object stores only the neighbor link structure, see
* IndexNSG.h for the full index object.
*/
struct DistanceComputer; // from AuxIndexStructures
struct Neighbor;
struct Node;
namespace nsg {
/***********************************************************
 * Graph structure to store a graph.
 *
 * It is represented by an adjacency matrix `data`, where
 * data[i, j] is the j-th neighbor of node i.
 *
 * node_t must be trivially copyable (entries are moved with memcpy).
 ***********************************************************/
template <class node_t>
struct Graph {
    node_t* data;    ///< the flattened adjacency matrix, size N * K
    int K;           ///< nb of neighbors per node
    int N;           ///< total nb of nodes
    bool own_fields; ///< the underlying data owned by itself or not

    // construct from a known graph
    Graph(node_t* data, int N, int K)
            : data(data), K(K), N(N), own_fields(false) {}

    // construct an empty graph
    // NOTE: the newly allocated data needs to be destroyed at destruction time
    Graph(int N, int K) : K(K), N(N), own_fields(true) {
        data = new node_t[N * K];
    }

    // copy constructor
    Graph(const Graph& g) : Graph(g.N, g.K) {
        memcpy(data, g.data, N * K * sizeof(node_t));
    }

    // copy assignment (Rule of Three): with a user-declared destructor and
    // copy constructor, the implicitly generated assignment would copy
    // `data`/`own_fields` memberwise and lead to a double free.
    Graph& operator=(const Graph& g) {
        if (this != &g) {
            if (own_fields) {
                delete[] data;
            }
            N = g.N;
            K = g.K;
            own_fields = true;
            data = new node_t[N * K];
            memcpy(data, g.data, N * K * sizeof(node_t));
        }
        return *this;
    }

    // release the allocated memory if needed
    ~Graph() {
        if (own_fields) {
            delete[] data;
        }
    }

    // access the j-th neighbor of node i
    inline node_t at(int i, int j) const {
        return data[i * K + j];
    }

    // access the j-th neighbor of node i by reference
    inline node_t& at(int i, int j) {
        return data[i * K + j];
    }
};
DistanceComputer* storage_distance_computer(const Index* storage);
} // namespace nsg
struct NSG {
    /// internal storage of vectors (32 bits: this is expensive)
    using storage_idx_t = int;

    /// Faiss results are 64-bit
    using idx_t = Index::idx_t;

    int ntotal; ///< nb of nodes

    /// construction-time parameters
    int R; ///< nb of neighbors per node
    int L; ///< length of the search path at construction time
    int C; ///< candidate pool size at construction time

    // search-time parameters
    int search_L; ///< length of the search path

    int enterpoint; ///< enterpoint

    std::shared_ptr<nsg::Graph<int>> final_graph; ///< NSG graph structure

    bool is_built; ///< NSG is built or not

    RandomGenerator rng; ///< random generator

    explicit NSG(int R = 32);

    // build NSG from a KNN graph
    void build(
            Index* storage,
            idx_t n,
            const nsg::Graph<idx_t>& knn_graph,
            bool verbose);

    // reset the graph
    void reset();

    // search interface
    void search(
            DistanceComputer& dis,
            int k,
            idx_t* I,
            float* D,
            VisitedTable& vt) const;

    // Compute the center point
    void init_graph(Index* storage, const nsg::Graph<idx_t>& knn_graph);

    // Search on a built graph.
    // If collect_fullset is true, the visited nodes will be
    // collected in `fullset`.
    template <bool collect_fullset, class index_t>
    void search_on_graph(
            const nsg::Graph<index_t>& graph,
            DistanceComputer& dis,
            VisitedTable& vt,
            int ep,
            int pool_size,
            std::vector<Neighbor>& retset,
            std::vector<Node>& fullset) const;

    // Add reverse links
    void add_reverse_links(
            int q,
            std::vector<std::mutex>& locks,
            DistanceComputer& dis,
            nsg::Graph<Node>& graph);

    void sync_prune(
            int q,
            std::vector<Node>& pool,
            DistanceComputer& dis,
            VisitedTable& vt,
            const nsg::Graph<idx_t>& knn_graph,
            nsg::Graph<Node>& graph);

    void link(
            Index* storage,
            const nsg::Graph<idx_t>& knn_graph,
            nsg::Graph<Node>& graph,
            bool verbose);

    // make NSG be fully connected
    int tree_grow(Index* storage, std::vector<int>& degrees);

    // count the size of the connected component
    // using depth first search start by root
    int dfs(VisitedTable& vt, int root, int cnt) const;

    // attach one unlinked node
    int attach_unlinked(
            Index* storage,
            VisitedTable& vt,
            VisitedTable& vt2,
            std::vector<int>& degrees);

    // check the integrity of the NSG built
    void check_graph() const;
};
} // namespace faiss

View File

@ -0,0 +1,155 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_POLYSEMOUS_TRAINING_INCLUDED
#define FAISS_POLYSEMOUS_TRAINING_INCLUDED
#include <faiss/impl/ProductQuantizer.h>
namespace faiss {
/// parameters used for the simulated annealing method
struct SimulatedAnnealingParameters {
    // optimization parameters
    double init_temperature;  // init probability of accepting a bad swap
    double temperature_decay; // at each iteration the temp is multiplied by
                              // this
    int n_iter;               // nb of iterations
    int n_redo;               // nb of runs of the simulation
    int seed;                 // random seed
    int verbose;
    bool only_bit_flips; // restrict permutation changes to bit flips
    bool init_random;    // initialize with a random permutation (not identity)

    // set reasonable defaults
    SimulatedAnnealingParameters();
};
/// abstract class for the loss function
struct PermutationObjective {
    int n; ///< size of the permutation

    /// evaluate the cost of a full permutation `perm` of size n
    virtual double compute_cost(const int* perm) const = 0;

    // what would the cost update be if iw and jw were swapped?
    // default implementation just computes both and computes the difference
    virtual double cost_update(const int* perm, int iw, int jw) const;

    virtual ~PermutationObjective() {}
};
/// Loss that favors permutations reproducing a target distance table.
struct ReproduceDistancesObjective : PermutationObjective {
    double dis_weight_factor;

    static double sqr(double x) {
        return x * x;
    }

    // weighting of distances: it is more important to reproduce small
    // distances well
    double dis_weight(double x) const;

    std::vector<double> source_dis; ///< "real" corrected distances (size n^2)
    const double* target_dis;       ///< wanted distances (size n^2)
    std::vector<double> weights;    ///< weights for each distance (size n^2)

    double get_source_dis(int i, int j) const;

    // cost = quadratic difference between actual distance and Hamming distance
    double compute_cost(const int* perm) const override;

    // what would the cost update be if iw and jw were swapped?
    // computed in O(n) instead of O(n^2) for the full re-computation
    double cost_update(const int* perm, int iw, int jw) const override;

    ReproduceDistancesObjective(
            int n,
            const double* source_dis_in,
            const double* target_dis_in,
            double dis_weight_factor);

    static void compute_mean_stdev(
            const double* tab,
            size_t n2,
            double* mean_out,
            double* stddev_out);

    void set_affine_target_dis(const double* source_dis_in);

    ~ReproduceDistancesObjective() override {}
};
struct RandomGenerator;
/// Simulated annealing optimization algorithm for permutations.
struct SimulatedAnnealingOptimizer : SimulatedAnnealingParameters {
    PermutationObjective* obj; ///< loss to minimize, not owned
    int n;                     ///< size of the permutation
    FILE* logfile;             /// logs values of the cost function

    SimulatedAnnealingOptimizer(
            PermutationObjective* obj,
            const SimulatedAnnealingParameters& p);
    RandomGenerator* rnd;

    /// remember initial cost of optimization
    double init_cost;

    // main entry point. Perform the optimization loop, starting from
    // and modifying permutation in-place
    double optimize(int* perm);

    // run the optimization and return the best result in best_perm
    double run_optimization(int* best_perm);

    virtual ~SimulatedAnnealingOptimizer();
};
/// optimizes the order of indices in a ProductQuantizer
struct PolysemousTraining : SimulatedAnnealingParameters {
    enum Optimization_type_t {
        OT_None,
        OT_ReproduceDistances_affine, ///< default
        OT_Ranking_weighted_diff      ///< same as _2, but use rank of y+ - rank of
                                      ///< y-
    };
    Optimization_type_t optimization_type;

    /** use 1/4 of the training points for the optimization, with
     * max. ntrain_permutation. If ntrain_permutation == 0: train on
     * centroids */
    int ntrain_permutation;
    double dis_weight_factor; ///< decay of exp that weights distance loss

    /// refuse to train if it would require more than that amount of RAM
    size_t max_memory;

    // filename pattern for the logging of iterations
    std::string log_pattern;

    // sets default values
    PolysemousTraining();

    /// reorder the centroids so that the Hamming distance becomes a
    /// good approximation of the SDC distance (called by train)
    void optimize_pq_for_hamming(ProductQuantizer& pq, size_t n, const float* x)
            const;

    /// called by optimize_pq_for_hamming
    void optimize_ranking(ProductQuantizer& pq, size_t n, const float* x) const;

    /// called by optimize_pq_for_hamming
    void optimize_reproduce_distances(ProductQuantizer& pq) const;

    /// make sure we don't blow up the memory
    size_t memory_usage_per_thread(const ProductQuantizer& pq) const;
};
} // namespace faiss
#endif

View File

@ -0,0 +1,116 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
namespace faiss {
/// Start a bit-packed write at `code` with `offset` bits already used in
/// the first byte; those low bits are preserved in the staging register.
inline PQEncoderGeneric::PQEncoderGeneric(
        uint8_t* code,
        int nbits,
        uint8_t offset)
        : code(code), offset(offset), nbits(nbits), reg(0) {
    assert(nbits <= 64);
    if (offset > 0) {
        // keep the bits below `offset` so they are not clobbered on flush
        reg = (*code & ((1 << offset) - 1));
    }
}
/// Append one nbits-wide value x to the bit stream.
inline void PQEncoderGeneric::encode(uint64_t x) {
    // pack the low bits of x into the partially filled staging byte
    reg |= (uint8_t)(x << offset);
    x >>= (8 - offset);
    if (offset + nbits >= 8) {
        // staging byte is now full: flush it, then write the remaining
        // whole bytes of x
        *code++ = reg;
        for (int i = 0; i < (nbits - (8 - offset)) / 8; ++i) {
            *code++ = (uint8_t)x;
            x >>= 8;
        }
        // keep the leftover (< 8) bits staged for the next call / the dtor
        offset += nbits;
        offset &= 7;
        reg = (uint8_t)x;
    } else {
        // the value fit entirely in the staging byte
        offset += nbits;
    }
}
/// Flush the last partially filled byte, if any.
inline PQEncoderGeneric::~PQEncoderGeneric() {
    if (offset > 0) {
        *code = reg;
    }
}
// Specialized encoder for exactly 8 bits per value: one byte each, no
// bit shuffling required.
inline PQEncoder8::PQEncoder8(uint8_t* code, int nbits) : code(code) {
    assert(8 == nbits);
}

inline void PQEncoder8::encode(uint64_t x) {
    *code = static_cast<uint8_t>(x);
    ++code;
}

// Specialized encoder for exactly 16 bits per value: the output buffer is
// reinterpreted as an array of uint16.
inline PQEncoder16::PQEncoder16(uint8_t* code, int nbits)
        : code(reinterpret_cast<uint16_t*>(code)) {
    assert(16 == nbits);
}

inline void PQEncoder16::encode(uint64_t x) {
    *code = static_cast<uint16_t>(x);
    ++code;
}
/// Start reading nbits-wide values from the byte stream at `code`.
/// `mask` keeps only the low nbits of each decoded value.
inline PQDecoderGeneric::PQDecoderGeneric(const uint8_t* code, int nbits)
        : code(code),
          offset(0),
          nbits(nbits),
          mask((1ull << nbits) - 1),
          reg(0) {
    assert(nbits <= 64);
}
/// Read the next nbits-wide value from the stream (inverse of
/// PQEncoderGeneric::encode).
inline uint64_t PQDecoderGeneric::decode() {
    if (offset == 0) {
        // aligned on a byte boundary: load a fresh staging byte
        reg = *code;
    }
    // bits still available in the current staging byte
    uint64_t c = (reg >> offset);
    if (offset + nbits >= 8) {
        // the value spans several bytes: gather the middle whole bytes,
        // e = number of bits of c filled so far
        uint64_t e = 8 - offset;
        ++code;
        for (int i = 0; i < (nbits - (8 - offset)) / 8; ++i) {
            c |= ((uint64_t)(*code++) << e);
            e += 8;
        }
        // then take the leading bits of the next byte, which stays staged
        offset += nbits;
        offset &= 7;
        if (offset > 0) {
            reg = *code;
            c |= ((uint64_t)reg << e);
        }
    } else {
        offset += nbits;
    }
    return c & mask;
}
// Fast path for 8-bit codes: each value is exactly one byte.
inline PQDecoder8::PQDecoder8(const uint8_t* code, int nbits_in) : code(code) {
    assert(8 == nbits_in);
}

inline uint64_t PQDecoder8::decode() {
    const uint64_t value = *code;
    ++code;
    return value;
}

// Fast path for 16-bit codes: the input buffer is read as uint16 values.
inline PQDecoder16::PQDecoder16(const uint8_t* code, int nbits_in)
        : code(reinterpret_cast<const uint16_t*>(code)) {
    assert(16 == nbits_in);
}

inline uint64_t PQDecoder16::decode() {
    const uint64_t value = *code;
    ++code;
    return value;
}
} // namespace faiss

View File

@ -0,0 +1,228 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_PRODUCT_QUANTIZER_H
#define FAISS_PRODUCT_QUANTIZER_H
#include <stdint.h>
#include <vector>
#include <faiss/Clustering.h>
#include <faiss/utils/Heap.h>
namespace faiss {
/** Product Quantizer. Implemented only for METRIC_L2 */
struct ProductQuantizer {
    using idx_t = Index::idx_t;
    size_t d; ///< size of the input vectors
    size_t M; ///< number of subquantizers
    size_t nbits; ///< number of bits per quantization index
    // values derived from the above
    size_t dsub; ///< dimensionality of each subvector
    size_t code_size; ///< bytes per indexed vector
    size_t ksub; ///< number of centroids for each subquantizer
    bool verbose; ///< verbose during training?
    /// initialization
    enum train_type_t {
        Train_default,
        Train_hot_start, ///< the centroids are already initialized
        Train_shared, ///< share dictionary across PQ segments
        Train_hypercube, ///< initialize centroids with nbits-D hypercube
        Train_hypercube_pca, ///< initialize centroids with nbits-D hypercube
    };
    train_type_t train_type;
    ClusteringParameters cp; ///< parameters used during clustering
    /// if non-NULL, use this index for assignment (should be of size
    /// d / M)
    Index* assign_index;
    /// Centroid table, size M * ksub * dsub
    std::vector<float> centroids;
    /// return the centroids associated with subvector m, centroid i
    float* get_centroids(size_t m, size_t i) {
        return &centroids[(m * ksub + i) * dsub];
    }
    /// const version of get_centroids
    const float* get_centroids(size_t m, size_t i) const {
        return &centroids[(m * ksub + i) * dsub];
    }
    // Train the product quantizer on a set of points. A clustering
    // can be set on input to define non-default clustering parameters
    void train(int n, const float* x);
    ProductQuantizer(
            size_t d, /* dimensionality of the input vectors */
            size_t M, /* number of subquantizers */
            size_t nbits); /* number of bit per subvector index */
    ProductQuantizer();
    /// compute derived values when d, M and nbits have been set
    void set_derived_values();
    /// Define the centroids for subquantizer m
    void set_params(const float* centroids, int m);
    /// Quantize one vector with the product quantizer
    void compute_code(const float* x, uint8_t* code) const;
    /// same as compute_code for several vectors
    void compute_codes(const float* x, uint8_t* codes, size_t n) const;
    /// speed up code assignment using assign_index
    /// (non-const because the index is changed)
    void compute_codes_with_assign_index(
            const float* x,
            uint8_t* codes,
            size_t n);
    /// decode a vector from a given code (or n vectors if third argument)
    void decode(const uint8_t* code, float* x) const;
    void decode(const uint8_t* code, float* x, size_t n) const;
    /// If we happen to have the distance tables precomputed, this is
    /// more efficient to compute the codes.
    void compute_code_from_distance_table(const float* tab, uint8_t* code)
            const;
    /** Compute distance table for one vector.
     *
     * The distance table for x = [x_0 x_1 .. x_(M-1)] is a M * ksub
     * matrix that contains
     *
     *   dis_table (m, j) = || x_m - c_(m, j)||^2
     *   for m = 0..M-1 and j = 0 .. ksub - 1
     *
     * where c_(m, j) is the centroid no j of sub-quantizer m.
     *
     * @param x         input vector size d
     * @param dis_table output table, size M * ksub
     */
    void compute_distance_table(const float* x, float* dis_table) const;
    /// same as compute_distance_table, but with inner products
    void compute_inner_prod_table(const float* x, float* dis_table) const;
    /** compute distance table for several vectors
     * @param nx        nb of input vectors
     * @param x         input vector size nx * d
     * @param dis_table output table, size nx * M * ksub
     */
    void compute_distance_tables(size_t nx, const float* x, float* dis_tables)
            const;
    void compute_inner_prod_tables(size_t nx, const float* x, float* dis_tables)
            const;
    /** perform a search (L2 distance)
     * @param x        query vectors, size nx * d
     * @param nx       nb of queries
     * @param codes    database codes, size ncodes * code_size
     * @param ncodes   nb of nb vectors
     * @param res      heap array to store results (nh == nx)
     * @param init_finalize_heap  initialize heap (input) and sort (output)?
     */
    void search(
            const float* x,
            size_t nx,
            const uint8_t* codes,
            const size_t ncodes,
            float_maxheap_array_t* res,
            bool init_finalize_heap = true) const;
    /** same as search, but with inner product similarity */
    void search_ip(
            const float* x,
            size_t nx,
            const uint8_t* codes,
            const size_t ncodes,
            float_minheap_array_t* res,
            bool init_finalize_heap = true) const;
    /// Symmetric Distance Table
    std::vector<float> sdc_table;
    // initialize the SDC table from the centroids
    void compute_sdc_table();
    /// search using the symmetric distance table (codes vs. codes)
    void search_sdc(
            const uint8_t* qcodes,
            size_t nq,
            const uint8_t* bcodes,
            const size_t ncodes,
            float_maxheap_array_t* res,
            bool init_finalize_heap = true) const;
};
/*************************************************
* Objects to encode / decode strings of bits
*************************************************/
/** Bit-packing writer for codes whose width is not a whole number of
 * bytes. The partially filled byte is kept in `reg` and flushed by the
 * destructor (see ProductQuantizer-inl.h). */
struct PQEncoderGeneric {
    uint8_t* code; ///< code for this vector
    uint8_t offset; ///< number of bits of *code already occupied
    const int nbits; ///< number of bits per subquantizer index
    uint8_t reg; ///< staging byte for bits not yet written out
    PQEncoderGeneric(uint8_t* code, int nbits, uint8_t offset = 0);
    void encode(uint64_t x);
    ~PQEncoderGeneric();
};

/// Specialized encoder for 8-bit codes (one byte per value)
struct PQEncoder8 {
    uint8_t* code;
    PQEncoder8(uint8_t* code, int nbits);
    void encode(uint64_t x);
};

/// Specialized encoder for 16-bit codes (one uint16 per value)
struct PQEncoder16 {
    uint16_t* code;
    PQEncoder16(uint8_t* code, int nbits);
    void encode(uint64_t x);
};

/** Bit-unpacking reader matching PQEncoderGeneric. */
struct PQDecoderGeneric {
    const uint8_t* code;
    uint8_t offset; ///< bit position inside the current byte
    const int nbits;
    const uint64_t mask; ///< (1 << nbits) - 1, applied to each decoded value
    uint8_t reg; ///< current staging byte
    PQDecoderGeneric(const uint8_t* code, int nbits);
    uint64_t decode();
};

/// Specialized decoder for 8-bit codes
struct PQDecoder8 {
    static const int nbits = 8;
    const uint8_t* code;
    PQDecoder8(const uint8_t* code, int nbits);
    uint64_t decode();
};

/// Specialized decoder for 16-bit codes
struct PQDecoder16 {
    static const int nbits = 16;
    const uint16_t* code;
    PQDecoder16(const uint8_t* code, int nbits);
    uint64_t decode();
};
} // namespace faiss
#include <faiss/impl/ProductQuantizer-inl.h>
#endif

View File

@ -0,0 +1,182 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <cstdint>
#include <vector>
#include <faiss/Clustering.h>
#include <faiss/impl/AdditiveQuantizer.h>
namespace faiss {
/** Residual quantizer with variable number of bits per sub-quantizer
*
* The residual centroids are stored in a big cumulative centroid table.
* The codes are represented either as a non-compact table of size (n, M) or
* as the compact output (n, code_size).
*/
struct ResidualQuantizer : AdditiveQuantizer {
    /// initialization
    enum train_type_t {
        Train_default, ///< regular k-means
        Train_progressive_dim, ///< progressive dim clustering
    };
    train_type_t train_type;
    // set this bit on train_type if beam is to be trained only on the
    // first element of the beam (faster but less accurate)
    static const int Train_top_beam = 1024;
    // set this bit to not automatically compute the codebook tables
    // after training
    static const int Skip_codebook_tables = 2048;
    /// beam size used for training and for encoding
    int max_beam_size;
    /// use LUT for beam search
    int use_beam_LUT;
    /// distance matrices with beam search can get large, so use this
    /// to batch computations at encoding time.
    size_t max_mem_distances;
    /// clustering parameters
    ProgressiveDimClusteringParameters cp;
    /// if non-NULL, use this index for assignment
    ProgressiveDimIndexFactory* assign_index_factory;
    ResidualQuantizer(
            size_t d,
            const std::vector<size_t>& nbits,
            Search_type_t search_type = ST_decompress);
    ResidualQuantizer(
            size_t d, /* dimensionality of the input vectors */
            size_t M, /* number of subquantizers */
            size_t nbits, /* number of bit per subvector index */
            Search_type_t search_type = ST_decompress);
    ResidualQuantizer();
    // Train the residual quantizer
    void train(size_t n, const float* x) override;
    /** Encode a set of vectors
     *
     * @param x      vectors to encode, size n * d
     * @param codes  output codes, size n * code_size
     */
    void compute_codes(const float* x, uint8_t* codes, size_t n) const override;
    /** lower-level encode function
     *
     * @param n              number of vectors to handle
     * @param residuals      vectors to encode, size (n, beam_size, d)
     * @param beam_size      input beam size
     * @param new_beam_size  output beam size (should be <= K * beam_size)
     * @param new_codes      output codes, size (n, new_beam_size, m + 1)
     * @param new_residuals  output residuals, size (n, new_beam_size, d)
     * @param new_distances  output distances, size (n, new_beam_size)
     */
    void refine_beam(
            size_t n,
            size_t beam_size,
            const float* residuals,
            int new_beam_size,
            int32_t* new_codes,
            float* new_residuals = nullptr,
            float* new_distances = nullptr) const;
    /// LUT-based variant of refine_beam (used with use_beam_LUT)
    void refine_beam_LUT(
            size_t n,
            const float* query_norms,
            const float* query_cp,
            int new_beam_size,
            int32_t* new_codes,
            float* new_distances = nullptr) const;
    /** Beam search can consume a lot of memory. This function estimates the
     * amount of mem used by refine_beam to adjust the batch size
     *
     * @param beam_size if != -1, override the beam size
     */
    size_t memory_per_point(int beam_size = -1) const;
    /** Cross products used in codebook tables
     *
     * These are used to keep track of norms of centroids.
     */
    void compute_codebook_tables();
    /// dot products of all codebook vectors with each other
    /// size total_codebook_size * total_codebook_size
    std::vector<float> codebook_cross_products;
    /// norms of all vectors
    std::vector<float> cent_norms;
};
/** Encode a residual by sampling from a centroid table.
 *
 * This is a single encoding step of the residual quantizer.
 * It allows low-level access to the encoding function, exposed mainly for unit
 * tests.
 *
 * @param n              number of vectors to handle
 * @param residuals      vectors to encode, size (n, beam_size, d)
 * @param cent           centroids, size (K, d)
 * @param beam_size      input beam size
 * @param m              size of the codes for the previous encoding steps
 * @param codes          code array for the previous steps of the beam (n,
 *                       beam_size, m)
 * @param new_beam_size  output beam size (should be <= K * beam_size)
 * @param new_codes      output codes, size (n, new_beam_size, m + 1)
 * @param new_residuals  output residuals, size (n, new_beam_size, d)
 * @param new_distances  output distances, size (n, new_beam_size)
 * @param assign_index   if non-NULL, will be used to perform assignment
 */
void beam_search_encode_step(
        size_t d,
        size_t K,
        const float* cent,
        size_t n,
        size_t beam_size,
        const float* residuals,
        size_t m,
        const int32_t* codes,
        size_t new_beam_size,
        int32_t* new_codes,
        float* new_residuals,
        float* new_distances,
        Index* assign_index = nullptr);

/** Encode a set of vectors using their dot products with the codebooks
 * (LUT-based variant of beam_search_encode_step; sizes are given next to
 * each parameter).
 */
void beam_search_encode_step_tab(
        size_t K,
        size_t n,
        size_t beam_size,                  // input sizes
        const float* codebook_cross_norms, // size K * ldc
        size_t ldc,                        // >= K
        const uint64_t* codebook_offsets,  // m
        const float* query_cp,             // size n * ldqc
        size_t ldqc,                       // >= K
        const float* cent_norms_i,         // size K
        size_t m,
        const int32_t* codes,     // n * beam_size * m
        const float* distances,   // n * beam_size
        size_t new_beam_size,
        int32_t* new_codes,       // n * new_beam_size * (m + 1)
        float* new_distances);    // n * new_beam_size
}; // namespace faiss

View File

@ -0,0 +1,416 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
/*
* Structures that collect search results from distance computations
*/
#pragma once
#include <faiss/impl/AuxIndexStructures.h>
#include <faiss/utils/Heap.h>
#include <faiss/utils/partitioning.h>
namespace faiss {
/*****************************************************************
* Heap based result handler
*****************************************************************/
/** Collects the k best results for each of nq queries into the
 * caller-provided arrays heap_dis_tab / heap_ids_tab (each of size nq * k),
 * maintained as binary heaps ordered by the comparator C. */
template <class C>
struct HeapResultHandler {
    using T = typename C::T;
    using TI = typename C::TI;
    int nq;
    T* heap_dis_tab;
    TI* heap_ids_tab;
    int64_t k; // number of results to keep
    HeapResultHandler(size_t nq, T* heap_dis_tab, TI* heap_ids_tab, size_t k)
            : nq(nq),
              heap_dis_tab(heap_dis_tab),
              heap_ids_tab(heap_ids_tab),
              k(k) {}
    /******************************************************
     * API for 1 result at a time (each SingleResultHandler is
     * called from 1 thread)
     */
    struct SingleResultHandler {
        HeapResultHandler& hr;
        size_t k;
        T* heap_dis;
        TI* heap_ids;
        T thresh; // cached heap top: entry point for the accept test
        SingleResultHandler(HeapResultHandler& hr) : hr(hr), k(hr.k) {}
        /// begin results for query # i
        void begin(size_t i) {
            heap_dis = hr.heap_dis_tab + i * k;
            heap_ids = hr.heap_ids_tab + i * k;
            heap_heapify<C>(k, heap_dis, heap_ids);
            thresh = heap_dis[0];
        }
        /// add one result for query i
        void add_result(T dis, TI idx) {
            if (C::cmp(heap_dis[0], dis)) {
                heap_replace_top<C>(k, heap_dis, heap_ids, dis, idx);
                thresh = heap_dis[0];
            }
        }
        /// series of results for query i is done
        void end() {
            heap_reorder<C>(k, heap_dis, heap_ids);
        }
    };
    /******************************************************
     * API for multiple results (called from 1 thread)
     */
    size_t i0, i1; // current query range, set by begin_multiple
    /// begin
    void begin_multiple(size_t i0, size_t i1) {
        this->i0 = i0;
        this->i1 = i1;
        for (size_t i = i0; i < i1; i++) {
            heap_heapify<C>(k, heap_dis_tab + i * k, heap_ids_tab + i * k);
        }
    }
    /// add results for query i0..i1 and j0..j1
    void add_results(size_t j0, size_t j1, const T* dis_tab) {
#pragma omp parallel for
        for (int64_t i = i0; i < i1; i++) {
            T* heap_dis = heap_dis_tab + i * k;
            TI* heap_ids = heap_ids_tab + i * k;
            // dis_tab is an (i1 - i0) x (j1 - j0) row-major matrix; shift
            // the row pointer so that dis_tab_i[j] is valid for j in [j0, j1)
            const T* dis_tab_i = dis_tab + (j1 - j0) * (i - i0) - j0;
            T thresh = heap_dis[0];
            for (size_t j = j0; j < j1; j++) {
                T dis = dis_tab_i[j];
                if (C::cmp(thresh, dis)) {
                    heap_replace_top<C>(k, heap_dis, heap_ids, dis, j);
                    thresh = heap_dis[0];
                }
            }
        }
    }
    /// series of results for queries i0..i1 is done
    void end_multiple() {
        // maybe parallel for
        for (size_t i = i0; i < i1; i++) {
            heap_reorder<C>(k, heap_dis_tab + i * k, heap_ids_tab + i * k);
        }
    }
};
/*****************************************************************
* Reservoir result handler
*
 * A reservoir is a result array of size capacity > n (the number of requested
 * results); all results below a threshold are stored in an arbitrary order.
 * When the capacity is reached, a new threshold is chosen by partitioning
 * the distance array.
*****************************************************************/
/// Reservoir for a single query: collects candidates below the current
/// threshold in arbitrary order; when the storage fills up, the threshold
/// is tightened by partitioning the stored distances (shrink_fuzzy).
template <class C>
struct ReservoirTopN {
    using T = typename C::T;
    using TI = typename C::TI;
    T* vals; // stored distances (size capacity, not owned)
    TI* ids; // stored ids (size capacity, not owned)
    size_t i; // number of stored elements
    size_t n; // number of requested elements
    size_t capacity; // size of storage
    T threshold; // current threshold
    ReservoirTopN() {}
    ReservoirTopN(size_t n, size_t capacity, T* vals, TI* ids)
            : vals(vals), ids(ids), i(0), n(n), capacity(capacity) {
        assert(n < capacity);
        threshold = C::neutral();
    }
    /// consider one (val, id) candidate
    void add(T val, TI id) {
        if (C::cmp(threshold, val)) {
            if (i == capacity) {
                shrink_fuzzy();
            }
            vals[i] = val;
            ids[i] = id;
            i++;
        }
    }
    // reduce storage from capacity to anything
    // between n and (capacity + n) / 2
    void shrink_fuzzy() {
        assert(i == capacity);
        threshold = partition_fuzzy<C>(
                vals, ids, capacity, n, (capacity + n) / 2, &i);
    }
    /// move the collected results into the output arrays, sorted, padding
    /// with empty entries when fewer than n results were collected
    void to_result(T* heap_dis, TI* heap_ids) const {
        // fix: the loop variable was a signed int compared against a size_t
        // bound, and std::min was re-evaluated on every iteration; use an
        // unsigned variable and hoist the invariant bound
        const size_t nres = std::min(i, n);
        for (size_t j = 0; j < nres; j++) {
            heap_push<C>(j + 1, heap_dis, heap_ids, vals[j], ids[j]);
        }
        if (i < n) {
            heap_reorder<C>(i, heap_dis, heap_ids);
            // add empty results
            heap_heapify<C>(n - i, heap_dis + i, heap_ids + i);
        } else {
            // add remaining elements
            heap_addn<C>(n, heap_dis, heap_ids, vals + n, ids + n, i - n);
            heap_reorder<C>(n, heap_dis, heap_ids);
        }
    }
};
/** Result handler that accumulates candidates in per-query reservoirs
 * (ReservoirTopN) and converts them to sorted heap output only at the end.
 * Output goes to the caller-provided heap_dis_tab / heap_ids_tab arrays
 * (size nq * k each). */
template <class C>
struct ReservoirResultHandler {
    using T = typename C::T;
    using TI = typename C::TI;
    int nq;
    T* heap_dis_tab;
    TI* heap_ids_tab;
    int64_t k; // number of results to keep
    size_t capacity; // capacity of the reservoirs
    ReservoirResultHandler(
            size_t nq,
            T* heap_dis_tab,
            TI* heap_ids_tab,
            size_t k)
            : nq(nq),
              heap_dis_tab(heap_dis_tab),
              heap_ids_tab(heap_ids_tab),
              k(k) {
        // double then round up to multiple of 16 (for SIMD alignment)
        capacity = (2 * k + 15) & ~15;
    }
    /******************************************************
     * API for 1 result at a time (each SingleResultHandler is
     * called from 1 thread)
     */
    struct SingleResultHandler {
        ReservoirResultHandler& hr;
        // local reservoir storage, sized to the handler's capacity
        std::vector<T> reservoir_dis;
        std::vector<TI> reservoir_ids;
        ReservoirTopN<C> res1;
        SingleResultHandler(ReservoirResultHandler& hr)
                : hr(hr),
                  reservoir_dis(hr.capacity),
                  reservoir_ids(hr.capacity) {}
        size_t i; // current query index, set by begin
        /// begin results for query # i
        void begin(size_t i) {
            res1 = ReservoirTopN<C>(
                    hr.k,
                    hr.capacity,
                    reservoir_dis.data(),
                    reservoir_ids.data());
            this->i = i;
        }
        /// add one result for query i
        void add_result(T dis, TI idx) {
            res1.add(dis, idx);
        }
        /// series of results for query i is done
        void end() {
            T* heap_dis = hr.heap_dis_tab + i * hr.k;
            TI* heap_ids = hr.heap_ids_tab + i * hr.k;
            res1.to_result(heap_dis, heap_ids);
        }
    };
    /******************************************************
     * API for multiple results (called from 1 thread)
     */
    size_t i0, i1; // current query range, set by begin_multiple
    std::vector<T> reservoir_dis;
    std::vector<TI> reservoir_ids;
    std::vector<ReservoirTopN<C>> reservoirs; // one reservoir per query
    /// begin
    void begin_multiple(size_t i0, size_t i1) {
        this->i0 = i0;
        this->i1 = i1;
        reservoir_dis.resize((i1 - i0) * capacity);
        reservoir_ids.resize((i1 - i0) * capacity);
        reservoirs.clear();
        for (size_t i = i0; i < i1; i++) {
            reservoirs.emplace_back(
                    k,
                    capacity,
                    reservoir_dis.data() + (i - i0) * capacity,
                    reservoir_ids.data() + (i - i0) * capacity);
        }
    }
    /// add results for query i0..i1 and j0..j1
    void add_results(size_t j0, size_t j1, const T* dis_tab) {
        // maybe parallel for
#pragma omp parallel for
        for (int64_t i = i0; i < i1; i++) {
            ReservoirTopN<C>& reservoir = reservoirs[i - i0];
            // row pointer shifted so dis_tab_i[j] is valid for j in [j0, j1)
            const T* dis_tab_i = dis_tab + (j1 - j0) * (i - i0) - j0;
            for (size_t j = j0; j < j1; j++) {
                T dis = dis_tab_i[j];
                reservoir.add(dis, j);
            }
        }
    }
    /// series of results for queries i0..i1 is done
    void end_multiple() {
        // maybe parallel for
        for (size_t i = i0; i < i1; i++) {
            reservoirs[i - i0].to_result(
                    heap_dis_tab + i * k, heap_ids_tab + i * k);
        }
    }
};
/*****************************************************************
* Result handler for range searches
*****************************************************************/
/** Result handler for range search: keeps every result whose distance
 * passes the radius test (C::cmp(radius, dis)), accumulated into
 * RangeSearchPartialResult structures that are merged on destruction. */
template <class C>
struct RangeSearchResultHandler {
    using T = typename C::T;
    using TI = typename C::TI;
    RangeSearchResult* res;
    float radius;
    RangeSearchResultHandler(RangeSearchResult* res, float radius)
            : res(res), radius(radius) {}
    /******************************************************
     * API for 1 result at a time (each SingleResultHandler is
     * called from 1 thread)
     ******************************************************/
    struct SingleResultHandler {
        // almost the same interface as RangeSearchResultHandler
        RangeSearchPartialResult pres;
        float radius;
        RangeQueryResult* qr = nullptr;
        SingleResultHandler(RangeSearchResultHandler& rh)
                : pres(rh.res), radius(rh.radius) {}
        /// begin results for query # i
        void begin(size_t i) {
            qr = &pres.new_result(i);
        }
        /// add one result for query i
        void add_result(T dis, TI idx) {
            if (C::cmp(radius, dis)) {
                qr->add(dis, idx);
            }
        }
        /// series of results for query i is done
        void end() {}
        ~SingleResultHandler() {
            pres.finalize();
        }
    };
    /******************************************************
     * API for multiple results (called from 1 thread)
     ******************************************************/
    size_t i0, i1; // current query range, set by begin_multiple
    std::vector<RangeSearchPartialResult*> partial_results;
    std::vector<size_t> j0s; // j0 value each partial result was created for
    int pr = 0; // index of the next partial result to reuse
    /// begin
    void begin_multiple(size_t i0, size_t i1) {
        this->i0 = i0;
        this->i1 = i1;
    }
    /// add results for query i0..i1 and j0..j1
    void add_results(size_t j0, size_t j1, const T* dis_tab) {
        RangeSearchPartialResult* pres;
        // there is one RangeSearchPartialResult structure per j0
        // (= block of columns of the large distance matrix)
        // it is a bit tricky to find the proper PartialResult structure
        // because the inner loop is on db not on queries.
        if (pr < j0s.size() && j0 == j0s[pr]) {
            pres = partial_results[pr];
            pr++;
        } else if (j0 == 0 && j0s.size() > 0) {
            pr = 0;
            pres = partial_results[pr];
            pr++;
        } else { // did not find this j0
            pres = new RangeSearchPartialResult(res);
            partial_results.push_back(pres);
            j0s.push_back(j0);
            pr = partial_results.size();
        }
        for (size_t i = i0; i < i1; i++) {
            const float* ip_line = dis_tab + (i - i0) * (j1 - j0);
            RangeQueryResult& qres = pres->new_result(i);
            for (size_t j = j0; j < j1; j++) {
                float dis = *ip_line++;
                if (C::cmp(radius, dis)) {
                    qres.add(dis, j);
                }
            }
        }
    }
    void end_multiple() {}
    ~RangeSearchResultHandler() {
        // merge all partial results into the final RangeSearchResult
        if (partial_results.size() > 0) {
            RangeSearchPartialResult::merge(partial_results);
        }
    }
};
} // namespace faiss

View File

@ -0,0 +1,128 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#pragma once
#include <faiss/IndexIVF.h>
#include <faiss/impl/AuxIndexStructures.h>
namespace faiss {
/**
* The uniform quantizer has a range [vmin, vmax]. The range can be
* the same for all dimensions (uniform) or specific per dimension
* (default).
*/
struct ScalarQuantizer {
    enum QuantizerType {
        QT_8bit, ///< 8 bits per component
        QT_4bit, ///< 4 bits per component
        QT_8bit_uniform, ///< same, shared range for all dimensions
        QT_4bit_uniform,
        QT_fp16,
        QT_8bit_direct, ///< fast indexing of uint8s
        QT_6bit, ///< 6 bits per component
    };
    QuantizerType qtype;
    /** The uniform encoder can estimate the range of representable
     * values of the uniform encoder using different statistics. Here
     * rs = rangestat_arg */
    enum RangeStat {
        RS_minmax, ///< [min - rs*(max-min), max + rs*(max-min)]
        RS_meanstd, ///< [mean - std * rs, mean + std * rs]
        RS_quantiles, ///< [Q(rs), Q(1-rs)]
        RS_optim, ///< alternate optimization of reconstruction error
    };
    RangeStat rangestat;
    float rangestat_arg; ///< the rs parameter used by rangestat
    /// dimension of input vectors
    size_t d;
    /// bits per scalar code
    size_t bits;
    /// bytes per vector
    size_t code_size;
    /// trained values (including the range)
    std::vector<float> trained;
    ScalarQuantizer(size_t d, QuantizerType qtype);
    ScalarQuantizer();
    /// updates internal values based on qtype and d
    void set_derived_sizes();
    /// train the quantizer on n vectors x
    void train(size_t n, const float* x);
    /// Used by an IVF index to train based on the residuals
    void train_residual(
            size_t n,
            const float* x,
            Index* quantizer,
            bool by_residual,
            bool verbose);
    /** Encode a set of vectors
     *
     * @param x      vectors to encode, size n * d
     * @param codes  output codes, size n * code_size
     */
    void compute_codes(const float* x, uint8_t* codes, size_t n) const;
    /** Decode a set of vectors
     *
     * @param codes  codes to decode, size n * code_size
     * @param x      output vectors, size n * d
     */
    void decode(const uint8_t* code, float* x, size_t n) const;
    /*****************************************************
     * Objects that provide methods for encoding/decoding, distance
     * computation and inverted list scanning
     *****************************************************/
    struct Quantizer {
        // encodes one vector. Assumes code is filled with 0s on input!
        virtual void encode_vector(const float* x, uint8_t* code) const = 0;
        virtual void decode_vector(const uint8_t* code, float* x) const = 0;
        virtual ~Quantizer() {}
    };
    /// returns a concrete Quantizer for qtype (caller takes ownership)
    Quantizer* select_quantizer() const;
    struct SQDistanceComputer : DistanceComputer {
        const float* q; ///< current query vector
        const uint8_t* codes; ///< encoded database vectors
        size_t code_size;
        SQDistanceComputer() : q(nullptr), codes(nullptr), code_size(0) {}
        /// distance between the current query and one encoded vector
        virtual float query_to_code(const uint8_t* code) const = 0;
    };
    SQDistanceComputer* get_distance_computer(
            MetricType metric = METRIC_L2) const;
    InvertedListScanner* select_InvertedListScanner(
            MetricType mt,
            const Index* quantizer,
            bool store_pairs,
            bool by_residual = false) const;
};
} // namespace faiss

View File

@ -0,0 +1,190 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <faiss/impl/FaissAssert.h>
#include <exception>
#include <iostream>
namespace faiss {
/// Convenience constructor: dimension defaults to 0 and is inherited from
/// the first index added (see addIndex).
template <typename IndexT>
ThreadedIndex<IndexT>::ThreadedIndex(bool threaded)
        // 0 is default dimension
        : ThreadedIndex(0, threaded) {}

/// Construct with an explicit dimension; own_fields defaults to false
/// (sub-indices are not deleted by us unless the caller sets it).
template <typename IndexT>
ThreadedIndex<IndexT>::ThreadedIndex(int d, bool threaded)
        : IndexT(d), own_fields(false), isThreaded_(threaded) {}
/// Stop every worker thread (flushing its pending work) and, if we own the
/// sub-indices, delete them.
template <typename IndexT>
ThreadedIndex<IndexT>::~ThreadedIndex() {
    for (auto& p : indices_) {
        if (isThreaded_) {
            // should have worker thread
            FAISS_ASSERT((bool)p.second);
            // This will also flush all pending work
            p.second->stop();
            p.second->waitForThreadExit();
        } else {
            // should not have worker thread
            FAISS_ASSERT(!(bool)p.second);
        }
        if (own_fields) {
            delete p.first;
        }
    }
}
/// Add an index to the collection: inherits our dimension if none is set
/// yet, validates dimension/metric consistency and rejects duplicates,
/// then creates a WorkerThread for it when running threaded.
template <typename IndexT>
void ThreadedIndex<IndexT>::addIndex(IndexT* index) {
    // We inherit the dimension from the first index added to us if we don't
    // have a set dimension
    if (indices_.empty() && this->d == 0) {
        this->d = index->d;
    }
    // The new index must match our set dimension
    FAISS_THROW_IF_NOT_FMT(
            this->d == index->d,
            "addIndex: dimension mismatch for "
            "newly added index; expecting dim %d, "
            "new index has dim %d",
            this->d,
            index->d);
    if (!indices_.empty()) {
        auto& existing = indices_.front().first;
        FAISS_THROW_IF_NOT_MSG(
                index->metric_type == existing->metric_type,
                "addIndex: newly added index is "
                "of different metric type than old index");
        // Make sure this index is not duplicated
        for (auto& p : indices_) {
            FAISS_THROW_IF_NOT_MSG(
                    p.first != index,
                    "addIndex: attempting to add index "
                    "that is already in the collection");
        }
    }
    // pair the index with a worker thread (or nullptr when not threaded)
    indices_.emplace_back(std::make_pair(
            index,
            std::unique_ptr<WorkerThread>(
                    isThreaded_ ? new WorkerThread : nullptr)));
    onAfterAddIndex(index);
}
/// Remove an index from the collection, first draining and stopping its
/// worker thread; deletes the index if own_fields is set. Throws if the
/// index is not in the collection.
template <typename IndexT>
void ThreadedIndex<IndexT>::removeIndex(IndexT* index) {
    for (auto it = indices_.begin(); it != indices_.end(); ++it) {
        if (it->first == index) {
            // This is our index; stop the worker thread before removing it,
            // to ensure that it has finished before function exit
            if (isThreaded_) {
                // should have worker thread
                FAISS_ASSERT((bool)it->second);
                it->second->stop();
                it->second->waitForThreadExit();
            } else {
                // should not have worker thread
                FAISS_ASSERT(!(bool)it->second);
            }
            indices_.erase(it);
            onAfterRemoveIndex(index);
            if (own_fields) {
                delete index;
            }
            return;
        }
    }
    // could not find our index
    FAISS_THROW_MSG("IndexReplicas::removeIndex: index not found");
}
/// Run f(position in collection, sub-index) on every sub-index: on its
/// worker thread when threaded (then wait for all to finish), otherwise
/// serially on the calling thread. Exceptions are collected and handled
/// after all sub-indices have run.
template <typename IndexT>
void ThreadedIndex<IndexT>::runOnIndex(std::function<void(int, IndexT*)> f) {
    if (isThreaded_) {
        std::vector<std::future<bool>> v;
        for (int i = 0; i < this->indices_.size(); ++i) {
            auto& p = this->indices_[i];
            auto indexPtr = p.first;
            v.emplace_back(
                    p.second->add([f, i, indexPtr]() { f(i, indexPtr); }));
        }
        waitAndHandleFutures(v);
    } else {
        // Multiple exceptions may be thrown; gather them as we encounter them,
        // while letting everything else run to completion
        std::vector<std::pair<int, std::exception_ptr>> exceptions;
        for (int i = 0; i < this->indices_.size(); ++i) {
            auto& p = this->indices_[i];
            try {
                f(i, p.first);
            } catch (...) {
                exceptions.emplace_back(
                        std::make_pair(i, std::current_exception()));
            }
        }
        handleExceptions(exceptions);
    }
}
/// Const overload: forwards to the non-const runOnIndex. The const_cast is
/// safe here because the wrapped functor only receives the index through a
/// const-taking callback.
template <typename IndexT>
void ThreadedIndex<IndexT>::runOnIndex(
        std::function<void(int, const IndexT*)> f) const {
    const_cast<ThreadedIndex<IndexT>*>(this)->runOnIndex(
            [f](int i, IndexT* idx) { f(i, idx); });
}

/// Reset every sub-index, then clear our own ntotal / is_trained state.
template <typename IndexT>
void ThreadedIndex<IndexT>::reset() {
    runOnIndex([](int, IndexT* index) { index->reset(); });
    this->ntotal = 0;
    this->is_trained = false;
}

/// Default post-add hook: no-op; subclasses may override.
template <typename IndexT>
void ThreadedIndex<IndexT>::onAfterAddIndex(IndexT* index) {}

/// Default post-remove hook: no-op; subclasses may override.
template <typename IndexT>
void ThreadedIndex<IndexT>::onAfterRemoveIndex(IndexT* index) {}

template <typename IndexT>
void ThreadedIndex<IndexT>::waitAndHandleFutures(
        std::vector<std::future<bool>>& v) {
    // Blocking wait for completion for all of the indices, capturing any
    // exceptions that are generated
    std::vector<std::pair<int, std::exception_ptr>> exceptions;
    for (int i = 0; i < v.size(); ++i) {
        auto& fut = v[i];
        try {
            fut.get();
        } catch (...) {
            exceptions.emplace_back(
                    std::make_pair(i, std::current_exception()));
        }
    }
    handleExceptions(exceptions);
}
} // namespace faiss

View File

@ -0,0 +1,86 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <faiss/Index.h>
#include <faiss/IndexBinary.h>
#include <faiss/utils/WorkerThread.h>
#include <memory>
#include <vector>
namespace faiss {
/// A holder of indices in a collection of threads
/// The interface to this class itself is not thread safe
template <typename IndexT>
class ThreadedIndex : public IndexT {
   public:
    explicit ThreadedIndex(bool threaded);
    explicit ThreadedIndex(int d, bool threaded);
    ~ThreadedIndex() override;
    /// Add an index that is to be managed by ourselves.
    /// WARNING: once an index is added, it becomes unsafe to touch it from any
    /// other thread than that on which is managing it, until we are shut
    /// down. Use runOnIndex to perform work on it instead.
    void addIndex(IndexT* index);
    /// Remove an index that is managed by ourselves.
    /// This will flush all pending work on that index, and then shut
    /// down its managing thread, and will remove the index.
    void removeIndex(IndexT* index);
    /// Run a function on all indices, in the thread that the index is
    /// managed in.
    /// Function arguments are (index in collection, index pointer)
    void runOnIndex(std::function<void(int, IndexT*)> f);
    void runOnIndex(std::function<void(int, const IndexT*)> f) const;
    /// faiss::Index API
    /// All indices receive the same call
    void reset() override;
    /// Returns the number of sub-indices
    int count() const {
        return indices_.size();
    }
    /// Returns the i-th sub-index
    IndexT* at(int i) {
        return indices_[i].first;
    }
    /// Returns the i-th sub-index (const version)
    const IndexT* at(int i) const {
        return indices_[i].first;
    }
    /// Whether or not we are responsible for deleting our contained indices
    bool own_fields;
   protected:
    /// Called just after an index is added
    virtual void onAfterAddIndex(IndexT* index);
    /// Called just after an index is removed
    virtual void onAfterRemoveIndex(IndexT* index);
   protected:
    /// Blocking wait on all futures, gathering exceptions for handling
    static void waitAndHandleFutures(std::vector<std::future<bool>>& v);
    /// Collection of Index instances, with their managing worker thread if any
    std::vector<std::pair<IndexT*, std::unique_ptr<WorkerThread>>> indices_;
    /// Is this index multi-threaded?
    bool isThreaded_;
};
} // namespace faiss
#include <faiss/impl/ThreadedIndex-inl.h>

145
src/3rdlib/faiss/impl/io.h Normal file
View File

@ -0,0 +1,145 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
/***********************************************************
* Abstract I/O objects
*
* I/O is always sequential, seek does not need to be supported
* (indexes could be read or written to a pipe).
***********************************************************/
#pragma once
#include <cstdio>
#include <string>
#include <vector>
#include <faiss/Index.h>
namespace faiss {

/// Abstract sequential reader (seek is not required, see file header).
struct IOReader {
    // name that can be used in error messages
    std::string name;

    // fread. Returns number of items read or 0 in case of EOF.
    virtual size_t operator()(void* ptr, size_t size, size_t nitems) = 0;

    // return a file number that can be memory-mapped
    virtual int fileno();

    virtual ~IOReader() {}
};

/// Abstract sequential writer.
struct IOWriter {
    // name that can be used in error messages
    std::string name;

    // fwrite. Return number of items written
    virtual size_t operator()(const void* ptr, size_t size, size_t nitems) = 0;

    // return a file number that can be memory-mapped
    virtual int fileno();

    // noexcept(false): a destructor of a derived writer may need to report
    // flush errors (see BufferedIOWriter, which flushes on destruction)
    virtual ~IOWriter() noexcept(false) {}
};

/// Reader over an in-memory byte buffer.
struct VectorIOReader : IOReader {
    std::vector<uint8_t> data;
    size_t rp = 0; // current read position in data
    size_t operator()(void* ptr, size_t size, size_t nitems) override;
};

/// Writer that appends to an in-memory byte buffer.
struct VectorIOWriter : IOWriter {
    std::vector<uint8_t> data;
    size_t operator()(const void* ptr, size_t size, size_t nitems) override;
};

/// Reader over a stdio FILE.
struct FileIOReader : IOReader {
    FILE* f = nullptr;
    // whether the destructor should close f (presumably set when the
    // file was opened here via the fname constructor — confirm in io.cpp)
    bool need_close = false;

    FileIOReader(FILE* rf);

    FileIOReader(const char* fname);

    ~FileIOReader() override;

    size_t operator()(void* ptr, size_t size, size_t nitems) override;

    int fileno() override;
};

/// Writer over a stdio FILE.
struct FileIOWriter : IOWriter {
    FILE* f = nullptr;
    // whether the destructor should close f (see FileIOReader::need_close)
    bool need_close = false;

    FileIOWriter(FILE* wf);

    FileIOWriter(const char* fname);

    ~FileIOWriter() override;

    size_t operator()(const void* ptr, size_t size, size_t nitems) override;

    int fileno() override;
};

/*******************************************************
 * Buffered reader + writer
 *
 * They attempt to read and write only buffers of size bsz to the
 * underlying reader or writer. This is done by splitting or merging
 * the read/write functions.
 *******************************************************/

/** wraps an ioreader to make buffered reads to avoid too small reads */
struct BufferedIOReader : IOReader {
    IOReader* reader; // underlying reader (not deleted by this object)
    size_t bsz;
    size_t ofs;    ///< offset in input stream
    size_t ofs2;   ///< number of bytes returned to caller
    size_t b0, b1; ///< range of available bytes in the buffer
    std::vector<char> buffer;

    /**
     * @param bsz    buffer size (bytes). Reads will be done by batched of
     *               this size
     */
    explicit BufferedIOReader(IOReader* reader, size_t bsz = 1024 * 1024);

    size_t operator()(void* ptr, size_t size, size_t nitems) override;
};

/** wraps an iowriter to merge small writes into bsz-sized batches */
struct BufferedIOWriter : IOWriter {
    IOWriter* writer; // underlying writer (not deleted by this object)
    size_t bsz;
    size_t ofs;
    size_t ofs2; ///< number of bytes received from caller
    size_t b0;   ///< amount of data in buffer
    std::vector<char> buffer;

    explicit BufferedIOWriter(IOWriter* writer, size_t bsz = 1024 * 1024);

    size_t operator()(const void* ptr, size_t size, size_t nitems) override;

    // flushes
    ~BufferedIOWriter() override;
};

/// cast a 4-character string to a uint32_t that can be written and read easily
uint32_t fourcc(const char sx[4]);
uint32_t fourcc(const std::string& sx);

// decoding of fourcc (int32 -> string)
void fourcc_inv(uint32_t x, char str[5]);
std::string fourcc_inv(uint32_t x);
std::string fourcc_inv_printable(uint32_t x);

} // namespace faiss

View File

@ -0,0 +1,68 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
/*************************************************************
* I/O macros
*
* we use macros so that we have a line number to report in abort
* (). This makes debugging a lot easier. The IOReader or IOWriter is
* always called f and thus is not passed in as a macro parameter.
**************************************************************/
// Read n items of sizeof(*(ptr)) into ptr via the IOReader named `f`
// at the expansion site; throws (with this file/line) on short reads.
#define READANDCHECK(ptr, n)                         \
    {                                                \
        size_t ret = (*f)(ptr, sizeof(*(ptr)), n);   \
        FAISS_THROW_IF_NOT_FMT(                      \
                ret == (n),                          \
                "read error in %s: %zd != %zd (%s)", \
                f->name.c_str(),                     \
                ret,                                 \
                size_t(n),                           \
                strerror(errno));                    \
    }

#define READ1(x) READANDCHECK(&(x), 1)

// will fail if we write 256G of data at once...
#define READVECTOR(vec)                                              \
    {                                                                \
        size_t size;                                                 \
        READANDCHECK(&size, 1);                                      \
        FAISS_THROW_IF_NOT(size >= 0 && size < (uint64_t{1} << 40)); \
        (vec).resize(size);                                          \
        READANDCHECK((vec).data(), size);                            \
    }

// NOTE(review): despite its name, this macro WRITES the string (length
// followed by the bytes) via WRITEANDCHECK — it does not read. It looks
// like a misnamed or mangled copy of upstream faiss's WRITESTRING;
// confirm against upstream and against all call sites before using it
// on a read path. (Referencing WRITEANDCHECK before its definition is
// fine: macros are resolved at the expansion site.)
#define READSTRING(s)                     \
    {                                     \
        size_t size = (s).size();         \
        WRITEANDCHECK(&size, 1);          \
        WRITEANDCHECK((s).c_str(), size); \
    }

// Write n items of sizeof(*(ptr)) from ptr via the IOWriter named `f`
// at the expansion site; throws (with this file/line) on short writes.
#define WRITEANDCHECK(ptr, n)                         \
    {                                                 \
        size_t ret = (*f)(ptr, sizeof(*(ptr)), n);    \
        FAISS_THROW_IF_NOT_FMT(                       \
                ret == (n),                           \
                "write error in %s: %zd != %zd (%s)", \
                f->name.c_str(),                      \
                ret,                                  \
                size_t(n),                            \
                strerror(errno));                     \
    }

#define WRITE1(x) WRITEANDCHECK(&(x), 1)

// Serializes a vector as: element count (size_t) then raw element bytes.
#define WRITEVECTOR(vec)                   \
    {                                      \
        size_t size = (vec).size();        \
        WRITEANDCHECK(&size, 1);           \
        WRITEANDCHECK((vec).data(), size); \
    }

View File

@ -0,0 +1,188 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_LATTICE_ZN_H
#define FAISS_LATTICE_ZN_H
#include <stddef.h>
#include <stdint.h>
#include <vector>
namespace faiss {

/** returns the nearest vertex in the sphere to a query. Returns only
 * the coordinates, not an id.
 *
 * Algorithm: all points are derived from a one atom vector up to a
 * permutation and sign changes. The search function finds the most
 * appropriate atom and transformation.
 */
struct ZnSphereSearch {
    int dimS, r2; // sphere parameters, as passed to the constructor
    int natom;    // number of atoms

    /// size dim * natom
    std::vector<float> voc;

    ZnSphereSearch(int dim, int r2);

    /// find nearest centroid. x does not need to be normalized
    float search(const float* x, float* c) const;

    /// full call. Requires externally-allocated temp space
    float search(
            const float* x,
            float* c,
            float* tmp,   // size 2 *dim
            int* tmp_int, // size dim
            int* ibest_out = nullptr) const;

    // multi-threaded
    void search_multi(int n, const float* x, float* c_out, float* dp_out);
};

/***************************************************************************
 * Support ids as well.
 *
 * Limitations: ids are limited to 64 bit
 ***************************************************************************/

/// Abstract enumerable collection of vectors that can be mapped to and
/// from 64-bit codes.
struct EnumeratedVectors {
    /// size of the collection
    uint64_t nv;
    int dim;

    explicit EnumeratedVectors(int dim) : nv(0), dim(dim) {}

    /// encode a vector from a collection
    virtual uint64_t encode(const float* x) const = 0;

    /// decode it
    virtual void decode(uint64_t code, float* c) const = 0;

    // call encode on nc vectors
    void encode_multi(size_t nc, const float* c, uint64_t* codes) const;

    // call decode on nc codes
    void decode_multi(size_t nc, const uint64_t* codes, float* c) const;

    // find the nearest neighbor of each xq
    // (decodes and computes distances)
    void find_nn(
            size_t n,
            const uint64_t* codes,
            size_t nq,
            const float* xq,
            int64_t* idx,
            float* dis);

    virtual ~EnumeratedVectors() {}
};

/// a value repeated n times in a vector
struct Repeat {
    float val;
    int n;
};

/** Repeats: used to encode a vector that has n occurrences of
 * val. Encodes the signs and permutation of the vector. Useful for
 * atoms.
 */
struct Repeats {
    int dim;
    std::vector<Repeat> repeats;

    // initialize from a template of the atom.
    Repeats(int dim = 0, const float* c = nullptr);

    // count number of possible codes for this atom
    uint64_t count() const;

    uint64_t encode(const float* c) const;

    void decode(uint64_t code, float* c) const;
};

/** codec that can return ids for the encoded vectors
 *
 * uses the ZnSphereSearch to encode the vector by encoding the
 * permutation and signs. Depends on ZnSphereSearch because it uses
 * the atom numbers */
struct ZnSphereCodec : ZnSphereSearch, EnumeratedVectors {
    /// per-atom encoding information
    struct CodeSegment : Repeats {
        explicit CodeSegment(const Repeats& r) : Repeats(r) {}
        uint64_t c0; // first code assigned to segment
        int signbits;
    };

    std::vector<CodeSegment> code_segments;
    // NOTE(review): shadows EnumeratedVectors::nv — confirm this is intended
    uint64_t nv;
    size_t code_size;

    ZnSphereCodec(int dim, int r2);

    uint64_t search_and_encode(const float* x) const;

    void decode(uint64_t code, float* c) const override;

    /// takes vectors that do not need to be centroids
    uint64_t encode(const float* x) const override;
};

/** recursive sphere codec
 *
 * Uses a recursive decomposition on the dimensions to encode
 * centroids found by the ZnSphereSearch. The codes are *not*
 * compatible with the ones of ZnSphereCodec
 */
struct ZnSphereCodecRec : EnumeratedVectors {
    int r2;

    int log2_dim;
    int code_size;

    ZnSphereCodecRec(int dim, int r2);

    uint64_t encode_centroid(const float* c) const;

    void decode(uint64_t code, float* c) const override;

    /// vectors need to be centroids (does not work on arbitrary
    /// vectors)
    uint64_t encode(const float* x) const override;

    std::vector<uint64_t> all_nv;
    std::vector<uint64_t> all_nv_cum;

    int decode_cache_ld;
    std::vector<std::vector<float>> decode_cache;

    // nb of vectors in the sphere in dim 2^ld with r2 radius
    uint64_t get_nv(int ld, int r2a) const;

    // cumulative version
    uint64_t get_nv_cum(int ld, int r2t, int r2a) const;
    void set_nv_cum(int ld, int r2t, int r2a, uint64_t v);
};

/** Codec that uses the recursive codec if dim is a power of 2 and
 * the regular one otherwise */
struct ZnSphereCodecAlt : ZnSphereCodec {
    bool use_rec;
    ZnSphereCodecRec znc_rec;

    ZnSphereCodecAlt(int dim, int r2);

    uint64_t encode(const float* x) const override;

    void decode(uint64_t code, float* c) const override;
};

} // namespace faiss
#endif

View File

@ -0,0 +1,89 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#ifdef _MSC_VER

/*******************************************************
 * Windows specific macros
 *******************************************************/

#ifdef FAISS_MAIN_LIB
#define FAISS_API __declspec(dllexport)
#else // _FAISS_MAIN_LIB
#define FAISS_API __declspec(dllimport)
#endif // FAISS_MAIN_LIB

// MSVC has no __PRETTY_FUNCTION__; __FUNCSIG__ is the closest equivalent
#define __PRETTY_FUNCTION__ __FUNCSIG__

// emulate POSIX aligned allocation on top of the MSVC CRT; note the
// matching free must go through posix_memalign_free, not free()
#define posix_memalign(p, a, s) \
    (((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno)
#define posix_memalign_free _aligned_free

// aligned should be in front of the declaration
#define ALIGNED(x) __declspec(align(x))

// redefine the GCC intrinsics with Windows equivalents
#include <intrin.h>

// like the GCC builtin, the result is undefined for x == 0
// (_BitScanForward64 does not set ret in that case)
inline int __builtin_ctzll(uint64_t x) {
    unsigned long ret;
    _BitScanForward64(&ret, x);
    return (int)ret;
}

// cudatoolkit provides __builtin_ctz for NVCC >= 11.0
#if !defined(__CUDACC__) || __CUDACC_VER_MAJOR__ < 11
inline int __builtin_ctz(unsigned long x) {
    unsigned long ret;
    _BitScanForward(&ret, x);
    return (int)ret;
}
#endif

inline int __builtin_clzll(uint64_t x) {
    return (int)__lzcnt64(x);
}

#define __builtin_popcount __popcnt
#define __builtin_popcountl __popcnt64

// MSVC does not define __SSEx__, and _M_IX86_FP is only defined on 32-bit
// processors cf.
// https://docs.microsoft.com/en-us/cpp/preprocessor/predefined-macros
#ifdef __AVX__
#define __SSE__ 1
#define __SSE2__ 1
#define __SSE3__ 1
#define __SSE4_1__ 1
#define __SSE4_2__ 1
#endif

// MSVC sets FMA and F16C automatically when using AVX2
// Ref. FMA (under /arch:AVX2):
// https://docs.microsoft.com/en-us/cpp/build/reference/arch-x64 Ref. F16C (2nd
// paragraph): https://walbourn.github.io/directxmath-avx2/
#ifdef __AVX2__
#define __FMA__ 1
#define __F16C__ 1
#endif

#else
/*******************************************************
 * Linux and OSX
 *******************************************************/

#define FAISS_API
#define posix_memalign_free free

// aligned should be *in front* of the declaration, for compatibility with
// windows
#define ALIGNED(x) __attribute__((aligned(x)))

#endif // _MSC_VER

View File

@ -0,0 +1,160 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <cstdint>
#include <cstdlib>
/** PQ4 SIMD packing and accumulation functions
*
* The basic kernel accumulates nq query vectors with bbs = nb * 2 * 16 vectors
* and produces an output matrix for that. It is interesting for nq * nb <= 4,
* otherwise register spilling becomes too large.
*
* The implementation of these functions is spread over 3 cpp files to reduce
* parallel compile times. Templates are instantiated explicitly.
*/
namespace faiss {

/** Pack codes for consumption by the SIMD kernels.
 * The unused bytes are set to 0.
 *
 * @param codes   input codes, size (ntotal, ceil(M / 2))
 * @param ntotal  number of input codes
 * @param M       number of sub-quantizers of the input codes
 * @param nb      output number of codes (ntotal rounded up to a multiple of
 *                bbs)
 * @param bbs     size of database blocks (multiple of 32)
 * @param M2      number of sub-quantizers (=M rounded up to a multiple of 2)
 * @param blocks  output array, size nb * nsq / 2.
 */
void pq4_pack_codes(
        const uint8_t* codes,
        size_t ntotal,
        size_t M,
        size_t nb,
        size_t bbs,
        size_t M2,
        uint8_t* blocks);

/** Same as pack_codes but write in a given range of the output,
 * leaving the rest untouched. Assumes allocated entries are 0 on input.
 *
 * @param codes   input codes, size (i1 - i0, ceil(M / 2))
 * @param i0      first output code to write
 * @param i1      last output code to write
 * @param blocks  output array, size at least ceil(i1 / bbs) * bbs * nsq / 2
 */
void pq4_pack_codes_range(
        const uint8_t* codes,
        size_t M,
        size_t i0,
        size_t i1,
        size_t bbs,
        size_t M2,
        uint8_t* blocks);

/** get a single element from a packed codes table
 *
 * @param i   vector id
 * @param sq  subquantizer (< nsq)
 */
uint8_t pq4_get_packed_element(
        const uint8_t* data,
        size_t bbs,
        size_t nsq,
        size_t i,
        size_t sq);

/** Pack Look-up table for consumption by the kernel.
 *
 * @param nq    number of queries
 * @param nsq   number of sub-quantizers (multiple of 2)
 * @param src   input array, size (nq, 16)
 * @param dest  output array, size (nq, 16)
 */
void pq4_pack_LUT(int nq, int nsq, const uint8_t* src, uint8_t* dest);

/** Loop over database elements and accumulate results into result handler
 *
 * @param nq     number of queries
 * @param nb     number of database elements
 * @param bbs    size of database blocks (multiple of 32)
 * @param nsq    number of sub-quantizers (multiple of 2)
 * @param codes  packed codes array
 * @param LUT    packed look-up table
 * @param res    result handler (see simd_result_handlers.h)
 */
template <class ResultHandler>
void pq4_accumulate_loop(
        int nq,
        size_t nb,
        int bbs,
        int nsq,
        const uint8_t* codes,
        const uint8_t* LUT,
        ResultHandler& res);

/* qbs versions, supported only for bbs=32.
 *
 * The kernel function runs the kernel for *several* query blocks
 * and bbs database vectors. The sizes of the blocks are encoded in qbs as
 * base-16 digits.
 *
 * For example, qbs = 0x1223 means that the kernel will be run 4 times, the
 * first time with 3 query vectors, second time with 2 query vectors, then 2
 * vectors again and finally with 1 query vector. The output block will thus be
 * nq = 3 + 2 + 2 + 1 = 6 queries. For a given total block size, the optimal
 * decomposition into sub-blocks (measured empirically) is given by
 * preferred_qbs().
 */

/* compute the number of queries from a base-16 decomposition */
int pq4_qbs_to_nq(int qbs);

/** return the preferred decomposition in blocks for a nb of queries. */
int pq4_preferred_qbs(int nq);

/** Pack Look-up table for consumption by the kernel.
 *
 * @param fqbs  4-bit encoded number of query blocks, the total number of
 *              queries handled (nq) is deduced from it
 * @param nsq   number of sub-quantizers (multiple of 2)
 * @param src   input array, size (nq, 16)
 * @param dest  output array, size (nq, 16)
 * @return nq
 */
int pq4_pack_LUT_qbs(int fqbs, int nsq, const uint8_t* src, uint8_t* dest);

/** Same as pq4_pack_LUT_qbs, except the source vectors are remapped with q_map
 */
int pq4_pack_LUT_qbs_q_map(
        int qbs,
        int nsq,
        const uint8_t* src,
        const int* q_map,
        uint8_t* dest);

/** Run accumulation loop.
 *
 * @param qbs    4-bit encoded number of queries
 * @param nb     number of database codes (multiple of bbs)
 * @param nsq    number of sub-quantizers
 * @param codes  encoded database vectors (packed)
 * @param LUT    look-up table (packed)
 * @param res    call-back for the results
 */
template <class ResultHandler>
void pq4_accumulate_loop_qbs(
        int qbs,
        size_t nb,
        int nsq,
        const uint8_t* codes,
        const uint8_t* LUT,
        ResultHandler& res);

} // namespace faiss

View File

@ -0,0 +1,531 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <algorithm>
#include <type_traits>
#include <vector>
#include <faiss/utils/Heap.h>
#include <faiss/utils/simdlib.h>
#include <faiss/impl/platform_macros.h>
#include <faiss/utils/AlignedTable.h>
#include <faiss/utils/partitioning.h>
/** This file contains callbacks for kernels that compute distances.
*
* The SIMDResultHandler object is intended to be templated and inlined.
* Methods:
* - handle(): called when 32 distances are computed and provided in two
* simd16uint16. (q, b) indicate which entry it is in the block.
* - set_block_origin(): set the sub-matrix that is being computed
*/
namespace faiss {
namespace simd_result_handlers {
/** Dummy structure that just computes a checksum on results
* (to avoid the computation to be optimized away) */
struct DummyResultHandler {
    size_t cs = 0;

    // Fold the block coordinates and the first scalar of each SIMD
    // register into the running checksum.
    void handle(size_t q, size_t b, simd16uint16 d0, simd16uint16 d1) {
        size_t contribution = q * 123;
        contribution += b * 789;
        contribution += d0.get_scalar_0();
        contribution += d1.get_scalar_0();
        cs += contribution;
    }

    // Block origins do not affect the checksum.
    void set_block_origin(size_t, size_t) {}
};
/** memorize results in a nq-by-nb matrix.
*
* j0 is the current upper-left block of the matrix
*/
struct StoreResultHandler {
    uint16_t* data;
    size_t ld; // total number of columns
    size_t i0 = 0;
    size_t j0 = 0;

    StoreResultHandler(uint16_t* data, size_t ld) : data(data), ld(ld) {}

    // Store the 32 distances of block (q, b) into row q + i0 of the
    // output matrix, starting at column j0 + b * 32.
    void handle(size_t q, size_t b, simd16uint16 d0, simd16uint16 d1) {
        const size_t row = q + i0;
        const size_t col = j0 + b * 32;
        uint16_t* dst = data + row * ld + col;
        d0.store(dst);
        d1.store(dst + 16);
    }

    // Record the upper-left corner of the sub-matrix being computed.
    void set_block_origin(size_t new_i0, size_t new_j0) {
        i0 = new_i0;
        j0 = new_j0;
    }
};
/** stores results in fixed-size matrix. */
template <int NQ, int BB>
struct FixedStorageHandler {
    simd16uint16 dis[NQ][BB]; // NQ queries x BB SIMD registers of distances
    int i0 = 0;               // query offset of the current block

    void handle(int q, int b, simd16uint16 d0, simd16uint16 d1) {
        dis[q + i0][2 * b] = d0;
        dis[q + i0][2 * b + 1] = d1;
    }

    void set_block_origin(size_t i0, size_t j0) {
        this->i0 = i0;
        // only a single column of blocks is supported (j0 must stay 0)
        assert(j0 == 0);
    }

    /// replay all stored results into another handler
    template <class OtherResultHandler>
    void to_other_handler(OtherResultHandler& other) const {
        for (int q = 0; q < NQ; q++) {
            for (int b = 0; b < BB; b += 2) {
                other.handle(q, b / 2, dis[q][b], dis[q][b + 1]);
            }
        }
    }
};
/** Record origin of current block */
template <class C, bool with_id_map>
struct SIMDResultHandler {
    using TI = typename C::TI;

    bool disable = false; // when true, subclasses' handle() becomes a no-op

    int64_t i0 = 0; // query origin
    int64_t j0 = 0; // db origin
    size_t ntotal;  // ignore excess elements after ntotal

    /// these fields are used mainly for the IVF variants (with_id_map=true)
    const TI* id_map;      // map offset in invlist to vector id
    const int* q_map;      // map q to global query
    const uint16_t* dbias; // table of biases to add to each query

    explicit SIMDResultHandler(size_t ntotal)
            : ntotal(ntotal), id_map(nullptr), q_map(nullptr), dbias(nullptr) {}

    void set_block_origin(size_t i0, size_t j0) {
        this->i0 = i0;
        this->j0 = j0;
    }

    // adjust handler data for IVF: offset q, add the per-query bias to
    // both distance registers, and remap q to the global query id
    void adjust_with_origin(size_t& q, simd16uint16& d0, simd16uint16& d1) {
        q += i0;

        if (dbias) {
            simd16uint16 dbias16(dbias[q]);
            d0 += dbias16;
            d1 += dbias16;
        }

        if (with_id_map) { // FIXME test on q_map instead
            q = q_map[q];
        }
    }

    // compute and adjust idx: j-th lane of block b, remapped for IVF
    int64_t adjust_id(size_t b, size_t j) {
        int64_t idx = j0 + 32 * b + j;
        if (with_id_map) {
            idx = id_map[idx];
        }
        return idx;
    }

    /// return binary mask of elements below thr in (d0, d1)
    /// inverse_test returns elements above
    uint32_t get_lt_mask(
            uint16_t thr,
            size_t b,
            simd16uint16 d0,
            simd16uint16 d1) {
        simd16uint16 thr16(thr);
        uint32_t lt_mask;

        constexpr bool keep_min = C::is_max;
        if (keep_min) {
            lt_mask = ~cmp_ge32(d0, d1, thr16);
        } else {
            lt_mask = ~cmp_le32(d0, d1, thr16);
        }

        if (lt_mask == 0) {
            return 0;
        }

        // clear lanes that fall beyond ntotal (last partial block)
        uint64_t idx = j0 + b * 32;
        if (idx + 32 > ntotal) {
            if (idx >= ntotal) {
                return 0;
            }
            int nbit = (ntotal - idx);
            lt_mask &= (uint32_t(1) << nbit) - 1;
        }
        return lt_mask;
    }

    /// move accumulated uint16 results into the final float distance /
    /// int64 label arrays, optionally de-normalizing with `normalizers`
    virtual void to_flat_arrays(
            float* distances,
            int64_t* labels,
            const float* normalizers = nullptr) = 0;

    virtual ~SIMDResultHandler() {}
};
/** Special version for k=1 */
template <class C, bool with_id_map = false>
struct SingleResultHandler : SIMDResultHandler<C, with_id_map> {
    using T = typename C::T;
    using TI = typename C::TI;

    /// best (distance, id) pair seen so far for one query
    struct Result {
        T val;
        TI id;
    };
    std::vector<Result> results;

    SingleResultHandler(size_t nq, size_t ntotal)
            : SIMDResultHandler<C, with_id_map>(ntotal), results(nq) {
        for (int i = 0; i < nq; i++) {
            Result res = {C::neutral(), -1};
            results[i] = res;
        }
    }

    void handle(size_t q, size_t b, simd16uint16 d0, simd16uint16 d1) {
        if (this->disable) {
            return;
        }

        this->adjust_with_origin(q, d0, d1);

        Result& res = results[q];

        uint32_t lt_mask = this->get_lt_mask(res.val, b, d0, d1);
        if (!lt_mask) {
            return;
        }

        // spill the two SIMD registers so scalar code can index the lanes
        ALIGNED(32) uint16_t d32tab[32];
        d0.store(d32tab);
        d1.store(d32tab + 16);

        while (lt_mask) {
            // find first non-zero
            int j = __builtin_ctz(lt_mask);
            lt_mask -= 1 << j;
            T dis = d32tab[j];
            if (C::cmp(res.val, dis)) {
                res.val = dis;
                res.id = this->adjust_id(b, j);
            }
        }
    }

    void to_flat_arrays(
            float* distances,
            int64_t* labels,
            const float* normalizers = nullptr) override {
        for (int q = 0; q < results.size(); q++) {
            if (!normalizers) {
                distances[q] = results[q].val;
            } else {
                // undo the uint16 quantization: dis = b + val / a
                float one_a = 1 / normalizers[2 * q];
                float b = normalizers[2 * q + 1];
                distances[q] = b + results[q].val * one_a;
            }
            labels[q] = results[q].id;
        }
    }
};
/** Structure that collects results in a min- or max-heap */
template <class C, bool with_id_map = false>
struct HeapHandler : SIMDResultHandler<C, with_id_map> {
    using T = typename C::T;
    using TI = typename C::TI;

    int nq;
    T* heap_dis_tab;  // nq heaps of k distances (externally allocated)
    TI* heap_ids_tab; // nq heaps of k ids (externally allocated)

    int64_t k; // number of results to keep

    HeapHandler(
            int nq,
            T* heap_dis_tab,
            TI* heap_ids_tab,
            size_t k,
            size_t ntotal)
            : SIMDResultHandler<C, with_id_map>(ntotal),
              nq(nq),
              heap_dis_tab(heap_dis_tab),
              heap_ids_tab(heap_ids_tab),
              k(k) {
        for (int q = 0; q < nq; q++) {
            T* heap_dis_in = heap_dis_tab + q * k;
            TI* heap_ids_in = heap_ids_tab + q * k;
            heap_heapify<C>(k, heap_dis_in, heap_ids_in);
        }
    }

    void handle(size_t q, size_t b, simd16uint16 d0, simd16uint16 d1) {
        if (this->disable) {
            return;
        }

        this->adjust_with_origin(q, d0, d1);

        T* heap_dis = heap_dis_tab + q * k;
        TI* heap_ids = heap_ids_tab + q * k;

        // saturate the heap's current worst value to the uint16 range
        uint16_t cur_thresh =
                heap_dis[0] < 65536 ? (uint16_t)(heap_dis[0]) : 0xffff;

        // here we handle the reverse comparison case as well
        uint32_t lt_mask = this->get_lt_mask(cur_thresh, b, d0, d1);
        if (!lt_mask) {
            return;
        }

        ALIGNED(32) uint16_t d32tab[32];
        d0.store(d32tab);
        d1.store(d32tab + 16);

        while (lt_mask) {
            // find first non-zero
            int j = __builtin_ctz(lt_mask);
            lt_mask -= 1 << j;
            T dis = d32tab[j];
            if (C::cmp(heap_dis[0], dis)) {
                int64_t idx = this->adjust_id(b, j);
                // replace the current worst entry with the new candidate
                heap_pop<C>(k, heap_dis, heap_ids);
                heap_push<C>(k, heap_dis, heap_ids, dis, idx);
            }
        }
    }

    void to_flat_arrays(
            float* distances,
            int64_t* labels,
            const float* normalizers = nullptr) override {
        for (int q = 0; q < nq; q++) {
            T* heap_dis_in = heap_dis_tab + q * k;
            TI* heap_ids_in = heap_ids_tab + q * k;

            // sort the heap contents into increasing heap order
            heap_reorder<C>(k, heap_dis_in, heap_ids_in);
            int64_t* heap_ids = labels + q * k;
            float* heap_dis = distances + q * k;

            float one_a = 1.0, b = 0.0;
            if (normalizers) {
                one_a = 1 / normalizers[2 * q];
                b = normalizers[2 * q + 1];
            }
            for (int j = 0; j < k; j++) {
                heap_ids[j] = heap_ids_in[j];
                heap_dis[j] = heap_dis_in[j] * one_a + b;
            }
        }
    }
};
/** Simple top-N implementation using a reservoir.
*
* Results are stored when they are below the threshold until the capacity is
* reached. Then a partition sort is used to update the threshold. */
namespace {

/// Read the CPU timestamp counter (x86 rdtsc) when MICRO_BENCHMARK is
/// defined; otherwise a constant 0 so the timing code optimizes away.
/// NOTE(review): anonymous namespace in a header gives each translation
/// unit its own copy; harmless here, but `inline` would be conventional.
uint64_t get_cy() {
#ifdef MICRO_BENCHMARK
    uint32_t high, low;
    asm volatile("rdtsc \n\t" : "=a"(low), "=d"(high));
    return ((uint64_t)high << 32) | (low);
#else
    return 0;
#endif
}

} // anonymous namespace
template <class C>
struct ReservoirTopN {
    using T = typename C::T;
    using TI = typename C::TI;

    T* vals; // externally-allocated storage, size `capacity`
    TI* ids; // externally-allocated storage, size `capacity`

    size_t i;        // number of stored elements
    size_t n;        // number of requested elements
    size_t capacity; // size of storage

    size_t cycles = 0; // accumulated cycle count spent in shrink calls

    T threshold; // current threshold

    ReservoirTopN(size_t n, size_t capacity, T* vals, TI* ids)
            : vals(vals), ids(ids), i(0), n(n), capacity(capacity) {
        assert(n < capacity);
        threshold = C::neutral();
    }

    /// store (val, id) if it beats the threshold, shrinking first when
    /// the reservoir is full
    void add(T val, TI id) {
        if (C::cmp(threshold, val)) {
            if (i == capacity) {
                shrink_fuzzy();
            }
            vals[i] = val;
            ids[i] = id;
            i++;
        }
    }

    /// shrink number of stored elements to n
    void shrink_xx() {
        uint64_t t0 = get_cy();
        qselect(vals, ids, i, n);
        i = n; // forget all elements above i = n

        threshold = C::Crev::neutral();
        for (size_t j = 0; j < n; j++) {
            if (C::cmp(vals[j], threshold)) {
                threshold = vals[j];
            }
        }
        cycles += get_cy() - t0;
    }

    /// exact shrink via partition<C>; also updates the threshold
    void shrink() {
        uint64_t t0 = get_cy();
        threshold = partition<C>(vals, ids, i, n);
        i = n;
        cycles += get_cy() - t0;
    }

    /// approximate shrink via partition_fuzzy; updates i in place
    void shrink_fuzzy() {
        uint64_t t0 = get_cy();
        assert(i == capacity);
        threshold = partition_fuzzy<C>(
                vals, ids, capacity, n, (capacity + n) / 2, &i);
        cycles += get_cy() - t0;
    }
};
/** Handler built from several ReservoirTopN (one per query) */
template <class C, bool with_id_map = false>
struct ReservoirHandler : SIMDResultHandler<C, with_id_map> {
    using T = typename C::T;
    using TI = typename C::TI;

    size_t capacity; // rounded up to multiple of 16
    std::vector<TI> all_ids;
    AlignedTable<T> all_vals;

    std::vector<ReservoirTopN<C>> reservoirs; // one reservoir per query

    uint64_t times[4]; // cycle counters (nonzero only with MICRO_BENCHMARK)

    ReservoirHandler(size_t nq, size_t ntotal, size_t n, size_t capacity_in)
            : SIMDResultHandler<C, with_id_map>(ntotal),
              capacity((capacity_in + 15) & ~15),
              all_ids(nq * capacity),
              all_vals(nq * capacity) {
        assert(capacity % 16 == 0);
        // carve per-query slices out of the shared storage
        for (size_t i = 0; i < nq; i++) {
            reservoirs.emplace_back(
                    n,
                    capacity,
                    all_vals.get() + i * capacity,
                    all_ids.data() + i * capacity);
        }
        times[0] = times[1] = times[2] = times[3] = 0;
    }

    void handle(size_t q, size_t b, simd16uint16 d0, simd16uint16 d1) {
        uint64_t t0 = get_cy();
        if (this->disable) {
            return;
        }
        this->adjust_with_origin(q, d0, d1);

        ReservoirTopN<C>& res = reservoirs[q];

        uint32_t lt_mask = this->get_lt_mask(res.threshold, b, d0, d1);

        uint64_t t1 = get_cy();
        times[0] += t1 - t0;

        if (!lt_mask) {
            return;
        }
        // spill the SIMD registers so scalar code can index the lanes
        ALIGNED(32) uint16_t d32tab[32];
        d0.store(d32tab);
        d1.store(d32tab + 16);

        while (lt_mask) {
            // find first non-zero
            int j = __builtin_ctz(lt_mask);
            lt_mask -= 1 << j;
            T dis = d32tab[j];
            res.add(dis, this->adjust_id(b, j));
        }
        times[1] += get_cy() - t1;
    }

    void to_flat_arrays(
            float* distances,
            int64_t* labels,
            const float* normalizers = nullptr) override {
        // Cf keeps the same min/max direction as C but on (float, int64)
        using Cf = typename std::conditional<
                C::is_max,
                CMax<float, int64_t>,
                CMin<float, int64_t>>::type;

        uint64_t t0 = get_cy();
        uint64_t t3 = 0;
        std::vector<int> perm(reservoirs[0].n);
        for (int q = 0; q < reservoirs.size(); q++) {
            ReservoirTopN<C>& res = reservoirs[q];
            size_t n = res.n;

            if (res.i > res.n) {
                res.shrink();
            }
            int64_t* heap_ids = labels + q * n;
            float* heap_dis = distances + q * n;

            float one_a = 1.0, b = 0.0;
            if (normalizers) {
                one_a = 1 / normalizers[2 * q];
                b = normalizers[2 * q + 1];
            }
            for (int i = 0; i < res.i; i++) {
                perm[i] = i;
            }
            // indirect sort of result arrays
            std::sort(perm.begin(), perm.begin() + res.i, [&res](int i, int j) {
                return C::cmp(res.vals[j], res.vals[i]);
            });
            for (int i = 0; i < res.i; i++) {
                heap_dis[i] = res.vals[perm[i]] * one_a + b;
                heap_ids[i] = res.ids[perm[i]];
            }

            // possibly add empty results
            heap_heapify<Cf>(n - res.i, heap_dis + res.i, heap_ids + res.i);

            t3 += res.cycles;
        }
        times[2] += get_cy() - t0;
        times[3] += t3;
    }
};
} // namespace simd_result_handlers
} // namespace faiss

View File

@ -0,0 +1,24 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <faiss/Index.h>
#include <faiss/IndexBinary.h>
namespace faiss {

/** Build an index with the sequence of processing steps described in
 * the description string (see the faiss index-factory grammar).
 * @param d            dimensionality of the input vectors
 * @param description  factory string describing the index structure
 * @param metric       distance type used for comparisons
 * @return newly allocated index; the caller owns it */
Index* index_factory(
        int d,
        const char* description,
        MetricType metric = METRIC_L2);

/// same, for binary indexes
IndexBinary* index_binary_factory(int d, const char* description);

} // namespace faiss

View File

@ -0,0 +1,79 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
// I/O code for indexes
#ifndef FAISS_INDEX_IO_H
#define FAISS_INDEX_IO_H
#include <cstdio>
#include <string>
#include <typeinfo>
#include <vector>
/** I/O functions can read/write to a filename, a file handle or to an
* object that abstracts the medium.
*
* The read functions return objects that should be deallocated with
* delete. All references within these objects are owned by the
* object.
*/
namespace faiss {

// forward declarations of the serialized types
struct Index;
struct IndexBinary;
struct VectorTransform;
struct ProductQuantizer;
struct IOReader;
struct IOWriter;
struct InvertedLists;

/// write a (float) Index to a file name / stdio stream / abstract writer
void write_index(const Index* idx, const char* fname);
void write_index(const Index* idx, FILE* f);
void write_index(const Index* idx, IOWriter* writer);

void write_index_binary(const IndexBinary* idx, const char* fname);
void write_index_binary(const IndexBinary* idx, FILE* f);
void write_index_binary(const IndexBinary* idx, IOWriter* writer);

// The read_index flags are implemented only for a subset of index types.
const int IO_FLAG_READ_ONLY = 2;
// strip directory component from ondisk filename, and assume it's in
// the same directory as the index file
const int IO_FLAG_ONDISK_SAME_DIR = 4;
// don't load IVF data to RAM, only list sizes
const int IO_FLAG_SKIP_IVF_DATA = 8;
// try to memmap data (useful to load an ArrayInvertedLists as an
// OnDiskInvertedLists)
const int IO_FLAG_MMAP = IO_FLAG_SKIP_IVF_DATA | 0x646f0000;

/// read an Index; the caller owns the result (deallocate with delete,
/// see the file header)
Index* read_index(const char* fname, int io_flags = 0);
Index* read_index(FILE* f, int io_flags = 0);
Index* read_index(IOReader* reader, int io_flags = 0);

IndexBinary* read_index_binary(const char* fname, int io_flags = 0);
IndexBinary* read_index_binary(FILE* f, int io_flags = 0);
IndexBinary* read_index_binary(IOReader* reader, int io_flags = 0);

void write_VectorTransform(const VectorTransform* vt, const char* fname);
VectorTransform* read_VectorTransform(const char* fname);

ProductQuantizer* read_ProductQuantizer(const char* fname);
ProductQuantizer* read_ProductQuantizer(IOReader* reader);
void write_ProductQuantizer(const ProductQuantizer* pq, const char* fname);
void write_ProductQuantizer(const ProductQuantizer* pq, IOWriter* f);

void write_InvertedLists(const InvertedLists* ils, IOWriter* f);
InvertedLists* read_InvertedLists(IOReader* reader, int io_flags = 0);

} // namespace faiss
#endif

View File

@ -0,0 +1,74 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <faiss/index_io.h>
#include <faiss/invlists/InvertedLists.h>
#include <faiss/invlists/InvertedListsIOHook.h>
#include <faiss/utils/AlignedTable.h>
namespace faiss {
/** Inverted Lists that are organized by blocks.
*
* Different from the regular inverted lists, the codes are organized by blocks
 * of size block_size bytes that represent a set of n_per_block. Therefore, code
* allocations are always rounded up to block_size bytes. The codes are also
* aligned on 32-byte boundaries for use with SIMD.
*
* To avoid misinterpretations, the code_size is set to (size_t)(-1), even if
* arguably the amount of memory consumed by code is block_size / n_per_block.
*
* The writing functions add_entries and update_entries operate on block-aligned
* data.
*/
struct BlockInvertedLists : InvertedLists {
    size_t n_per_block; // nb of vectors stored per block
    size_t block_size;  // nb bytes per block

    // per-list packed codes; AlignedTable keeps them 32-byte aligned for SIMD
    std::vector<AlignedTable<uint8_t>> codes;
    std::vector<std::vector<idx_t>> ids;

    BlockInvertedLists(size_t nlist, size_t vec_per_block, size_t block_size);
    BlockInvertedLists();

    size_t list_size(size_t list_no) const override;
    const uint8_t* get_codes(size_t list_no) const override;
    const idx_t* get_ids(size_t list_no) const override;

    // works only on empty BlockInvertedLists
    // the codes should be of size ceil(n_entry / n_per_block) * block_size
    // and padded with 0s
    size_t add_entries(
            size_t list_no,
            size_t n_entry,
            const idx_t* ids,
            const uint8_t* code) override;

    /// not implemented
    void update_entries(
            size_t list_no,
            size_t offset,
            size_t n_entry,
            const idx_t* ids,
            const uint8_t* code) override;

    // also pads new data with 0s
    void resize(size_t list_no, size_t new_size) override;

    ~BlockInvertedLists() override;
};
/// I/O hook that (de)serializes BlockInvertedLists objects
/// (see InvertedListsIOHook for the registration mechanism)
struct BlockInvertedListsIOHook : InvertedListsIOHook {
    BlockInvertedListsIOHook();
    void write(const InvertedLists* ils, IOWriter* f) const override;
    InvertedLists* read(IOReader* f, int io_flags) const override;
};
} // namespace faiss

View File

@ -0,0 +1,116 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_DIRECT_MAP_H
#define FAISS_DIRECT_MAP_H
#include <faiss/invlists/InvertedLists.h>
#include <unordered_map>
namespace faiss {
// When offsets list id + offset are encoded in an uint64
// we call this LO = list-offset
// Pack a (list_id, offset) pair into one 64-bit LO code:
// list id in the high 32 bits, offset in the low 32 bits.
inline uint64_t lo_build(uint64_t list_id, uint64_t offset) {
    uint64_t hi = list_id << 32;
    return hi | offset;
}
// Extract the list id (high 32 bits) from an LO code.
inline uint64_t lo_listno(uint64_t lo) {
    uint64_t list_id = lo >> 32;
    return list_id;
}
// Extract the within-list offset (low 32 bits) from an LO code.
inline uint64_t lo_offset(uint64_t lo) {
    return static_cast<uint32_t>(lo);
}
/**
* Direct map: a way to map back from ids to inverted lists
*/
struct DirectMap {
    typedef Index::idx_t idx_t;

    enum Type {
        NoMap = 0,    // default
        Array = 1,    // sequential ids (only for add, no add_with_ids)
        Hashtable = 2 // arbitrary ids
    };
    Type type;

    /// map for direct access to the elements. Map ids to LO-encoded entries.
    /// which container is used depends on `type` (Array vs Hashtable)
    std::vector<idx_t> array;
    std::unordered_map<idx_t, idx_t> hashtable;

    DirectMap();

    /// set type and initialize
    void set_type(Type new_type, const InvertedLists* invlists, size_t ntotal);

    /// get an entry
    idx_t get(idx_t id) const;

    /// for quick checks
    bool no() const {
        return type == NoMap;
    }

    /**
     * update the direct_map
     */

    /// throw if Array and ids is not NULL
    void check_can_add(const idx_t* ids);

    /// non thread-safe version
    void add_single_id(idx_t id, idx_t list_no, size_t offset);

    /// remove all entries
    void clear();

    /**
     * operations on inverted lists that require translation with a DirectMap
     */

    /// remove ids from the InvertedLists, possibly using the direct map
    size_t remove_ids(const IDSelector& sel, InvertedLists* invlists);

    /// update entries, using the direct map
    void update_codes(
            InvertedLists* invlists,
            int n,
            const idx_t* ids,
            const idx_t* list_nos,
            const uint8_t* codes);
};
/// Thread-safe way of updating the direct_map
struct DirectMapAdd {
    typedef Index::idx_t idx_t;
    using Type = DirectMap::Type;

    DirectMap& direct_map; // map being updated
    DirectMap::Type type;
    size_t ntotal;
    size_t n;          // nb of vectors in this add batch
    const idx_t* xids; // user-provided ids (may be null)

    // per-vector entries collected by add(); presumably committed to
    // direct_map in the destructor — confirm against the .cpp
    std::vector<idx_t> all_ofs;

    DirectMapAdd(DirectMap& direct_map, size_t n, const idx_t* xids);

    /// add vector i (with id xids[i]) at list_no and offset
    void add(size_t i, idx_t list_no, size_t offset);

    ~DirectMapAdd();
};
} // namespace faiss
#endif

View File

@ -0,0 +1,366 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_INVERTEDLISTS_IVF_H
#define FAISS_INVERTEDLISTS_IVF_H
/**
* Definition of inverted lists + a few common classes that implement
* the interface.
*/
#include <faiss/Index.h>
#include <vector>
namespace faiss {
/** Table of inverted lists
* multithreading rules:
* - concurrent read accesses are allowed
* - concurrent update accesses are allowed
* - for resize and add_entries, only concurrent access to different lists
* are allowed
*/
struct InvertedLists {
    typedef Index::idx_t idx_t;

    size_t nlist;     ///< number of possible key values
    size_t code_size; ///< code size per vector in bytes

    InvertedLists(size_t nlist, size_t code_size);

    /// used for BlockInvertedLists, where the codes are packed into groups
    /// and the individual code size is meaningless
    static const size_t INVALID_CODE_SIZE = static_cast<size_t>(-1);

    /*************************
     * Read only functions */

    /// get the size of a list
    virtual size_t list_size(size_t list_no) const = 0;

    /** get the codes for an inverted list
     * must be released by release_codes
     *
     * @return codes    size list_size * code_size
     */
    virtual const uint8_t* get_codes(size_t list_no) const = 0;

    /** get the ids for an inverted list
     * must be released by release_ids
     *
     * @return ids      size list_size
     */
    virtual const idx_t* get_ids(size_t list_no) const = 0;

    /// release codes returned by get_codes (default implementation is nop)
    virtual void release_codes(size_t list_no, const uint8_t* codes) const;

    /// release ids returned by get_ids
    virtual void release_ids(size_t list_no, const idx_t* ids) const;

    /// @return a single id in an inverted list
    virtual idx_t get_single_id(size_t list_no, size_t offset) const;

    /// @return a single code in an inverted list
    /// (should be deallocated with release_codes)
    virtual const uint8_t* get_single_code(size_t list_no, size_t offset) const;

    /// prepare the following lists (default does nothing)
    /// a list can be -1 hence the signed long
    /// NB: the `nlist` parameter here shadows the member of the same name
    virtual void prefetch_lists(const idx_t* list_nos, int nlist) const;

    /*************************
     * writing functions */

    /// add one entry to an inverted list
    virtual size_t add_entry(size_t list_no, idx_t theid, const uint8_t* code);

    virtual size_t add_entries(
            size_t list_no,
            size_t n_entry,
            const idx_t* ids,
            const uint8_t* code) = 0;

    virtual void update_entry(
            size_t list_no,
            size_t offset,
            idx_t id,
            const uint8_t* code);

    virtual void update_entries(
            size_t list_no,
            size_t offset,
            size_t n_entry,
            const idx_t* ids,
            const uint8_t* code) = 0;

    virtual void resize(size_t list_no, size_t new_size) = 0;

    virtual void reset();

    /// move all entries from oivf (empty on output)
    void merge_from(InvertedLists* oivf, size_t add_id);

    virtual ~InvertedLists();

    /*************************
     * statistics */

    /// 1= perfectly balanced, >1: imbalanced
    double imbalance_factor() const;

    /// display some stats about the inverted lists
    void print_stats() const;

    /// sum up list sizes
    size_t compute_ntotal() const;

    /**************************************
     * Scoped inverted lists (for automatic deallocation)
     *
     * instead of writing:
     *
     *     uint8_t * codes = invlists->get_codes (10);
     *     ... use codes
     *     invlists->release_codes(10, codes)
     *
     * write:
     *
     *     ScopedCodes codes (invlists, 10);
     *     ... use codes.get()
     *     // release called automatically when codes goes out of scope
     *
     * the following function call also works:
     *
     *     foo (123, ScopedCodes (invlists, 10).get(), 456);
     *
     */

    /// RAII wrapper around get_ids / release_ids
    struct ScopedIds {
        const InvertedLists* il;
        const idx_t* ids;
        size_t list_no;

        ScopedIds(const InvertedLists* il, size_t list_no)
                : il(il), ids(il->get_ids(list_no)), list_no(list_no) {}

        const idx_t* get() {
            return ids;
        }

        idx_t operator[](size_t i) const {
            return ids[i];
        }

        ~ScopedIds() {
            il->release_ids(list_no, ids);
        }
    };

    /// RAII wrapper around get_codes / release_codes
    struct ScopedCodes {
        const InvertedLists* il;
        const uint8_t* codes;
        size_t list_no;

        ScopedCodes(const InvertedLists* il, size_t list_no)
                : il(il), codes(il->get_codes(list_no)), list_no(list_no) {}

        /// single-code variant: fetches only the code at (list_no, offset)
        ScopedCodes(const InvertedLists* il, size_t list_no, size_t offset)
                : il(il),
                  codes(il->get_single_code(list_no, offset)),
                  list_no(list_no) {}

        const uint8_t* get() {
            return codes;
        }

        ~ScopedCodes() {
            il->release_codes(list_no, codes);
        }
    };
};
/// simple (default) implementation as an array of inverted lists,
/// all resident in RAM
struct ArrayInvertedLists : InvertedLists {
    std::vector<std::vector<uint8_t>> codes; // binary codes, size nlist
    std::vector<std::vector<idx_t>> ids;     ///< Inverted lists for indexes

    ArrayInvertedLists(size_t nlist, size_t code_size);

    size_t list_size(size_t list_no) const override;
    const uint8_t* get_codes(size_t list_no) const override;
    const idx_t* get_ids(size_t list_no) const override;

    size_t add_entries(
            size_t list_no,
            size_t n_entry,
            const idx_t* ids,
            const uint8_t* code) override;

    void update_entries(
            size_t list_no,
            size_t offset,
            size_t n_entry,
            const idx_t* ids,
            const uint8_t* code) override;

    void resize(size_t list_no, size_t new_size) override;

    ~ArrayInvertedLists() override;
};
/*****************************************************************
* Meta-inverted lists
*
* About terminology: the inverted lists are seen as a sparse matrix,
* that can be stacked horizontally, vertically and sliced.
*****************************************************************/
/// invlists that fail for all write functions
/// (the three overrides below exist only to raise errors)
struct ReadOnlyInvertedLists : InvertedLists {
    ReadOnlyInvertedLists(size_t nlist, size_t code_size)
            : InvertedLists(nlist, code_size) {}

    size_t add_entries(
            size_t list_no,
            size_t n_entry,
            const idx_t* ids,
            const uint8_t* code) override;

    void update_entries(
            size_t list_no,
            size_t offset,
            size_t n_entry,
            const idx_t* ids,
            const uint8_t* code) override;

    void resize(size_t list_no, size_t new_size) override;
};
/// Horizontal stack of inverted lists: list i is the concatenation of
/// list i of each of the stacked (read-only) invlists
struct HStackInvertedLists : ReadOnlyInvertedLists {
    std::vector<const InvertedLists*> ils; // non-owning

    /// build InvertedLists by concatenating nil of them
    HStackInvertedLists(int nil, const InvertedLists** ils);

    size_t list_size(size_t list_no) const override;
    const uint8_t* get_codes(size_t list_no) const override;
    const idx_t* get_ids(size_t list_no) const override;

    void prefetch_lists(const idx_t* list_nos, int nlist) const override;

    void release_codes(size_t list_no, const uint8_t* codes) const override;
    void release_ids(size_t list_no, const idx_t* ids) const override;

    idx_t get_single_id(size_t list_no, size_t offset) const override;

    const uint8_t* get_single_code(size_t list_no, size_t offset)
            const override;
};

using ConcatenatedInvertedLists = HStackInvertedLists;
/// vertical slice of indexes in another InvertedLists
/// (exposes lists i0..i1 of the wrapped, non-owned invlists)
struct SliceInvertedLists : ReadOnlyInvertedLists {
    const InvertedLists* il;
    idx_t i0, i1; // slice bounds

    SliceInvertedLists(const InvertedLists* il, idx_t i0, idx_t i1);

    size_t list_size(size_t list_no) const override;
    const uint8_t* get_codes(size_t list_no) const override;
    const idx_t* get_ids(size_t list_no) const override;

    void release_codes(size_t list_no, const uint8_t* codes) const override;
    void release_ids(size_t list_no, const idx_t* ids) const override;

    idx_t get_single_id(size_t list_no, size_t offset) const override;

    const uint8_t* get_single_code(size_t list_no, size_t offset)
            const override;

    void prefetch_lists(const idx_t* list_nos, int nlist) const override;
};
/// Vertical stack of inverted lists: the nlist ranges of the stacked
/// invlists are concatenated (cumsz holds the cumulative list counts)
struct VStackInvertedLists : ReadOnlyInvertedLists {
    std::vector<const InvertedLists*> ils; // non-owning
    std::vector<idx_t> cumsz;              // cumulative sizes, for dispatch

    /// build InvertedLists by concatenating nil of them
    VStackInvertedLists(int nil, const InvertedLists** ils);

    size_t list_size(size_t list_no) const override;
    const uint8_t* get_codes(size_t list_no) const override;
    const idx_t* get_ids(size_t list_no) const override;

    void release_codes(size_t list_no, const uint8_t* codes) const override;
    void release_ids(size_t list_no, const idx_t* ids) const override;

    idx_t get_single_id(size_t list_no, size_t offset) const override;

    const uint8_t* get_single_code(size_t list_no, size_t offset)
            const override;

    void prefetch_lists(const idx_t* list_nos, int nlist) const override;
};
/** use the first inverted lists if they are non-empty otherwise use the second
 *
 * This is useful if il1 has a few inverted lists that are too long,
 * and that il0 has replacement lists for those, with empty lists for
 * the others. */
struct MaskedInvertedLists : ReadOnlyInvertedLists {
    const InvertedLists* il0; // replacement lists (takes precedence)
    const InvertedLists* il1; // fallback lists

    MaskedInvertedLists(const InvertedLists* il0, const InvertedLists* il1);

    size_t list_size(size_t list_no) const override;
    const uint8_t* get_codes(size_t list_no) const override;
    const idx_t* get_ids(size_t list_no) const override;

    void release_codes(size_t list_no, const uint8_t* codes) const override;
    void release_ids(size_t list_no, const idx_t* ids) const override;

    idx_t get_single_id(size_t list_no, size_t offset) const override;

    const uint8_t* get_single_code(size_t list_no, size_t offset)
            const override;

    void prefetch_lists(const idx_t* list_nos, int nlist) const override;
};
/** if the inverted list in il is smaller than maxsize then return it,
 * otherwise return an empty invlist (i.e. drop over-long "stop word" lists) */
struct StopWordsInvertedLists : ReadOnlyInvertedLists {
    const InvertedLists* il0; // wrapped invlists, non-owning
    size_t maxsize;           // lists >= this size appear empty

    StopWordsInvertedLists(const InvertedLists* il, size_t maxsize);

    size_t list_size(size_t list_no) const override;
    const uint8_t* get_codes(size_t list_no) const override;
    const idx_t* get_ids(size_t list_no) const override;

    void release_codes(size_t list_no, const uint8_t* codes) const override;
    void release_ids(size_t list_no, const idx_t* ids) const override;

    idx_t get_single_id(size_t list_no, size_t offset) const override;

    const uint8_t* get_single_code(size_t list_no, size_t offset)
            const override;

    void prefetch_lists(const idx_t* list_nos, int nlist) const override;
};
} // namespace faiss
#endif

View File

@ -0,0 +1,62 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <faiss/impl/io.h>
#include <faiss/invlists/InvertedLists.h>
#include <string>
namespace faiss {
/** Callbacks to handle other types of InvertedList objects.
 *
 * The callbacks should be registered with add_callback before calling
 * read_index or read_InvertedLists. The callbacks for
 * OnDiskInvertedLists are registered by default. The invlist type is
 * identified by:
 *
 * - the key (a fourcc) at read time
 * - the class name (as given by typeid.name) at write time
 */
struct InvertedListsIOHook {
    const std::string key;       ///< string version of the fourcc
    const std::string classname; ///< typeid.name

    InvertedListsIOHook(const std::string& key, const std::string& classname);

    /// write the index to the IOWriter (including the fourcc)
    virtual void write(const InvertedLists* ils, IOWriter* f) const = 0;

    /// called when the fourcc matches this class's fourcc
    virtual InvertedLists* read(IOReader* f, int io_flags) const = 0;

    /** read from a ArrayInvertedLists into this invertedlist type.
     * For this to work, the callback has to be enabled and the io_flag has to
     * be set to IO_FLAG_SKIP_IVF_DATA | (16 upper bits of the fourcc)
     *
     * (default implementation fails)
     */
    virtual InvertedLists* read_ArrayInvertedLists(
            IOReader* f,
            int io_flags,
            size_t nlist,
            size_t code_size,
            const std::vector<size_t>& sizes) const;

    virtual ~InvertedListsIOHook() {}

    /**************************** Manage the set of callbacks ******/

    // transfers ownership
    static void add_callback(InvertedListsIOHook*);
    static void print_callbacks();
    static InvertedListsIOHook* lookup(int h);
    static InvertedListsIOHook* lookup_classname(const std::string& classname);
};
} // namespace faiss

View File

@ -0,0 +1,155 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_ON_DISK_INVERTED_LISTS_H
#define FAISS_ON_DISK_INVERTED_LISTS_H
#include <list>
#include <typeinfo>
#include <vector>
#include <faiss/IndexIVF.h>
#include <faiss/index_io.h>
#include <faiss/invlists/InvertedListsIOHook.h>
namespace faiss {
struct LockLevels;
/// bookkeeping record for one on-disk inverted list
struct OnDiskOneList {
    size_t size;     // size of inverted list (entries)
    size_t capacity; // allocated size (entries)
    size_t offset;   // offset in buffer (bytes)
    OnDiskOneList();
};
/** On-disk storage of inverted lists.
*
 * The data is stored in a mmapped chunk of memory (base pointer ptr,
* size totsize). Each list is a range of memory that contains (object
* List) that contains:
*
* - uint8_t codes[capacity * code_size]
* - followed by idx_t ids[capacity]
*
* in each of the arrays, the size <= capacity first elements are
* used, the rest is not initialized.
*
* Addition and resize are supported by:
 * - rounding up the capacity of the lists to a power of two
* - maintaining a list of empty slots, sorted by size.
* - resizing the mmapped block is adjusted as needed.
*
* An OnDiskInvertedLists is compact if the size == capacity for all
* lists and there are no available slots.
*
* Addition to the invlists is slow. For incremental add it is better
* to use a default ArrayInvertedLists object and convert it to an
* OnDisk with merge_from.
*
* When it is known that a set of lists will be accessed, it is useful
* to call prefetch_lists, that launches a set of threads to read the
* lists in parallel.
*/
struct OnDiskInvertedLists : InvertedLists {
    using List = OnDiskOneList;

    // size nlist
    std::vector<List> lists;

    /// a free range inside the mmapped file
    struct Slot {
        size_t offset;   // bytes
        size_t capacity; // bytes
        Slot(size_t offset, size_t capacity);
        Slot();
    };

    // size whatever space remains
    std::list<Slot> slots;

    std::string filename;
    size_t totsize;
    uint8_t* ptr;   // mmap base pointer
    bool read_only; /// are inverted lists mapped read-only

    OnDiskInvertedLists(size_t nlist, size_t code_size, const char* filename);

    size_t list_size(size_t list_no) const override;
    const uint8_t* get_codes(size_t list_no) const override;
    const idx_t* get_ids(size_t list_no) const override;

    size_t add_entries(
            size_t list_no,
            size_t n_entry,
            const idx_t* ids,
            const uint8_t* code) override;

    void update_entries(
            size_t list_no,
            size_t offset,
            size_t n_entry,
            const idx_t* ids,
            const uint8_t* code) override;

    void resize(size_t list_no, size_t new_size) override;

    // copy all inverted lists into *this, in compact form (without
    // allocating slots)
    size_t merge_from(
            const InvertedLists** ils,
            int n_il,
            bool verbose = false);

    /// same as merge_from for a single invlist
    size_t merge_from_1(const InvertedLists* il, bool verbose = false);

    /// restrict the inverted lists to l0:l1 without touching the mmapped region
    void crop_invlists(size_t l0, size_t l1);

    void prefetch_lists(const idx_t* list_nos, int nlist) const override;

    ~OnDiskInvertedLists() override;

    // private

    LockLevels* locks;

    // encapsulates the threads that are busy prefetching
    struct OngoingPrefetch;
    OngoingPrefetch* pf;
    int prefetch_nthread;

    void do_mmap();
    void update_totsize(size_t new_totsize);
    void resize_locked(size_t list_no, size_t new_size);
    size_t allocate_slot(size_t capacity);
    void free_slot(size_t offset, size_t capacity);

    /// override all list sizes and make a packed storage
    void set_all_lists_sizes(const size_t* sizes);

    // empty constructor for the I/O functions
    OnDiskInvertedLists();
};
/// I/O hook that (de)serializes OnDiskInvertedLists objects
struct OnDiskInvertedListsIOHook : InvertedListsIOHook {
    OnDiskInvertedListsIOHook();
    void write(const InvertedLists* ils, IOWriter* f) const override;
    InvertedLists* read(IOReader* f, int io_flags) const override;
    InvertedLists* read_ArrayInvertedLists(
            IOReader* f,
            int io_flags,
            size_t nlist,
            size_t code_size,
            const std::vector<size_t>& sizes) const override;
};
} // namespace faiss
#endif

View File

@ -0,0 +1,176 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <algorithm>
#include <faiss/impl/platform_macros.h>
namespace faiss {
// Check whether pointer x is aligned on an A-byte boundary
// (i.e. its address is a multiple of A).
template <int A = 32>
inline bool is_aligned_pointer(const void* x) {
    return reinterpret_cast<size_t>(x) % A == 0;
}
// class that manages suitably aligned arrays for SIMD
// T should be a POD type. The default alignment is 32 for AVX
template <class T, int A = 32>
struct AlignedTableTightAlloc {
    T* ptr;       // A-byte-aligned storage (posix_memalign), or nullptr
    size_t numel; // number of elements allocated

    AlignedTableTightAlloc() : ptr(nullptr), numel(0) {}

    explicit AlignedTableTightAlloc(size_t n) : ptr(nullptr), numel(0) {
        resize(n);
    }

    size_t itemsize() const {
        return sizeof(T);
    }

    // reallocate to exactly n elements, preserving the first
    // min(numel, n) existing elements; n == 0 frees the storage
    void resize(size_t n) {
        if (numel == n) {
            return;
        }
        T* new_ptr;
        if (n > 0) {
            int ret = posix_memalign((void**)&new_ptr, A, n * sizeof(T));
            if (ret != 0) {
                throw std::bad_alloc();
            }
            if (numel > 0) {
                memcpy(new_ptr, ptr, sizeof(T) * std::min(numel, n));
            }
        } else {
            new_ptr = nullptr;
        }
        numel = n;
        // posix_memalign_free: faiss macro from platform_macros.h
        posix_memalign_free(ptr);
        ptr = new_ptr;
    }

    // zero-fill the whole table
    void clear() {
        memset(ptr, 0, nbytes());
    }
    size_t size() const {
        return numel;
    }
    size_t nbytes() const {
        return numel * sizeof(T);
    }

    T* get() {
        return ptr;
    }
    const T* get() const {
        return ptr;
    }
    T* data() {
        return ptr;
    }
    const T* data() const {
        return ptr;
    }
    T& operator[](size_t i) {
        return ptr[i];
    }
    T operator[](size_t i) const {
        return ptr[i];
    }

    ~AlignedTableTightAlloc() {
        posix_memalign_free(ptr);
    }

    // deep-copy semantics; no move operations are declared, so passing
    // by value always copies the buffer
    AlignedTableTightAlloc<T, A>& operator=(
            const AlignedTableTightAlloc<T, A>& other) {
        resize(other.numel);
        memcpy(ptr, other.ptr, sizeof(T) * numel);
        return *this;
    }

    AlignedTableTightAlloc(const AlignedTableTightAlloc<T, A>& other)
            : ptr(nullptr), numel(0) {
        *this = other;
    }
};
// same as AlignedTableTightAlloc, but with geometric re-allocation:
// the backing table is sized to round_capacity(n), so repeated resizes
// only reallocate when the rounded capacity actually grows.
template <class T, int A = 32>
struct AlignedTable {
    AlignedTableTightAlloc<T, A> tab; // backing storage, capacity-sized
    size_t numel = 0;                 // logical number of elements

    // smallest capacity >= n of the form 8 * A * 2^m (0 stays 0)
    static size_t round_capacity(size_t n) {
        if (n == 0) {
            return 0;
        }
        size_t cap = 8 * A;
        while (cap < n) {
            cap += cap;
        }
        return cap;
    }

    AlignedTable() {}

    explicit AlignedTable(size_t n) : tab(round_capacity(n)), numel(n) {}

    size_t itemsize() const {
        return sizeof(T);
    }
    size_t size() const {
        return numel;
    }
    size_t nbytes() const {
        return numel * sizeof(T);
    }

    void resize(size_t n) {
        tab.resize(round_capacity(n));
        numel = n;
    }

    void clear() {
        tab.clear();
    }

    T* get() {
        return tab.get();
    }
    const T* get() const {
        return tab.get();
    }
    T* data() {
        return tab.get();
    }
    const T* data() const {
        return tab.get();
    }
    T& operator[](size_t i) {
        return tab.ptr[i];
    }
    T operator[](size_t i) const {
        return tab.ptr[i];
    }

    // assign and copy constructor should work as expected
};
} // namespace faiss

View File

@ -0,0 +1,481 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
/*
* C++ support for heaps. The set of functions is tailored for efficient
* similarity search.
*
* There is no specific object for a heap, and the functions that operate on a
* single heap are inlined, because heaps are often small. More complex
* functions are implemented in Heaps.cpp
*
* All heap functions rely on a C template class that define the type of the
* keys and values and their ordering (increasing with CMax and decreasing with
* Cmin). The C types are defined in ordered_key_value.h
*/
#ifndef FAISS_Heap_h
#define FAISS_Heap_h
#include <climits>
#include <cmath>
#include <cstring>
#include <stdint.h>
#include <cassert>
#include <cstdio>
#include <limits>
#include <faiss/utils/ordered_key_value.h>
namespace faiss {
/*******************************************************************
* Basic heap ops: push and pop
*******************************************************************/
/** Pops the top element from the heap defined by bh_val[0..k-1] and
 * bh_ids[0..k-1]. on output the element at k-1 is undefined.
 */
template <class C>
inline void heap_pop(size_t k, typename C::T* bh_val, typename C::TI* bh_ids) {
    bh_val--; /* Use 1-based indexing for easier node->child translation */
    bh_ids--;
    // sift the last element (1-based slot k) down from the root to
    // restore the heap property on the remaining k-1 slots
    typename C::T val = bh_val[k];
    size_t i = 1, i1, i2;
    while (1) {
        i1 = i << 1; // left child
        i2 = i1 + 1; // right child
        if (i1 > k)
            break;
        // pick the child that should rise; i2 == k + 1 means node i has
        // no right child (short-circuit avoids reading past the heap)
        if (i2 == k + 1 || C::cmp(bh_val[i1], bh_val[i2])) {
            if (C::cmp(val, bh_val[i1]))
                break;
            bh_val[i] = bh_val[i1];
            bh_ids[i] = bh_ids[i1];
            i = i1;
        } else {
            if (C::cmp(val, bh_val[i2]))
                break;
            bh_val[i] = bh_val[i2];
            bh_ids[i] = bh_ids[i2];
            i = i2;
        }
    }
    bh_val[i] = bh_val[k];
    bh_ids[i] = bh_ids[k];
}
/** Pushes the element (val, ids) into the heap bh_val[0..k-2] and
 * bh_ids[0..k-2]. on output the element at k-1 is defined.
 */
template <class C>
inline void heap_push(
        size_t k,
        typename C::T* bh_val,
        typename C::TI* bh_ids,
        typename C::T val,
        typename C::TI ids) {
    bh_val--; /* Use 1-based indexing for easier node->child translation */
    bh_ids--;
    // sift up: walk from slot k toward the root, shifting parents down
    // until the heap property holds, then drop the new element in
    size_t i = k, i_father;
    while (i > 1) {
        i_father = i >> 1;
        if (!C::cmp(val, bh_val[i_father])) /* the heap structure is ok */
            break;
        bh_val[i] = bh_val[i_father];
        bh_ids[i] = bh_ids[i_father];
        i = i_father;
    }
    bh_val[i] = val;
    bh_ids[i] = ids;
}
/** Replace the top element from the heap defined by bh_val[0..k-1] and
 * bh_ids[0..k-1].
 */
template <class C>
inline void heap_replace_top(
        size_t k,
        typename C::T* bh_val,
        typename C::TI* bh_ids,
        typename C::T val,
        typename C::TI ids) {
    bh_val--; /* Use 1-based indexing for easier node->child translation */
    bh_ids--;
    // same sift-down as heap_pop, but the new element (val, ids) takes
    // the place of the evicted top instead of shrinking the heap
    size_t i = 1, i1, i2;
    while (1) {
        i1 = i << 1;
        i2 = i1 + 1;
        if (i1 > k)
            break;
        // i2 == k + 1: node i only has a left child
        if (i2 == k + 1 || C::cmp(bh_val[i1], bh_val[i2])) {
            if (C::cmp(val, bh_val[i1]))
                break;
            bh_val[i] = bh_val[i1];
            bh_ids[i] = bh_ids[i1];
            i = i1;
        } else {
            if (C::cmp(val, bh_val[i2]))
                break;
            bh_val[i] = bh_val[i2];
            bh_ids[i] = bh_ids[i2];
            i = i2;
        }
    }
    bh_val[i] = val;
    bh_ids[i] = ids;
}
/* Partial instantiation for heaps with TI = int64_t */

// CMin-ordered variants (comparator defined in ordered_key_value.h)
template <typename T>
inline void minheap_pop(size_t k, T* bh_val, int64_t* bh_ids) {
    heap_pop<CMin<T, int64_t>>(k, bh_val, bh_ids);
}

template <typename T>
inline void minheap_push(
        size_t k,
        T* bh_val,
        int64_t* bh_ids,
        T val,
        int64_t ids) {
    heap_push<CMin<T, int64_t>>(k, bh_val, bh_ids, val, ids);
}

template <typename T>
inline void minheap_replace_top(
        size_t k,
        T* bh_val,
        int64_t* bh_ids,
        T val,
        int64_t ids) {
    heap_replace_top<CMin<T, int64_t>>(k, bh_val, bh_ids, val, ids);
}

// CMax-ordered variants
template <typename T>
inline void maxheap_pop(size_t k, T* bh_val, int64_t* bh_ids) {
    heap_pop<CMax<T, int64_t>>(k, bh_val, bh_ids);
}

template <typename T>
inline void maxheap_push(
        size_t k,
        T* bh_val,
        int64_t* bh_ids,
        T val,
        int64_t ids) {
    heap_push<CMax<T, int64_t>>(k, bh_val, bh_ids, val, ids);
}

template <typename T>
inline void maxheap_replace_top(
        size_t k,
        T* bh_val,
        int64_t* bh_ids,
        T val,
        int64_t ids) {
    heap_replace_top<CMax<T, int64_t>>(k, bh_val, bh_ids, val, ids);
}
/*******************************************************************
* Heap initialization
*******************************************************************/
/* Initialization phase for the heap (with unconditional pushes).
 * Store k0 elements in a heap containing up to k values. Note that
 * (bh_val, bh_ids) can be the same as (x, ids) */
template <class C>
inline void heap_heapify(
        size_t k,
        typename C::T* bh_val,
        typename C::TI* bh_ids,
        const typename C::T* x = nullptr,
        const typename C::TI* ids = nullptr,
        size_t k0 = 0) {
    if (k0 > 0)
        assert(x);

    // push the k0 initial elements (sequential ids when none given)
    if (ids) {
        for (size_t i = 0; i < k0; i++)
            heap_push<C>(i + 1, bh_val, bh_ids, x[i], ids[i]);
    } else {
        for (size_t i = 0; i < k0; i++)
            heap_push<C>(i + 1, bh_val, bh_ids, x[i], i);
    }

    // fill the remaining slots with the neutral value and id -1
    for (size_t i = k0; i < k; i++) {
        bh_val[i] = C::neutral();
        bh_ids[i] = -1;
    }
}
// int64_t-id convenience wrappers around heap_heapify
template <typename T>
inline void minheap_heapify(
        size_t k,
        T* bh_val,
        int64_t* bh_ids,
        const T* x = nullptr,
        const int64_t* ids = nullptr,
        size_t k0 = 0) {
    heap_heapify<CMin<T, int64_t>>(k, bh_val, bh_ids, x, ids, k0);
}

template <typename T>
inline void maxheap_heapify(
        size_t k,
        T* bh_val,
        int64_t* bh_ids,
        const T* x = nullptr,
        const int64_t* ids = nullptr,
        size_t k0 = 0) {
    heap_heapify<CMax<T, int64_t>>(k, bh_val, bh_ids, x, ids, k0);
}
/*******************************************************************
* Add n elements to the heap
*******************************************************************/
/* Add some elements to the heap: each x[i] that improves on the current
 * top replaces it (id taken from ids when provided, else the index i). */
template <class C>
inline void heap_addn(
        size_t k,
        typename C::T* bh_val,
        typename C::TI* bh_ids,
        const typename C::T* x,
        const typename C::TI* ids,
        size_t n) {
    for (size_t i = 0; i < n; i++) {
        if (!C::cmp(bh_val[0], x[i])) {
            continue; // x[i] does not beat the current top
        }
        typename C::TI id = ids ? ids[i] : static_cast<typename C::TI>(i);
        heap_replace_top<C>(k, bh_val, bh_ids, x[i], id);
    }
}
/* Partial instantiation for heaps with TI = int64_t */

template <typename T>
inline void minheap_addn(
        size_t k,
        T* bh_val,
        int64_t* bh_ids,
        const T* x,
        const int64_t* ids,
        size_t n) {
    heap_addn<CMin<T, int64_t>>(k, bh_val, bh_ids, x, ids, n);
}

template <typename T>
inline void maxheap_addn(
        size_t k,
        T* bh_val,
        int64_t* bh_ids,
        const T* x,
        const int64_t* ids,
        size_t n) {
    heap_addn<CMax<T, int64_t>>(k, bh_val, bh_ids, x, ids, n);
}
/*******************************************************************
* Heap finalization (reorder elements)
*******************************************************************/
/* This function maps a binary heap into a sorted structure.
   It returns the number of valid (id != -1) elements. */
template <typename C>
inline size_t heap_reorder(
        size_t k,
        typename C::T* bh_val,
        typename C::TI* bh_ids) {
    size_t i, ii;

    for (i = 0, ii = 0; i < k; i++) {
        /* top element should be put at the end of the list */
        typename C::T val = bh_val[0];
        typename C::TI id = bh_ids[0];

        /* boundary case: we will over-ride this value if not a true element */
        heap_pop<C>(k - i, bh_val, bh_ids);
        bh_val[k - ii - 1] = val;
        bh_ids[k - ii - 1] = id;
        if (id != -1)
            ii++;
    }
    /* Count the number of elements which are effectively returned */
    size_t nel = ii;

    // compact the nel valid elements to the front of the arrays, then
    // pad the tail with the neutral value and id -1
    memmove(bh_val, bh_val + k - ii, ii * sizeof(*bh_val));
    memmove(bh_ids, bh_ids + k - ii, ii * sizeof(*bh_ids));

    for (; ii < k; ii++) {
        bh_val[ii] = C::neutral();
        bh_ids[ii] = -1;
    }
    return nel;
}
// int64_t-id convenience wrappers around heap_reorder
template <typename T>
inline size_t minheap_reorder(size_t k, T* bh_val, int64_t* bh_ids) {
    return heap_reorder<CMin<T, int64_t>>(k, bh_val, bh_ids);
}

template <typename T>
inline size_t maxheap_reorder(size_t k, T* bh_val, int64_t* bh_ids) {
    return heap_reorder<CMax<T, int64_t>>(k, bh_val, bh_ids);
}
/*******************************************************************
* Operations on heap arrays
*******************************************************************/
/** a template structure for a set of [min|max]-heaps it is tailored
* so that the actual data of the heaps can just live in compact
* arrays.
*/
template <typename C>
struct HeapArray {
    typedef typename C::TI TI;
    typedef typename C::T T;
    size_t nh; ///< number of heaps
    size_t k;  ///< allocated size per heap
    TI* ids;   ///< identifiers (size nh * k)
    T* val;    ///< values (distances or similarities), size nh * k
    /// Return the list of values for a heap
    T* get_val(size_t key) {
        return val + key * k;
    }
    /// Corresponding identifiers
    TI* get_ids(size_t key) {
        return ids + key * k;
    }
    /// prepare all the heaps before adding
    void heapify();
    /** add nj elements to heaps i0:i0+ni, with sequential ids
     *
     * @param nj   nb of elements to add to each heap
     * @param vin  elements to add, size ni * nj
     * @param j0   add this to the ids that are added
     * @param i0   first heap to update
     * @param ni   nb of elements to update (-1 = use nh)
     */
    void addn(
            size_t nj,
            const T* vin,
            TI j0 = 0,
            size_t i0 = 0,
            int64_t ni = -1);
    /** same as addn
     *
     * @param id_in     ids of the elements to add, size ni * nj
     * @param id_stride stride for id_in
     */
    void addn_with_ids(
            size_t nj,
            const T* vin,
            const TI* id_in = nullptr,
            int64_t id_stride = 0,
            size_t i0 = 0,
            int64_t ni = -1);
    /// reorder all the heaps
    void reorder();
    /** this is not really a heap function. It just finds the per-line
     *  extrema of each line of array D
     *  @param vals_out    extreme value of each line (size nh, or NULL)
     *  @param idx_out     index of extreme value (size nh or NULL)
     */
    void per_line_extrema(T* vals_out, TI* idx_out) const;
};
/* Define useful heaps */
typedef HeapArray<CMin<float, int64_t>> float_minheap_array_t;
typedef HeapArray<CMin<int, int64_t>> int_minheap_array_t;
typedef HeapArray<CMax<float, int64_t>> float_maxheap_array_t;
typedef HeapArray<CMax<int, int64_t>> int_maxheap_array_t;
// The heap templates are instantiated explicitly in Heap.cpp
/*********************************************************************
* Indirect heaps: instead of having
*
* node i = (bh_ids[i], bh_val[i]),
*
* in indirect heaps,
*
* node i = (bh_ids[i], bh_val[bh_ids[i]]),
*
*********************************************************************/
/** Pop the root of an indirect heap of size k.
 *
 * In an indirect heap, bh_ids holds permutation indices and the heap
 * order is defined by bh_val[bh_ids[i]]; bh_val itself is never moved.
 * On output the remaining heap occupies bh_ids[0 .. k-2].
 */
template <class C>
inline void indirect_heap_pop(
        size_t k,
        const typename C::T* bh_val,
        typename C::TI* bh_ids) {
    bh_ids--; /* Use 1-based indexing for easier node->child translation */
    typename C::T val = bh_val[bh_ids[k]];
    size_t i = 1;
    while (1) {
        size_t i1 = i << 1;
        size_t i2 = i1 + 1;
        if (i1 > k)
            break;
        typename C::TI id1 = bh_ids[i1];
        // Only dereference bh_ids[i2] when i2 is inside the heap: the
        // previous code read it unconditionally, which is a
        // one-past-the-end access whenever i1 == k.
        if (i2 > k || C::cmp(bh_val[id1], bh_val[bh_ids[i2]])) {
            if (C::cmp(val, bh_val[id1]))
                break;
            bh_ids[i] = id1;
            i = i1;
        } else {
            typename C::TI id2 = bh_ids[i2];
            if (C::cmp(val, bh_val[id2]))
                break;
            bh_ids[i] = id2;
            i = i2;
        }
    }
    bh_ids[i] = bh_ids[k];
}
/** Insert element id into an indirect heap that will then hold k
 * elements. The heap order is defined by bh_val[bh_ids[i]]; only the
 * index array bh_ids is rearranged. */
template <class C>
inline void indirect_heap_push(
        size_t k,
        const typename C::T* bh_val,
        typename C::TI* bh_ids,
        typename C::TI id) {
    bh_ids--; /* 1-based indexing simplifies the child->parent mapping */
    const typename C::T new_val = bh_val[id];
    size_t slot = k;
    // sift up: shift parents down while the new value ranks before them
    for (size_t parent = slot >> 1; slot > 1; slot = parent, parent >>= 1) {
        if (!C::cmp(new_val, bh_val[bh_ids[parent]]))
            break;
        bh_ids[slot] = bh_ids[parent];
    }
    bh_ids[slot] = id;
}
} // namespace faiss
#endif /* FAISS_Heap_h */

View File

@ -0,0 +1,60 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <condition_variable>
#include <deque>
#include <future>
#include <thread>
namespace faiss {
/// A single background thread that executes queued lambdas in FIFO
/// order. Method bodies live in WorkerThread.cpp.
class WorkerThread {
   public:
    WorkerThread();

    /// Stops and waits for the worker thread to exit, flushing all
    /// pending lambdas
    ~WorkerThread();

    /// Request that the worker thread stop itself
    void stop();

    /// Blocking waits in the current thread for the worker thread to
    /// stop
    void waitForThreadExit();

    /// Adds a lambda to run on the worker thread; returns a future that
    /// can be used to block on its completion.
    /// Future status is `true` if the lambda was run in the worker
    /// thread; `false` if it was not run, because the worker thread is
    /// exiting or has exited.
    std::future<bool> add(std::function<void()> f);

   private:
    void startThread();
    void threadMain();
    void threadLoop();

    /// Thread that all queued lambdas are run on
    std::thread thread_;

    /// Mutex for the queue and exit status
    std::mutex mutex_;

    /// Monitor for the exit status and the queue
    std::condition_variable monitor_;

    /// Whether or not we want the thread to exit
    bool wantStop_;

    /// Queue of pending lambdas to call, each paired with the promise
    /// fulfilled when (or if) it runs
    std::deque<std::pair<std::function<void()>, std::promise<bool>>> queue_;
};
} // namespace faiss

View File

@ -0,0 +1,300 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
/* All distance functions for L2 and IP distances.
* The actual functions are implemented in distances.cpp and distances_simd.cpp
*/
#pragma once
#include <stdint.h>
#include <faiss/impl/platform_macros.h>
#include <faiss/utils/Heap.h>
namespace faiss {
/*********************************************************
* Optimized distance/norm/inner prod computations
*********************************************************/
/// Squared L2 distance between two vectors
float fvec_L2sqr(const float* x, const float* y, size_t d);
/// inner product
float fvec_inner_product(const float* x, const float* y, size_t d);
/// L1 distance
float fvec_L1(const float* x, const float* y, size_t d);
/// infinity distance
float fvec_Linf(const float* x, const float* y, size_t d);
/** Compute pairwise distances between sets of vectors
*
* @param d dimension of the vectors
* @param nq nb of query vectors
* @param nb nb of database vectors
* @param xq query vectors (size nq * d)
* @param xb database vectors (size nb * d)
* @param dis output distances (size nq * nb)
* @param ldq,ldb, ldd strides for the matrices
*/
void pairwise_L2sqr(
int64_t d,
int64_t nq,
const float* xq,
int64_t nb,
const float* xb,
float* dis,
int64_t ldq = -1,
int64_t ldb = -1,
int64_t ldd = -1);
/* compute the inner product between nx vectors x and one y */
void fvec_inner_products_ny(
float* ip, /* output inner product */
const float* x,
const float* y,
size_t d,
size_t ny);
/* compute ny square L2 distance between x and a set of contiguous y vectors */
void fvec_L2sqr_ny(
float* dis,
const float* x,
const float* y,
size_t d,
size_t ny);
/** squared norm of a vector */
float fvec_norm_L2sqr(const float* x, size_t d);
/** compute the L2 norms for a set of vectors
*
* @param norms output norms, size nx
* @param x set of vectors, size nx * d
*/
void fvec_norms_L2(float* norms, const float* x, size_t d, size_t nx);
/// same as fvec_norms_L2, but computes squared norms
void fvec_norms_L2sqr(float* norms, const float* x, size_t d, size_t nx);
/* L2-renormalize a set of vector. Nothing done if the vector is 0-normed */
void fvec_renorm_L2(size_t d, size_t nx, float* x);
/* This function exists because the Torch counterpart is extremely slow
(not multi-threaded + unexpected overhead even in single thread).
It is here to implement the usual property |x-y|^2=|x|^2+|y|^2-2<x|y> */
void inner_product_to_L2sqr(
float* dis,
const float* nr1,
const float* nr2,
size_t n1,
size_t n2);
/*********************************************************
* Vector to vector functions
*********************************************************/
/** compute c := a + b for vectors
*
* c and a can overlap, c and b can overlap
*
* @param a size d
* @param b size d
* @param c size d
*/
void fvec_add(size_t d, const float* a, const float* b, float* c);
/** compute c := a + b for a, c vectors and b a scalar
*
* c and a can overlap
*
* @param a size d
* @param c size d
*/
void fvec_add(size_t d, const float* a, float b, float* c);
/** compute c := a - b for vectors
*
* c and a can overlap, c and b can overlap
*
* @param a size d
* @param b size d
* @param c size d
*/
void fvec_sub(size_t d, const float* a, const float* b, float* c);
/***************************************************************************
* Compute a subset of distances
***************************************************************************/
/* compute the inner product between x and a subset y of ny vectors,
whose indices are given by idy. */
void fvec_inner_products_by_idx(
float* ip,
const float* x,
const float* y,
const int64_t* ids,
size_t d,
size_t nx,
size_t ny);
/* same but for a subset in y indexed by idsy (ny vectors in total) */
void fvec_L2sqr_by_idx(
float* dis,
const float* x,
const float* y,
const int64_t* ids, /* ids of y vecs */
size_t d,
size_t nx,
size_t ny);
/** compute dis[j] = L2sqr(x[ix[j]], y[iy[j]]) forall j=0..n-1
*
* @param x size (max(ix) + 1, d)
* @param y size (max(iy) + 1, d)
* @param ix size n
* @param iy size n
* @param dis size n
*/
void pairwise_indexed_L2sqr(
size_t d,
size_t n,
const float* x,
const int64_t* ix,
const float* y,
const int64_t* iy,
float* dis);
/* same for inner product */
void pairwise_indexed_inner_product(
size_t d,
size_t n,
const float* x,
const int64_t* ix,
const float* y,
const int64_t* iy,
float* dis);
/***************************************************************************
* KNN functions
***************************************************************************/
// threshold on nx above which we switch to BLAS to compute distances
FAISS_API extern int distance_compute_blas_threshold;
// block sizes for BLAS distance computations
FAISS_API extern int distance_compute_blas_query_bs;
FAISS_API extern int distance_compute_blas_database_bs;
// above this number of results we switch to a reservoir to collect results
// rather than a heap
FAISS_API extern int distance_compute_min_k_reservoir;
/** Return the k nearest neighbors of each of the nx vectors x among the ny
 * vectors y, w.r.t. max inner product
*
* @param x query vectors, size nx * d
* @param y database vectors, size ny * d
* @param res result array, which also provides k. Sorted on output
*/
void knn_inner_product(
const float* x,
const float* y,
size_t d,
size_t nx,
size_t ny,
float_minheap_array_t* res);
/** Same as knn_inner_product, for the L2 distance
* @param y_norm2 norms for the y vectors (nullptr or size ny)
*/
void knn_L2sqr(
const float* x,
const float* y,
size_t d,
size_t nx,
size_t ny,
float_maxheap_array_t* res,
const float* y_norm2 = nullptr);
/* Find the nearest neighbors for nx queries in a set of ny vectors
* indexed by ids. May be useful for re-ranking a pre-selected vector list
*/
void knn_inner_products_by_idx(
const float* x,
const float* y,
const int64_t* ids,
size_t d,
size_t nx,
size_t ny,
float_minheap_array_t* res);
void knn_L2sqr_by_idx(
const float* x,
const float* y,
const int64_t* ids,
size_t d,
size_t nx,
size_t ny,
float_maxheap_array_t* res);
/***************************************************************************
* Range search
***************************************************************************/
/// Forward declaration, see AuxIndexStructures.h
struct RangeSearchResult;
/** Return all the vectors y within a given radius of each of the nx
 * query vectors x, w.r.t. the squared L2 distance
*
* @param x query vectors, size nx * d
* @param y database vectors, size ny * d
* @param radius search radius around the x vectors
* @param result result structure
*/
void range_search_L2sqr(
const float* x,
const float* y,
size_t d,
size_t nx,
size_t ny,
float radius,
RangeSearchResult* result);
/// same as range_search_L2sqr for the inner product similarity
void range_search_inner_product(
const float* x,
const float* y,
size_t d,
size_t nx,
size_t ny,
float radius,
RangeSearchResult* result);
/***************************************************************************
* PQ tables computations
***************************************************************************/
/// specialized function for PQ2
void compute_PQ_dis_tables_dsub2(
size_t d,
size_t ksub,
const float* centroids,
size_t nx,
const float* x,
bool is_inner_product,
float* dis_tables);
/***************************************************************************
* Templatized versions of distance functions
***************************************************************************/
} // namespace faiss

View File

@ -0,0 +1,117 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
/** In this file are the implementations of extra metrics beyond L2
* and inner product */
#include <faiss/utils/distances.h>
#include <type_traits>
namespace faiss {
/** Functor computing the distance between two d-dimensional float
 * vectors for a compile-time MetricType; the per-metric
 * implementations are the operator() specializations below. */
template <MetricType mt>
struct VectorDistance {
    size_t d;         // vector dimensionality
    float metric_arg; // extra metric parameter (used by METRIC_Lp as the exponent)
    inline float operator()(const float* x, const float* y) const;
    // heap template to use for this type of metric: inner product is a
    // similarity (keep largest -> CMin), the others are distances
    // (keep smallest -> CMax)
    using C = typename std::conditional<
            mt == METRIC_INNER_PRODUCT,
            CMin<float, int64_t>,
            CMax<float, int64_t>>::type;
};
/// squared L2 distance (no final square root), delegating to the
/// optimized fvec_L2sqr kernel
template <>
inline float VectorDistance<METRIC_L2>::operator()(
        const float* x,
        const float* y) const {
    return fvec_L2sqr(x, y, d);
}
/// inner product -- a similarity, not a distance: larger means closer
template <>
inline float VectorDistance<METRIC_INNER_PRODUCT>::operator()(
        const float* x,
        const float* y) const {
    return fvec_inner_product(x, y, d);
}
/// L1 (Manhattan) distance, delegating to the optimized fvec_L1 kernel
template <>
inline float VectorDistance<METRIC_L1>::operator()(
        const float* x,
        const float* y) const {
    return fvec_L1(x, y, d);
}
/// L-infinity (Chebyshev) distance, delegating to the optimized
/// fvec_Linf kernel; the commented-out block is the scalar reference
/// implementation
template <>
inline float VectorDistance<METRIC_Linf>::operator()(
        const float* x,
        const float* y) const {
    return fvec_Linf(x, y, d);
    /*
    float vmax = 0;
    for (size_t i = 0; i < d; i++) {
        float diff = fabs (x[i] - y[i]);
        if (diff > vmax) vmax = diff;
    }
    return vmax;*/
}
/// Lp distance raised to the power p: sum_i |x_i - y_i|^metric_arg.
/// The final 1/p root is not applied (monotone, so rankings are
/// unchanged).
template <>
inline float VectorDistance<METRIC_Lp>::operator()(
        const float* x,
        const float* y) const {
    float accu = 0;
    for (size_t i = 0; i < d; i++) {
        float diff = fabs(x[i] - y[i]);
        accu += powf(diff, metric_arg);
    }
    return accu;
}
/// Canberra distance: sum_i |x_i - y_i| / (|x_i| + |y_i|), with the
/// convention that a term is 0 when both coordinates are 0.
template <>
inline float VectorDistance<METRIC_Canberra>::operator()(
        const float* x,
        const float* y) const {
    float accu = 0;
    for (size_t i = 0; i < d; i++) {
        float xi = x[i], yi = y[i];
        float denom = fabs(xi) + fabs(yi);
        // guard the denominator: the unguarded division produced
        // 0/0 = NaN whenever xi == yi == 0
        if (denom > 0) {
            accu += fabs(xi - yi) / denom;
        }
    }
    return accu;
}
/// Bray-Curtis dissimilarity: sum_i |x_i - y_i| / sum_i |x_i + y_i|.
/// NOTE(review): evaluates to 0/0 = NaN when both vectors are
/// all-zero -- confirm callers never pass that case.
template <>
inline float VectorDistance<METRIC_BrayCurtis>::operator()(
        const float* x,
        const float* y) const {
    float accu_num = 0, accu_den = 0;
    for (size_t i = 0; i < d; i++) {
        float xi = x[i], yi = y[i];
        accu_num += fabs(xi - yi);
        accu_den += fabs(xi + yi);
    }
    return accu_num / accu_den;
}
/// Jensen-Shannon divergence: 0.5 * (KL(x||m) + KL(y||m)) with
/// m = (x + y) / 2, using the convention 0 * log(0) = 0.
template <>
inline float VectorDistance<METRIC_JensenShannon>::operator()(
        const float* x,
        const float* y) const {
    float accu = 0;
    for (size_t i = 0; i < d; i++) {
        float xi = x[i], yi = y[i];
        float mi = 0.5 * (xi + yi);
        // skip the KL term for a zero coordinate: the unguarded
        // expression -0 * log(mi / 0) evaluates to NaN, while the
        // conventional contribution of a zero coordinate is 0
        if (xi > 0) {
            accu += -xi * log(mi / xi);
        }
        if (yi > 0) {
            accu += -yi * log(mi / yi);
        }
    }
    return 0.5 * accu;
}
} // namespace faiss

View File

@ -0,0 +1,55 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
/** In this file are the implementations of extra metrics beyond L2
* and inner product */
#include <stdint.h>
#include <faiss/Index.h>
#include <faiss/utils/Heap.h>
namespace faiss {
void pairwise_extra_distances(
int64_t d,
int64_t nq,
const float* xq,
int64_t nb,
const float* xb,
MetricType mt,
float metric_arg,
float* dis,
int64_t ldq = -1,
int64_t ldb = -1,
int64_t ldd = -1);
void knn_extra_metrics(
const float* x,
const float* y,
size_t d,
size_t nx,
size_t ny,
MetricType mt,
float metric_arg,
float_maxheap_array_t* res);
/** get a DistanceComputer that refers to this type of distance and
* indexes a flat array of size nb */
DistanceComputer* get_extra_distance_computer(
size_t d,
MetricType mt,
float metric_arg,
size_t nb,
const float* xb);
} // namespace faiss
#include <faiss/utils/extra_distances-inl.h>

View File

@ -0,0 +1,521 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
namespace faiss {
extern const uint8_t hamdis_tab_ham_bytes[256];
/// Wrap a pre-allocated buffer of code_size bytes; the buffer is
/// zeroed so write() can OR bits in, and the cursor starts at bit 0.
inline BitstringWriter::BitstringWriter(uint8_t* code, size_t code_size)
        : code(code), code_size(code_size), i(0) {
    memset(code, 0, code_size);
}
/// Append the nbit low-order bits of x to the bitstring.
inline void BitstringWriter::write(uint64_t x, int nbit) {
    assert(code_size * 8 >= nbit + i);
    // nb of available bits in i / 8
    int na = 8 - (i & 7);
    if (nbit <= na) {
        // x fits entirely within the current byte
        code[i >> 3] |= x << (i & 7);
        i += nbit;
        return;
    } else {
        // fill the remainder of the current byte...
        size_t j = i >> 3;
        code[j++] |= x << (i & 7);
        i += nbit;
        x >>= na;
        // ...then spill the remaining bits byte by byte; the buffer
        // was zeroed by the constructor, so OR-ing is sufficient
        while (x != 0) {
            code[j++] |= x;
            x >>= 8;
        }
    }
}
/// Wrap a read-only buffer of code_size bytes; reading starts at bit 0.
inline BitstringReader::BitstringReader(const uint8_t* code, size_t code_size)
        : code(code), code_size(code_size), i(0) {}
/// Read the next nbit bits from the bitstring and advance the cursor.
inline uint64_t BitstringReader::read(int nbit) {
    assert(code_size * 8 >= nbit + i);
    // nb of available bits in i / 8
    int na = 8 - (i & 7);
    // get available bits in current byte
    uint64_t res = code[i >> 3] >> (i & 7);
    if (nbit <= na) {
        // the current byte holds all requested bits: mask and return
        res &= (1 << nbit) - 1;
        i += nbit;
        return res;
    } else {
        int ofs = na;
        size_t j = (i >> 3) + 1;
        i += nbit;
        nbit -= na;
        // consume whole bytes while more than one byte remains
        while (nbit > 8) {
            res |= ((uint64_t)code[j++]) << ofs;
            ofs += 8;
            nbit -= 8; // TODO remove nbit
        }
        // mask the final (partial) byte down to the remaining bits
        uint64_t last_byte = code[j];
        last_byte &= (1 << nbit) - 1;
        res |= last_byte << ofs;
        return res;
    }
}
/******************************************************************
* The HammingComputer series of classes compares a single code of
* size 4 to 32 to incoming codes. They are intended for use as a
* template class where it would be inefficient to switch on the code
* size in the inner loop. Hopefully the compiler will inline the
* hamming() functions and put the a0, a1, ... in registers.
******************************************************************/
/** Compare a stored 4-byte code against incoming 4-byte codes.
 * NOTE(review): codes are loaded via type-punned pointer casts;
 * assumes the platform/compiler tolerates unaligned, aliasing-unsafe
 * loads -- confirm for new targets. */
struct HammingComputer4 {
    uint32_t a0;
    HammingComputer4() {}
    HammingComputer4(const uint8_t* a, int code_size) {
        set(a, code_size);
    }
    void set(const uint8_t* a, int code_size) {
        assert(code_size == 4);
        a0 = *(uint32_t*)a;
    }
    /// Hamming distance (number of differing bits) to the stored code
    inline int hamming(const uint8_t* b) const {
        return popcount64(*(uint32_t*)b ^ a0);
    }
};
/// Compare a stored 8-byte code against incoming 8-byte codes.
struct HammingComputer8 {
    uint64_t a0;
    HammingComputer8() {}
    HammingComputer8(const uint8_t* a, int code_size) {
        set(a, code_size);
    }
    void set(const uint8_t* a, int code_size) {
        assert(code_size == 8);
        a0 = *(uint64_t*)a;
    }
    /// Hamming distance (number of differing bits) to the stored code
    inline int hamming(const uint8_t* b) const {
        return popcount64(*(uint64_t*)b ^ a0);
    }
};
/// Compare a stored 16-byte code (two 64-bit lanes) against incoming
/// 16-byte codes.
struct HammingComputer16 {
    uint64_t a0, a1;
    HammingComputer16() {}
    HammingComputer16(const uint8_t* a8, int code_size) {
        set(a8, code_size);
    }
    void set(const uint8_t* a8, int code_size) {
        assert(code_size == 16);
        const uint64_t* a = (uint64_t*)a8;
        a0 = a[0];
        a1 = a[1];
    }
    /// Hamming distance (number of differing bits) to the stored code
    inline int hamming(const uint8_t* b8) const {
        const uint64_t* b = (uint64_t*)b8;
        return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1);
    }
};
// when applied to an array, 1/2 of the 64-bit accesses are unaligned.
// This incurs a penalty of ~10% wrt. fully aligned accesses.
/// Compare a stored 20-byte code (two 64-bit lanes + one 32-bit lane)
/// against incoming 20-byte codes.
struct HammingComputer20 {
    uint64_t a0, a1;
    uint32_t a2;
    HammingComputer20() {}
    HammingComputer20(const uint8_t* a8, int code_size) {
        set(a8, code_size);
    }
    void set(const uint8_t* a8, int code_size) {
        assert(code_size == 20);
        const uint64_t* a = (uint64_t*)a8;
        a0 = a[0];
        a1 = a[1];
        // load only the final 4 bytes: the previous `a2 = a[2]`
        // performed a full 64-bit read, i.e. 4 bytes past the end of
        // the 20-byte code
        a2 = *(uint32_t*)(a + 2);
    }
    /// Hamming distance (number of differing bits) to the stored code
    inline int hamming(const uint8_t* b8) const {
        const uint64_t* b = (uint64_t*)b8;
        return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) +
                popcount64(*(uint32_t*)(b + 2) ^ a2);
    }
};
/// Compare a stored 32-byte code (four 64-bit lanes) against incoming
/// 32-byte codes.
struct HammingComputer32 {
    uint64_t a0, a1, a2, a3;
    HammingComputer32() {}
    HammingComputer32(const uint8_t* a8, int code_size) {
        set(a8, code_size);
    }
    void set(const uint8_t* a8, int code_size) {
        assert(code_size == 32);
        const uint64_t* a = (uint64_t*)a8;
        a0 = a[0];
        a1 = a[1];
        a2 = a[2];
        a3 = a[3];
    }
    /// Hamming distance (number of differing bits) to the stored code
    inline int hamming(const uint8_t* b8) const {
        const uint64_t* b = (uint64_t*)b8;
        return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) +
                popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3);
    }
};
/// Compare a stored 64-byte code (eight 64-bit lanes) against incoming
/// 64-byte codes.
struct HammingComputer64 {
    uint64_t a0, a1, a2, a3, a4, a5, a6, a7;
    HammingComputer64() {}
    HammingComputer64(const uint8_t* a8, int code_size) {
        set(a8, code_size);
    }
    void set(const uint8_t* a8, int code_size) {
        assert(code_size == 64);
        const uint64_t* a = (uint64_t*)a8;
        a0 = a[0];
        a1 = a[1];
        a2 = a[2];
        a3 = a[3];
        a4 = a[4];
        a5 = a[5];
        a6 = a[6];
        a7 = a[7];
    }
    /// Hamming distance (number of differing bits) to the stored code
    inline int hamming(const uint8_t* b8) const {
        const uint64_t* b = (uint64_t*)b8;
        return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) +
                popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3) +
                popcount64(b[4] ^ a4) + popcount64(b[5] ^ a5) +
                popcount64(b[6] ^ a6) + popcount64(b[7] ^ a7);
    }
};
/// Compare a stored code of arbitrary byte length against incoming
/// codes of the same length. The query code is kept by pointer, so it
/// must outlive the computer.
struct HammingComputerDefault {
    const uint8_t* a8;
    int quotient8;  // number of whole 8-byte words in the code
    int remainder8; // number of trailing bytes (< 8)
    HammingComputerDefault() {}
    HammingComputerDefault(const uint8_t* a8, int code_size) {
        set(a8, code_size);
    }
    void set(const uint8_t* a8, int code_size) {
        this->a8 = a8;
        quotient8 = code_size / 8;
        remainder8 = code_size % 8;
    }
    int hamming(const uint8_t* b8) const {
        int accu = 0;

        const uint64_t* a64 = reinterpret_cast<const uint64_t*>(a8);
        const uint64_t* b64 = reinterpret_cast<const uint64_t*>(b8);
        int i = 0, len = quotient8;
        // Duff's-device style 8x unrolled popcount loop over whole
        // 64-bit words: the switch jumps into the middle of the
        // unrolled body to handle len % 8, then each `while` pass
        // consumes 8 words (the case labels fall through on purpose)
        switch (len & 7) {
            default:
                while (len > 7) {
                    len -= 8;
                    accu += popcount64(a64[i] ^ b64[i]);
                    i++;
                    case 7:
                        accu += popcount64(a64[i] ^ b64[i]);
                        i++;
                    case 6:
                        accu += popcount64(a64[i] ^ b64[i]);
                        i++;
                    case 5:
                        accu += popcount64(a64[i] ^ b64[i]);
                        i++;
                    case 4:
                        accu += popcount64(a64[i] ^ b64[i]);
                        i++;
                    case 3:
                        accu += popcount64(a64[i] ^ b64[i]);
                        i++;
                    case 2:
                        accu += popcount64(a64[i] ^ b64[i]);
                        i++;
                    case 1:
                        accu += popcount64(a64[i] ^ b64[i]);
                        i++;
                }
        }
        if (remainder8) {
            // trailing bytes: per-byte popcount via a 256-entry lookup
            // table; cases intentionally fall through
            const uint8_t* a = a8 + 8 * quotient8;
            const uint8_t* b = b8 + 8 * quotient8;
            switch (remainder8) {
                case 7:
                    accu += hamdis_tab_ham_bytes[a[6] ^ b[6]];
                case 6:
                    accu += hamdis_tab_ham_bytes[a[5] ^ b[5]];
                case 5:
                    accu += hamdis_tab_ham_bytes[a[4] ^ b[4]];
                case 4:
                    accu += hamdis_tab_ham_bytes[a[3] ^ b[3]];
                case 3:
                    accu += hamdis_tab_ham_bytes[a[2] ^ b[2]];
                case 2:
                    accu += hamdis_tab_ham_bytes[a[1] ^ b[1]];
                case 1:
                    accu += hamdis_tab_ham_bytes[a[0] ^ b[0]];
                default:
                    break;
            }
        }

        return accu;
    }
};
// more inefficient than HammingComputerDefault (obsolete)
/// Hamming computer for codes whose size is a multiple of 8 bytes;
/// simple word loop (marked obsolete in favor of
/// HammingComputerDefault).
struct HammingComputerM8 {
    const uint64_t* a;
    int n; // number of 64-bit words in the code
    HammingComputerM8() {}
    HammingComputerM8(const uint8_t* a8, int code_size) {
        set(a8, code_size);
    }
    void set(const uint8_t* a8, int code_size) {
        assert(code_size % 8 == 0);
        a = (uint64_t*)a8;
        n = code_size / 8;
    }
    /// Hamming distance (number of differing bits) to the stored code
    int hamming(const uint8_t* b8) const {
        const uint64_t* b = (uint64_t*)b8;
        int accu = 0;
        for (int i = 0; i < n; i++)
            accu += popcount64(a[i] ^ b[i]);
        return accu;
    }
};
// more inefficient than HammingComputerDefault (obsolete)
/// Hamming computer for codes whose size is a multiple of 4 bytes;
/// simple word loop (marked obsolete in favor of
/// HammingComputerDefault).
struct HammingComputerM4 {
    const uint32_t* a;
    int n; // number of 32-bit words in the code
    HammingComputerM4() {}
    HammingComputerM4(const uint8_t* a4, int code_size) {
        set(a4, code_size);
    }
    void set(const uint8_t* a4, int code_size) {
        assert(code_size % 4 == 0);
        a = (uint32_t*)a4;
        n = code_size / 4;
    }
    /// Hamming distance (number of differing bits) to the stored code
    int hamming(const uint8_t* b8) const {
        const uint32_t* b = (uint32_t*)b8;
        int accu = 0;
        for (int i = 0; i < n; i++)
            accu += popcount64(a[i] ^ b[i]);
        return accu;
    }
};
/***************************************************************************
* Equivalence with a template class when code size is known at compile time
**************************************************************************/
// default template
template <int CODE_SIZE>
struct HammingComputer : HammingComputerDefault {
HammingComputer(const uint8_t* a, int code_size)
: HammingComputerDefault(a, code_size) {}
};
#define SPECIALIZED_HC(CODE_SIZE) \
template <> \
struct HammingComputer<CODE_SIZE> : HammingComputer##CODE_SIZE { \
HammingComputer(const uint8_t* a) \
: HammingComputer##CODE_SIZE(a, CODE_SIZE) {} \
}
SPECIALIZED_HC(4);
SPECIALIZED_HC(8);
SPECIALIZED_HC(16);
SPECIALIZED_HC(20);
SPECIALIZED_HC(32);
SPECIALIZED_HC(64);
#undef SPECIALIZED_HC
/***************************************************************************
* generalized Hamming = number of bytes that are different between
* two codes.
***************************************************************************/
/// Count the number of non-zero bytes in a: the shifts OR-fold every
/// bit of each byte into that byte's least significant bit, the mask
/// keeps only those LSBs, and the popcount sums them.
inline int generalized_hamming_64(uint64_t a) {
    a |= a >> 1;
    a |= a >> 2;
    a |= a >> 4;
    a &= 0x0101010101010101UL;
    return popcount64(a);
}
/// Generalized Hamming (number of differing *bytes*) against a stored
/// 8-byte code.
struct GenHammingComputer8 {
    uint64_t a0;
    GenHammingComputer8(const uint8_t* a, int code_size) {
        assert(code_size == 8);
        a0 = *(uint64_t*)a;
    }
    inline int hamming(const uint8_t* b) const {
        return generalized_hamming_64(*(uint64_t*)b ^ a0);
    }
};
/// Generalized Hamming (number of differing *bytes*) against a stored
/// 16-byte code.
struct GenHammingComputer16 {
    uint64_t a0, a1;
    GenHammingComputer16(const uint8_t* a8, int code_size) {
        assert(code_size == 16);
        const uint64_t* a = (uint64_t*)a8;
        a0 = a[0];
        a1 = a[1];
    }
    inline int hamming(const uint8_t* b8) const {
        const uint64_t* b = (uint64_t*)b8;
        return generalized_hamming_64(b[0] ^ a0) +
                generalized_hamming_64(b[1] ^ a1);
    }
};
/// Generalized Hamming (number of differing *bytes*) against a stored
/// 32-byte code.
struct GenHammingComputer32 {
    uint64_t a0, a1, a2, a3;
    GenHammingComputer32(const uint8_t* a8, int code_size) {
        assert(code_size == 32);
        const uint64_t* a = (uint64_t*)a8;
        a0 = a[0];
        a1 = a[1];
        a2 = a[2];
        a3 = a[3];
    }
    inline int hamming(const uint8_t* b8) const {
        const uint64_t* b = (uint64_t*)b8;
        return generalized_hamming_64(b[0] ^ a0) +
                generalized_hamming_64(b[1] ^ a1) +
                generalized_hamming_64(b[2] ^ a2) +
                generalized_hamming_64(b[3] ^ a3);
    }
};
/// Generalized Hamming (number of differing *bytes*) for codes whose
/// size is a multiple of 8 bytes; the query code is kept by pointer
/// and must outlive the computer.
struct GenHammingComputerM8 {
    const uint64_t* a;
    int n; // number of 64-bit words in the code
    GenHammingComputerM8(const uint8_t* a8, int code_size) {
        assert(code_size % 8 == 0);
        a = (uint64_t*)a8;
        n = code_size / 8;
    }
    int hamming(const uint8_t* b8) const {
        const uint64_t* b = (uint64_t*)b8;
        int accu = 0;
        for (int i = 0; i < n; i++)
            accu += generalized_hamming_64(a[i] ^ b[i]);
        return accu;
    }
};
/** generalized Hamming distances (= count number of code bytes that
are the same) */
void generalized_hammings_knn_hc(
int_maxheap_array_t* ha,
const uint8_t* a,
const uint8_t* b,
size_t nb,
size_t code_size,
int ordered = true);
/** This class maintains a list of best distances seen so far.
*
* Since the distances are in a limited range (0 to nbit), the
* object maintains one list per possible distance, and fills
* in only the n-first lists, such that the sum of sizes of the
* n lists is below k.
*/
template <class HammingComputer>
struct HCounterState {
    int* counters;        // per-distance counts; assumed size d+1 -- TODO confirm
    int64_t* ids_per_dis; // k slots per distance; assumed size (d+1)*k -- TODO confirm
    HammingComputer hc;   // computer holding the query code
    int thres;            // current pruning threshold on the distance
    int count_lt;         // nb of kept ids with distance < thres
    int count_eq;         // nb of kept ids with distance == thres
    int k;                // number of results wanted

    // d is the code length in *bits*; thres starts above the maximum
    // possible distance so every candidate is accepted initially
    HCounterState(
            int* counters,
            int64_t* ids_per_dis,
            const uint8_t* x,
            int d,
            int k)
            : counters(counters),
              ids_per_dis(ids_per_dis),
              hc(x, d / 8),
              thres(d + 1),
              count_lt(0),
              count_eq(0),
              k(k) {}

    /// Consider candidate y with id j: store it in the bucket of its
    /// distance if it can still be among the k best, and tighten thres
    /// whenever the strictly-below-threshold buckets alone already
    /// hold k elements.
    void update_counter(const uint8_t* y, size_t j) {
        int32_t dis = hc.hamming(y);

        if (dis <= thres) {
            if (dis < thres) {
                ids_per_dis[dis * k + counters[dis]++] = j;
                ++count_lt;
                while (count_lt == k && thres > 0) {
                    --thres;
                    count_eq = counters[thres];
                    count_lt -= count_eq;
                }
            } else if (count_eq < k) {
                // bucket at exactly thres is capped at k entries
                ids_per_dis[dis * k + count_eq++] = j;
                counters[dis] = count_eq;
            }
        }
    }
};
} // namespace faiss

View File

@ -0,0 +1,218 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
/*
* Hamming distances. The binary vector dimensionality should be a
* multiple of 8, as the elementary operations operate on bytes. If
* you need other sizes, just pad with 0s (this is done by function
* fvecs2bitvecs).
*
* User-defined type hamdis_t is used for distances because at this time
 * it is still unclear how we will need to balance
* - flexibility in vector size (may need 16- or even 8-bit vectors)
* - memory usage
* - cache-misses when dealing with large volumes of data (fewer bits is better)
*
*/
#ifndef FAISS_hamming_h
#define FAISS_hamming_h
#include <stdint.h>
#include <faiss/impl/platform_macros.h>
#include <faiss/utils/Heap.h>
/* The Hamming distance type */
typedef int32_t hamdis_t;
namespace faiss {
/**************************************************
* General bit vector functions
**************************************************/
struct RangeSearchResult;
void bitvec_print(const uint8_t* b, size_t d);
/* Functions for casting vectors of regular types to compact bits.
They assume proper allocation done beforehand, meaning that b
should be be able to receive as many bits as x may produce. */
/* Makes an array of bits from the signs of a float array. The length
of the output array b is rounded up to byte size (allocate
accordingly) */
void fvecs2bitvecs(const float* x, uint8_t* b, size_t d, size_t n);
void bitvecs2fvecs(const uint8_t* b, float* x, size_t d, size_t n);
void fvec2bitvec(const float* x, uint8_t* b, size_t d);
/** Shuffle the bits from b(i, j) := a(i, order[j])
*/
void bitvec_shuffle(
size_t n,
size_t da,
size_t db,
const int* order,
const uint8_t* a,
uint8_t* b);
/***********************************************
* Generic reader/writer for bit strings
***********************************************/
struct BitstringWriter {
uint8_t* code;
size_t code_size;
size_t i; // current bit offset
// code_size in bytes
BitstringWriter(uint8_t* code, size_t code_size);
// write the nbit low bits of x
void write(uint64_t x, int nbit);
};
struct BitstringReader {
const uint8_t* code;
size_t code_size;
size_t i;
// code_size in bytes
BitstringReader(const uint8_t* code, size_t code_size);
// read nbit bits from the code
uint64_t read(int nbit);
};
/**************************************************
* Hamming distance computation functions
**************************************************/
FAISS_API extern size_t hamming_batch_size;
inline int popcount64(uint64_t x) {
return __builtin_popcountl(x);
}
/** Compute a set of Hamming distances between na and nb binary vectors
*
* @param a size na * nbytespercode
* @param b size nb * nbytespercode
* @param nbytespercode should be multiple of 8
* @param dis output distances, size na * nb
*/
void hammings(
const uint8_t* a,
const uint8_t* b,
size_t na,
size_t nb,
size_t nbytespercode,
hamdis_t* dis);
/** Return the k smallest Hamming distances for a set of binary query vectors,
* using a max heap.
* @param a queries, size ha->nh * ncodes
* @param b database, size nb * ncodes
* @param nb number of database vectors
* @param ncodes size of the binary codes (bytes)
* @param ordered if != 0: order the results by decreasing distance
* (may be bottleneck for k/n > 0.01) */
void hammings_knn_hc(
int_maxheap_array_t* ha,
const uint8_t* a,
const uint8_t* b,
size_t nb,
size_t ncodes,
int ordered);
/* Legacy alias to hammings_knn_hc. */
void hammings_knn(
int_maxheap_array_t* ha,
const uint8_t* a,
const uint8_t* b,
size_t nb,
size_t ncodes,
int ordered);
/** Return the k smallest Hamming distances for a set of binary query vectors,
* using counting max.
* @param a queries, size na * ncodes
* @param b database, size nb * ncodes
* @param na number of query vectors
* @param nb number of database vectors
* @param k number of vectors/distances to return
* @param ncodes size of the binary codes (bytes)
* @param distances output distances from each query vector to its k nearest
* neighbors
* @param labels output ids of the k nearest neighbors to each query vector
*/
void hammings_knn_mc(
const uint8_t* a,
const uint8_t* b,
size_t na,
size_t nb,
size_t k,
size_t ncodes,
int32_t* distances,
int64_t* labels);
/** same as hammings_knn except we are doing a range search with radius */
void hamming_range_search(
const uint8_t* a,
const uint8_t* b,
size_t na,
size_t nb,
int radius,
size_t ncodes,
RangeSearchResult* result);
/* Counting the number of matches or of cross-matches (without returning them)
For use with function that assume pre-allocated memory */
void hamming_count_thres(
const uint8_t* bs1,
const uint8_t* bs2,
size_t n1,
size_t n2,
hamdis_t ht,
size_t ncodes,
size_t* nptr);
/* Return all Hamming distances/index passing a thres. Pre-allocation of output
is required. Use hamming_count_thres to determine the proper size. */
size_t match_hamming_thres(
const uint8_t* bs1,
const uint8_t* bs2,
size_t n1,
size_t n2,
hamdis_t ht,
size_t ncodes,
int64_t* idx,
hamdis_t* dis);
/* Cross-matching in a set of vectors */
void crosshamming_count_thres(
const uint8_t* dbs,
size_t n,
hamdis_t ht,
size_t ncodes,
size_t* nptr);
/* compute the Hamming distances between two codewords of nwords*64 bits */
hamdis_t hamming(const uint64_t* bs1, const uint64_t* bs2, size_t nwords);
} // namespace faiss
// inlined definitions of HammingComputerXX and GenHammingComputerXX
#include <faiss/utils/hamming-inl.h>
#endif /* FAISS_hamming_h */

View File

@ -0,0 +1,96 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <climits>
#include <cmath>
#include <limits>
namespace faiss {
/*******************************************************************
* C object: uniform handling of min and max heap
*******************************************************************/
/** The C object gives the type T of the values of a key-value storage, the type
* of the keys, TI and the comparison that is done: CMax for a decreasing
* series and CMin for increasing series. In other words, for a given threshold
* threshold, an incoming value x is kept if
*
* C::cmp(threshold, x)
*
* is true.
*/
template <typename T_, typename TI_>
struct CMax;
template <typename T>
inline T cmin_nextafter(T x);
template <typename T>
inline T cmax_nextafter(T x);
// traits of minheaps = heaps where the minimum value is stored on top
// useful to find the *max* values of an array
template <typename T_, typename TI_>
struct CMin {
typedef T_ T;
typedef TI_ TI;
typedef CMax<T_, TI_> Crev; // reference to reverse comparison
inline static bool cmp(T a, T b) {
return a < b;
}
inline static T neutral() {
return std::numeric_limits<T>::lowest();
}
static const bool is_max = false;
inline static T nextafter(T x) {
return cmin_nextafter(x);
}
};
template <typename T_, typename TI_>
struct CMax {
typedef T_ T;
typedef TI_ TI;
typedef CMin<T_, TI_> Crev;
inline static bool cmp(T a, T b) {
return a > b;
}
inline static T neutral() {
return std::numeric_limits<T>::max();
}
static const bool is_max = true;
inline static T nextafter(T x) {
return cmax_nextafter(x);
}
};
template <>
inline float cmin_nextafter<float>(float x) {
return std::nextafterf(x, -HUGE_VALF);
}
template <>
inline float cmax_nextafter<float>(float x) {
return std::nextafterf(x, HUGE_VALF);
}
template <>
inline uint16_t cmin_nextafter<uint16_t>(uint16_t x) {
return x - 1;
}
template <>
inline uint16_t cmax_nextafter<uint16_t>(uint16_t x) {
return x + 1;
}
} // namespace faiss

View File

@ -0,0 +1,74 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <stdint.h>
#include <stdio.h>
#include <faiss/impl/platform_macros.h>
namespace faiss {
/** partitions the table into 0:q and q:n where all elements above q are >= all
 * elements below q (for C = CMax, for CMin comparisons are reversed)
 *
 * Returns the partition threshold. The elements q:n are destroyed on output.
 *
 * @param vals   values to partition, size n (modified in place)
 * @param ids    ids permuted alongside vals, size n
 * @param q_min  lower bound for the partition point
 * @param q_max  upper bound for the partition point
 * @param q_out  if non-null, receives the partition point actually chosen
 */
template <class C>
typename C::T partition_fuzzy(
        typename C::T* vals,
        typename C::TI* ids,
        size_t n,
        size_t q_min,
        size_t q_max,
        size_t* q_out);
/** simplified interface for when the partition is not fuzzy (exact point q) */
template <class C>
inline typename C::T partition(
        typename C::T* vals,
        typename C::TI* ids,
        size_t n,
        size_t q) {
    return partition_fuzzy<C>(vals, ids, n, q, q, nullptr);
}
/** low level SIMD histogramming functions */
/** 8-bin histogram of (x - min) >> shift
 * values outside the range are ignored.
 * the data table should be aligned on 32 bytes */
void simd_histogram_8(
        const uint16_t* data,
        int n,
        uint16_t min,
        int shift,
        int* hist);
/** same for 16-bin histogram */
void simd_histogram_16(
        const uint16_t* data,
        int n,
        uint16_t min,
        int shift,
        int* hist);
// cycle counters accumulated by the partitioning routines (for profiling)
struct PartitionStats {
    uint64_t bissect_cycles;
    uint64_t compress_cycles;
    PartitionStats() {
        reset();
    }
    // zeroes all counters; defined in the .cpp
    void reset();
};
// global var that collects them all
FAISS_API extern PartitionStats partition_stats;
} // namespace faiss

View File

@ -0,0 +1,82 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <cstdint>
#include <cstdio>
namespace faiss {
/** Functions to quantize PQ floating-point Look Up Tables (LUT) to uint8, and
 * biases to uint16. The accumulation is supposed to take place in uint16.
 * The quantization coefficients are float (a, b) such that
 *
 *      original_value = quantized_value * a / b
 *
 * The hardest part of the quantization is with multiple LUTs that need to be
 * added up together. In that case, coefficient a has to be chosen so that
 * the sum fits in a uint16 accumulator.
 */
namespace quantize_lut {
/* affine quantizer, a and b are the affine coefficients, marginalize over d
 *
 * @param tab   input/output, size (n, d)
 * @param a_out optional output: scale coefficient used
 * @param b_out optional output: offset coefficient used
 */
void round_uint8_per_column(
        float* tab,
        size_t n,
        size_t d,
        float* a_out = nullptr,
        float* b_out = nullptr);
/* affine quantizer, a and b are the affine coefficients
 *
 * @param tab input/output, size (m, n, d)
 */
void round_uint8_per_column_multi(
        float* tab,
        size_t m,
        size_t n,
        size_t d,
        float* a_out = nullptr,
        float* b_out = nullptr);
/** LUT quantization to uint8 and bias to uint16.
 *
 * (nprobe, M, ksub, lut_is_3d) determine the size of the LUT
 *
 * LUT input:
 * - 2D size (M, ksub): single matrix per probe (lut_is_3d=false)
 * - 3D size (nprobe, M, ksub): separate LUT per probe (lut_is_3d=true)
 * bias input:
 * - nullptr: bias is 0
 * - size (nprobe): one bias per probe
 * Output:
 * - LUTq uint8 version of the LUT (M size is rounded up to M2)
 * - biasq (or nullptr): uint16 version of the LUT
 * - a, b: scalars to approximate the true distance
 */
void quantize_LUT_and_bias(
        size_t nprobe,
        size_t M,
        size_t ksub,
        bool lut_is_3d,
        const float* LUT,
        const float* bias,
        uint8_t* LUTq,
        size_t M2,
        uint16_t* biasq,
        float* a_out = nullptr,
        float* b_out = nullptr);
} // namespace quantize_lut
} // namespace faiss

View File

@ -0,0 +1,57 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
/* Random generators. Implemented here for speed and to make
* sequences reproducible.
*/
#pragma once
#include <stdint.h>
#include <random>
namespace faiss {
/**************************************************
 * Random data generation functions
 **************************************************/
/// random generator that can be used in multithreaded contexts
/// (each instance owns its own Mersenne Twister state)
struct RandomGenerator {
    std::mt19937 mt;
    /// random positive integer
    int rand_int();
    /// random int64_t
    int64_t rand_int64();
    /// generate random integer between 0 and max-1
    int rand_int(int max);
    /// between 0 and 1
    float rand_float();
    /// double-precision variant of rand_float
    double rand_double();
    explicit RandomGenerator(int64_t seed = 1234);
};
/* Generate an array of uniform random floats / multi-threaded implementation */
void float_rand(float* x, size_t n, int64_t seed);
// NOTE(review): name suggests normally-distributed samples -- confirm in .cpp
void float_randn(float* x, size_t n, int64_t seed);
void int64_rand(int64_t* x, size_t n, int64_t seed);
void byte_rand(uint8_t* x, size_t n, int64_t seed);
// max is actually the maximum value + 1
void int64_rand_max(int64_t* x, size_t n, uint64_t max, int64_t seed);
/* random permutation */
void rand_perm(int* perm, size_t n, int64_t seed);
} // namespace faiss

View File

@ -0,0 +1,33 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
/** Abstractions for 256-bit registers
*
* The objective is to separate the different interpretations of the same
* registers (as a vector of uint8, uint16 or uint32), to provide printing
* functions.
*/
#ifdef __AVX2__
#include <faiss/utils/simdlib_avx2.h>
#elif defined(__aarch64__)
#include <faiss/utils/simdlib_neon.h>
#else
// emulated = all operations are implemented as scalars
#include <faiss/utils/simdlib_emulated.h>
// FIXME: make a SSE version
// is this ever going to happen? We will probably rather implement AVX512
#endif

View File

@ -0,0 +1,464 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <cstdint>
#include <string>
#include <immintrin.h>
#include <faiss/impl/platform_macros.h>
namespace faiss {
/** Simple wrapper around the AVX 256-bit registers
*
* The objective is to separate the different interpretations of the same
* registers (as a vector of uint8, uint16 or uint32), to provide printing
* functions, and to give more readable names to the AVX intrinsics. It does not
 * pretend to be exhaustive; functions are added as needed.
*/
/// 256-bit representation without interpretation as a vector
struct simd256bit {
    // both members alias the same register: i for integer ops, f for float ops
    union {
        __m256i i;
        __m256 f;
    };
    simd256bit() {}
    explicit simd256bit(__m256i i) : i(i) {}
    explicit simd256bit(__m256 f) : f(f) {}
    // aligned load: x must be 32-byte aligned (_mm256_load_si256)
    explicit simd256bit(const void* x)
            : i(_mm256_load_si256((__m256i const*)x)) {}
    void clear() {
        i = _mm256_setzero_si256();
    }
    // unaligned store of the 32 bytes
    void storeu(void* ptr) const {
        _mm256_storeu_si256((__m256i*)ptr, i);
    }
    // unaligned load of 32 bytes
    void loadu(const void* ptr) {
        i = _mm256_loadu_si256((__m256i*)ptr);
    }
    // aligned store: ptr must be 32-byte aligned
    void store(void* ptr) const {
        _mm256_store_si256((__m256i*)ptr, i);
    }
    // write the 256 bits as '0'/'1' chars, LSB of byte 0 first; bits[256] = NUL
    void bin(char bits[257]) const {
        char bytes[32];
        storeu((void*)bytes);
        for (int i = 0; i < 256; i++) {
            bits[i] = '0' + ((bytes[i / 8] >> (i % 8)) & 1);
        }
        bits[256] = 0;
    }
    std::string bin() const {
        char bits[257];
        bin(bits);
        return std::string(bits);
    }
};
/// vector of 16 elements in uint16
struct simd16uint16 : simd256bit {
    simd16uint16() {}
    explicit simd16uint16(__m256i i) : simd256bit(i) {}
    // broadcast x to all 16 lanes
    explicit simd16uint16(int x) : simd256bit(_mm256_set1_epi16(x)) {}
    explicit simd16uint16(uint16_t x) : simd256bit(_mm256_set1_epi16(x)) {}
    explicit simd16uint16(simd256bit x) : simd256bit(x) {}
    // aligned load of 16 uint16 values
    explicit simd16uint16(const uint16_t* x) : simd256bit((const void*)x) {}
    // format each lane with fmt (which must end with ','); used by hex()/dec()
    std::string elements_to_string(const char* fmt) const {
        uint16_t bytes[16];
        storeu((void*)bytes);
        char res[1000];
        char* ptr = res;
        for (int i = 0; i < 16; i++) {
            ptr += sprintf(ptr, fmt, bytes[i]);
        }
        // strip last ,
        ptr[-1] = 0;
        return std::string(res);
    }
    std::string hex() const {
        return elements_to_string("%02x,");
    }
    std::string dec() const {
        return elements_to_string("%3d,");
    }
    void set1(uint16_t x) {
        i = _mm256_set1_epi16((short)x);
    }
    // shift must be known at compile time
    simd16uint16 operator>>(const int shift) const {
        return simd16uint16(_mm256_srli_epi16(i, shift));
    }
    // shift must be known at compile time
    simd16uint16 operator<<(const int shift) const {
        return simd16uint16(_mm256_slli_epi16(i, shift));
    }
    simd16uint16 operator+=(simd16uint16 other) {
        i = _mm256_add_epi16(i, other.i);
        return *this;
    }
    simd16uint16 operator-=(simd16uint16 other) {
        i = _mm256_sub_epi16(i, other.i);
        return *this;
    }
    simd16uint16 operator+(simd16uint16 other) const {
        return simd16uint16(_mm256_add_epi16(i, other.i));
    }
    simd16uint16 operator-(simd16uint16 other) const {
        return simd16uint16(_mm256_sub_epi16(i, other.i));
    }
    simd16uint16 operator&(simd256bit other) const {
        return simd16uint16(_mm256_and_si256(i, other.i));
    }
    simd16uint16 operator|(simd256bit other) const {
        return simd16uint16(_mm256_or_si256(i, other.i));
    }
    // returns binary masks (all-ones lane where equal, zero lane otherwise)
    simd16uint16 operator==(simd256bit other) const {
        return simd16uint16(_mm256_cmpeq_epi16(i, other.i));
    }
    simd16uint16 operator~() const {
        return simd16uint16(_mm256_xor_si256(i, _mm256_set1_epi32(-1)));
    }
    // get scalar at index 0
    uint16_t get_scalar_0() const {
        return _mm256_extract_epi16(i, 0);
    }
    // mask of elements where this >= thresh
    // 2 bit per component: 16 * 2 = 32 bit
    // (unsigned >= emulated as max_epu16 + cmpeq; movemask_epi8 then yields
    // two identical mask bits per 16-bit lane)
    uint32_t ge_mask(simd16uint16 thresh) const {
        __m256i j = thresh.i;
        __m256i max = _mm256_max_epu16(i, j);
        __m256i ge = _mm256_cmpeq_epi16(i, max);
        return _mm256_movemask_epi8(ge);
    }
    uint32_t le_mask(simd16uint16 thresh) const {
        return thresh.ge_mask(*this);
    }
    uint32_t gt_mask(simd16uint16 thresh) const {
        return ~le_mask(thresh);
    }
    bool all_gt(simd16uint16 thresh) const {
        return le_mask(thresh) == 0;
    }
    // for debugging only
    uint16_t operator[](int i) const {
        ALIGNED(32) uint16_t tab[16];
        store(tab);
        return tab[i];
    }
    // elementwise min accumulation: this = min(this, incoming)
    void accu_min(simd16uint16 incoming) {
        i = _mm256_min_epu16(i, incoming.i);
    }
    // elementwise max accumulation: this = max(this, incoming)
    void accu_max(simd16uint16 incoming) {
        i = _mm256_max_epu16(i, incoming.i);
    }
};
// not really a std::min because it returns an elementwise min
inline simd16uint16 min(simd16uint16 a, simd16uint16 b) {
    return simd16uint16(_mm256_min_epu16(a.i, b.i));
}
inline simd16uint16 max(simd16uint16 a, simd16uint16 b) {
    return simd16uint16(_mm256_max_epu16(a.i, b.i));
}
// decompose in 128-lanes: a = (a0, a1), b = (b0, b1)
// return (a0 + a1, b0 + b1)
// TODO find a better name
inline simd16uint16 combine2x2(simd16uint16 a, simd16uint16 b) {
    // a1b0 = (a1, b0); a0b1 = (a0, b1); their sum is (a0+a1, b0+b1)
    __m256i a1b0 = _mm256_permute2f128_si256(a.i, b.i, 0x21);
    __m256i a0b1 = _mm256_blend_epi32(a.i, b.i, 0xF0);
    return simd16uint16(a1b0) + simd16uint16(a0b1);
}
// compare d0 and d1 to thr, return 32 bits corresponding to the concatenation
// of d0 and d1 with thr
// NOTE(review): the pack + permute arranges the bits to match the scalar
// emulation (bit j <- d0[j] >= thr[j], bit 16+j <- d1[j] >= thr[j]) -- verify.
inline uint32_t cmp_ge32(simd16uint16 d0, simd16uint16 d1, simd16uint16 thr) {
    __m256i max0 = _mm256_max_epu16(d0.i, thr.i);
    __m256i ge0 = _mm256_cmpeq_epi16(d0.i, max0);
    __m256i max1 = _mm256_max_epu16(d1.i, thr.i);
    __m256i ge1 = _mm256_cmpeq_epi16(d1.i, max1);
    __m256i ge01 = _mm256_packs_epi16(ge0, ge1);
    // easier than manipulating bit fields afterwards
    ge01 = _mm256_permute4x64_epi64(ge01, 0 | (2 << 2) | (1 << 4) | (3 << 6));
    uint32_t ge = _mm256_movemask_epi8(ge01);
    return ge;
}
// same as cmp_ge32 but with <= (min_epu16 + cmpeq emulates unsigned <=)
inline uint32_t cmp_le32(simd16uint16 d0, simd16uint16 d1, simd16uint16 thr) {
    __m256i max0 = _mm256_min_epu16(d0.i, thr.i);
    __m256i ge0 = _mm256_cmpeq_epi16(d0.i, max0);
    __m256i max1 = _mm256_min_epu16(d1.i, thr.i);
    __m256i ge1 = _mm256_cmpeq_epi16(d1.i, max1);
    __m256i ge01 = _mm256_packs_epi16(ge0, ge1);
    // easier than manipulating bit fields afterwards
    ge01 = _mm256_permute4x64_epi64(ge01, 0 | (2 << 2) | (1 << 4) | (3 << 6));
    uint32_t ge = _mm256_movemask_epi8(ge01);
    return ge;
}
// vector of 32 unsigned 8-bit integers
struct simd32uint8 : simd256bit {
    simd32uint8() {}
    explicit simd32uint8(__m256i i) : simd256bit(i) {}
    // broadcast x to all 32 lanes
    explicit simd32uint8(int x) : simd256bit(_mm256_set1_epi8(x)) {}
    explicit simd32uint8(uint8_t x) : simd256bit(_mm256_set1_epi8(x)) {}
    explicit simd32uint8(simd256bit x) : simd256bit(x) {}
    // aligned load of 32 uint8 values
    explicit simd32uint8(const uint8_t* x) : simd256bit((const void*)x) {}
    // format each lane with fmt (which must end with ','); used by hex()/dec()
    std::string elements_to_string(const char* fmt) const {
        uint8_t bytes[32];
        storeu((void*)bytes);
        char res[1000];
        char* ptr = res;
        for (int i = 0; i < 32; i++) {
            ptr += sprintf(ptr, fmt, bytes[i]);
        }
        // strip last ,
        ptr[-1] = 0;
        return std::string(res);
    }
    std::string hex() const {
        return elements_to_string("%02x,");
    }
    std::string dec() const {
        return elements_to_string("%3d,");
    }
    void set1(uint8_t x) {
        i = _mm256_set1_epi8((char)x);
    }
    simd32uint8 operator&(simd256bit other) const {
        return simd32uint8(_mm256_and_si256(i, other.i));
    }
    simd32uint8 operator+(simd32uint8 other) const {
        return simd32uint8(_mm256_add_epi8(i, other.i));
    }
    // per-128-bit-lane 16-entry table lookup (pshufb semantics: an index byte
    // with its MSB set selects 0)
    simd32uint8 lookup_2_lanes(simd32uint8 idx) const {
        return simd32uint8(_mm256_shuffle_epi8(i, idx.i));
    }
    // extract + 0-extend lane
    // this operation is slow (3 cycles)
    simd16uint16 lane0_as_uint16() const {
        __m128i x = _mm256_extracti128_si256(i, 0);
        return simd16uint16(_mm256_cvtepu8_epi16(x));
    }
    simd16uint16 lane1_as_uint16() const {
        __m128i x = _mm256_extracti128_si256(i, 1);
        return simd16uint16(_mm256_cvtepu8_epi16(x));
    }
    simd32uint8 operator+=(simd32uint8 other) {
        i = _mm256_add_epi8(i, other.i);
        return *this;
    }
    // for debugging only
    uint8_t operator[](int i) const {
        ALIGNED(32) uint8_t tab[32];
        store(tab);
        return tab[i];
    }
};
// convert with saturation
// careful: this does not cross lanes, so the order is weird
// (_mm256_packs_epi16 packs within each 128-bit lane, with signed saturation)
inline simd32uint8 uint16_to_uint8_saturate(simd16uint16 a, simd16uint16 b) {
    return simd32uint8(_mm256_packs_epi16(a.i, b.i));
}
/// get most significant bit of each byte
inline uint32_t get_MSBs(simd32uint8 a) {
    return _mm256_movemask_epi8(a.i);
}
/// use MSB of each byte of mask to select a byte between a and b
inline simd32uint8 blendv(simd32uint8 a, simd32uint8 b, simd32uint8 mask) {
    return simd32uint8(_mm256_blendv_epi8(a.i, b.i, mask.i));
}
/// vector of 8 unsigned 32-bit integers
struct simd8uint32 : simd256bit {
    simd8uint32() {}
    explicit simd8uint32(__m256i i) : simd256bit(i) {}
    // broadcast x to all 8 lanes
    explicit simd8uint32(uint32_t x) : simd256bit(_mm256_set1_epi32(x)) {}
    explicit simd8uint32(simd256bit x) : simd256bit(x) {}
    // aligned load of 32 bytes reinterpreted as 8 uint32
    explicit simd8uint32(const uint8_t* x) : simd256bit((const void*)x) {}
    // format each lane with fmt (which must end with ','); used by hex()/dec()
    std::string elements_to_string(const char* fmt) const {
        uint32_t bytes[8];
        storeu((void*)bytes);
        char res[1000];
        char* ptr = res;
        for (int i = 0; i < 8; i++) {
            ptr += sprintf(ptr, fmt, bytes[i]);
        }
        // strip last ,
        ptr[-1] = 0;
        return std::string(res);
    }
    std::string hex() const {
        return elements_to_string("%08x,");
    }
    std::string dec() const {
        return elements_to_string("%10d,");
    }
    void set1(uint32_t x) {
        i = _mm256_set1_epi32((int)x);
    }
};
/// vector of 8 single-precision floats
struct simd8float32 : simd256bit {
    simd8float32() {}
    explicit simd8float32(simd256bit x) : simd256bit(x) {}
    explicit simd8float32(__m256 x) : simd256bit(x) {}
    // broadcast x to all 8 lanes
    explicit simd8float32(float x) : simd256bit(_mm256_set1_ps(x)) {}
    // aligned load: x must be 32-byte aligned (_mm256_load_ps)
    explicit simd8float32(const float* x) : simd256bit(_mm256_load_ps(x)) {}
    simd8float32 operator*(simd8float32 other) const {
        return simd8float32(_mm256_mul_ps(f, other.f));
    }
    simd8float32 operator+(simd8float32 other) const {
        return simd8float32(_mm256_add_ps(f, other.f));
    }
    simd8float32 operator-(simd8float32 other) const {
        return simd8float32(_mm256_sub_ps(f, other.f));
    }
    // comma-separated "%g" rendering of the 8 lanes
    std::string tostring() const {
        float tab[8];
        storeu((void*)tab);
        char res[1000];
        char* ptr = res;
        for (int i = 0; i < 8; i++) {
            ptr += sprintf(ptr, "%g,", tab[i]);
        }
        // strip last ,
        ptr[-1] = 0;
        return std::string(res);
    }
};
// horizontal add of adjacent pairs, within each 128-bit lane
inline simd8float32 hadd(simd8float32 a, simd8float32 b) {
    return simd8float32(_mm256_hadd_ps(a.f, b.f));
}
inline simd8float32 unpacklo(simd8float32 a, simd8float32 b) {
    return simd8float32(_mm256_unpacklo_ps(a.f, b.f));
}
inline simd8float32 unpackhi(simd8float32 a, simd8float32 b) {
    return simd8float32(_mm256_unpackhi_ps(a.f, b.f));
}
// compute a * b + c
inline simd8float32 fmadd(simd8float32 a, simd8float32 b, simd8float32 c) {
    return simd8float32(_mm256_fmadd_ps(a.f, b.f, c.f));
}
// NOTE(review): anonymous namespace in a header gives these internal linkage
// in every including TU (upstream faiss layout kept as-is for vendoring).
namespace {
// get even float32's of a and b, interleaved
inline simd8float32 geteven(simd8float32 a, simd8float32 b) {
    return simd8float32(
            _mm256_shuffle_ps(a.f, b.f, 0 << 0 | 2 << 2 | 0 << 4 | 2 << 6));
}
// get odd float32's of a and b, interleaved
inline simd8float32 getodd(simd8float32 a, simd8float32 b) {
    return simd8float32(
            _mm256_shuffle_ps(a.f, b.f, 1 << 0 | 3 << 2 | 1 << 4 | 3 << 6));
}
// 3 cycles
// if the lanes are a = [a0 a1] and b = [b0 b1], return [a0 b0]
inline simd8float32 getlow128(simd8float32 a, simd8float32 b) {
    return simd8float32(_mm256_permute2f128_ps(a.f, b.f, 0 | 2 << 4));
}
inline simd8float32 gethigh128(simd8float32 a, simd8float32 b) {
    return simd8float32(_mm256_permute2f128_ps(a.f, b.f, 1 | 3 << 4));
}
} // namespace
} // namespace faiss

View File

@ -0,0 +1,650 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <string>
namespace faiss {
/// Scalar stand-in for a 256-bit SIMD register: 32 bytes with several
/// overlapping typed views. All operations are plain memory moves.
struct simd256bit {
    union {
        uint8_t u8[32];
        uint16_t u16[16];
        uint32_t u32[8];
        float f32[8];
    };
    simd256bit() {}
    /// load 32 bytes from x (no alignment requirement in the emulation)
    explicit simd256bit(const void* x) {
        loadu(x);
    }
    /// zero the full 256 bits
    void clear() {
        memset(u8, 0, sizeof(u8));
    }
    void storeu(void* ptr) const {
        memcpy(ptr, u8, sizeof(u8));
    }
    void loadu(const void* ptr) {
        memcpy(u8, ptr, sizeof(u8));
    }
    /// "aligned" store: identical to storeu here, kept for interface parity
    /// with the AVX2 implementation
    void store(void* ptr) const {
        storeu(ptr);
    }
    /// render the 256 bits as '0'/'1' characters, LSB of byte 0 first;
    /// bits[256] is NUL-terminated
    void bin(char bits[257]) const {
        for (int b = 0; b < 256; b++) {
            int byte = u8[b / 8];
            bits[b] = (byte & (1 << (b % 8))) ? '1' : '0';
        }
        bits[256] = 0;
    }
    std::string bin() const {
        char buf[257];
        bin(buf);
        return std::string(buf);
    }
};
/// vector of 16 elements in uint16
/// (scalar emulation of the AVX2 simd16uint16; same observable behavior)
struct simd16uint16 : simd256bit {
    simd16uint16() {}
    // broadcast x to all 16 lanes
    explicit simd16uint16(int x) {
        set1(x);
    }
    explicit simd16uint16(uint16_t x) {
        set1(x);
    }
    explicit simd16uint16(const simd256bit& x) : simd256bit(x) {}
    explicit simd16uint16(const uint16_t* x) : simd256bit((const void*)x) {}
    // format each lane with fmt (which must end with ','); used by hex()/dec()
    std::string elements_to_string(const char* fmt) const {
        char res[1000], *ptr = res;
        for (int i = 0; i < 16; i++) {
            ptr += sprintf(ptr, fmt, u16[i]);
        }
        // strip last ,
        ptr[-1] = 0;
        return std::string(res);
    }
    std::string hex() const {
        return elements_to_string("%02x,");
    }
    std::string dec() const {
        return elements_to_string("%3d,");
    }
    // apply f to every lane of a
    template <typename F>
    static simd16uint16 unary_func(const simd16uint16& a, F&& f) {
        simd16uint16 c;
        for (int j = 0; j < 16; j++) {
            c.u16[j] = f(a.u16[j]);
        }
        return c;
    }
    // apply f to corresponding lanes of a and b
    template <typename F>
    static simd16uint16 binary_func(
            const simd16uint16& a,
            const simd16uint16& b,
            F&& f) {
        simd16uint16 c;
        for (int j = 0; j < 16; j++) {
            c.u16[j] = f(a.u16[j], b.u16[j]);
        }
        return c;
    }
    void set1(uint16_t x) {
        for (int i = 0; i < 16; i++) {
            u16[i] = x;
        }
    }
    // shift must be known at compile time
    simd16uint16 operator>>(const int shift) const {
        return unary_func(*this, [shift](uint16_t a) { return a >> shift; });
    }
    // shift must be known at compile time
    simd16uint16 operator<<(const int shift) const {
        return unary_func(*this, [shift](uint16_t a) { return a << shift; });
    }
    simd16uint16 operator+=(const simd16uint16& other) {
        *this = *this + other;
        return *this;
    }
    simd16uint16 operator-=(const simd16uint16& other) {
        *this = *this - other;
        return *this;
    }
    simd16uint16 operator+(const simd16uint16& other) const {
        return binary_func(
                *this, other, [](uint16_t a, uint16_t b) { return a + b; });
    }
    simd16uint16 operator-(const simd16uint16& other) const {
        return binary_func(
                *this, other, [](uint16_t a, uint16_t b) { return a - b; });
    }
    simd16uint16 operator&(const simd256bit& other) const {
        return binary_func(
                *this, simd16uint16(other), [](uint16_t a, uint16_t b) {
                    return a & b;
                });
    }
    simd16uint16 operator|(const simd256bit& other) const {
        return binary_func(
                *this, simd16uint16(other), [](uint16_t a, uint16_t b) {
                    return a | b;
                });
    }
    // returns binary masks (all-ones lane where equal, zero lane otherwise)
    simd16uint16 operator==(const simd16uint16& other) const {
        return binary_func(*this, other, [](uint16_t a, uint16_t b) {
            return a == b ? 0xffff : 0;
        });
    }
    simd16uint16 operator~() const {
        return unary_func(*this, [](uint16_t a) { return ~a; });
    }
    // get scalar at index 0
    uint16_t get_scalar_0() const {
        return u16[0];
    }
    // mask of elements where this >= thresh
    // 2 bit per component: 16 * 2 = 32 bit
    // (two identical bits per lane, mirroring _mm256_movemask_epi8 output)
    uint32_t ge_mask(const simd16uint16& thresh) const {
        uint32_t gem = 0;
        for (int j = 0; j < 16; j++) {
            if (u16[j] >= thresh.u16[j]) {
                gem |= 3 << (j * 2);
            }
        }
        return gem;
    }
    uint32_t le_mask(const simd16uint16& thresh) const {
        return thresh.ge_mask(*this);
    }
    uint32_t gt_mask(const simd16uint16& thresh) const {
        return ~le_mask(thresh);
    }
    bool all_gt(const simd16uint16& thresh) const {
        return le_mask(thresh) == 0;
    }
    // for debugging only
    uint16_t operator[](int i) const {
        return u16[i];
    }
    // elementwise min accumulation: this = min(this, incoming)
    void accu_min(const simd16uint16& incoming) {
        for (int j = 0; j < 16; j++) {
            if (incoming.u16[j] < u16[j]) {
                u16[j] = incoming.u16[j];
            }
        }
    }
    // elementwise max accumulation: this = max(this, incoming)
    void accu_max(const simd16uint16& incoming) {
        for (int j = 0; j < 16; j++) {
            if (incoming.u16[j] > u16[j]) {
                u16[j] = incoming.u16[j];
            }
        }
    }
};
// not really a std::min because it returns an elementwise min
inline simd16uint16 min(const simd16uint16& av, const simd16uint16& bv) {
    return simd16uint16::binary_func(
            av, bv, [](uint16_t a, uint16_t b) { return std::min(a, b); });
}
inline simd16uint16 max(const simd16uint16& av, const simd16uint16& bv) {
    return simd16uint16::binary_func(
            av, bv, [](uint16_t a, uint16_t b) { return std::max(a, b); });
}
// decompose in 128-lanes: a = (a0, a1), b = (b0, b1)
// return (a0 + a1, b0 + b1)
// TODO find a better name
inline simd16uint16 combine2x2(const simd16uint16& a, const simd16uint16& b) {
    simd16uint16 c;
    // lanes 0..7 <- a's two halves summed; lanes 8..15 <- b's two halves summed
    for (int j = 0; j < 8; j++) {
        c.u16[j] = a.u16[j] + a.u16[j + 8];
        c.u16[j + 8] = b.u16[j] + b.u16[j + 8];
    }
    return c;
}
// compare d0 and d1 to thr, return 32 bits corresponding to the concatenation
// of d0 and d1 with thr
// bit j set iff d0[j] >= thr[j]; bit 16+j set iff d1[j] >= thr[j]
inline uint32_t cmp_ge32(
        const simd16uint16& d0,
        const simd16uint16& d1,
        const simd16uint16& thr) {
    uint32_t gem = 0;
    for (int j = 0; j < 16; j++) {
        if (d0.u16[j] >= thr.u16[j]) {
            gem |= 1 << j;
        }
        if (d1.u16[j] >= thr.u16[j]) {
            gem |= 1 << (j + 16);
        }
    }
    return gem;
}
// same layout as cmp_ge32, with <= instead of >=
inline uint32_t cmp_le32(
        const simd16uint16& d0,
        const simd16uint16& d1,
        const simd16uint16& thr) {
    uint32_t gem = 0;
    for (int j = 0; j < 16; j++) {
        if (d0.u16[j] <= thr.u16[j]) {
            gem |= 1 << j;
        }
        if (d1.u16[j] <= thr.u16[j]) {
            gem |= 1 << (j + 16);
        }
    }
    return gem;
}
// vector of 32 unsigned 8-bit integers
// (scalar emulation of the AVX2 simd32uint8)
struct simd32uint8 : simd256bit {
    simd32uint8() {}
    // broadcast x to all 32 lanes
    explicit simd32uint8(int x) {
        set1(x);
    }
    explicit simd32uint8(uint8_t x) {
        set1(x);
    }
    explicit simd32uint8(const simd256bit& x) : simd256bit(x) {}
    explicit simd32uint8(const uint8_t* x) : simd256bit((const void*)x) {}
    // format each lane with fmt (which must end with ','); used by hex()/dec()
    std::string elements_to_string(const char* fmt) const {
        char res[1000], *ptr = res;
        for (int i = 0; i < 32; i++) {
            ptr += sprintf(ptr, fmt, u8[i]);
        }
        // strip last ,
        ptr[-1] = 0;
        return std::string(res);
    }
    std::string hex() const {
        return elements_to_string("%02x,");
    }
    std::string dec() const {
        return elements_to_string("%3d,");
    }
    void set1(uint8_t x) {
        for (int j = 0; j < 32; j++) {
            u8[j] = x;
        }
    }
    // apply f to corresponding lanes of a and b
    template <typename F>
    static simd32uint8 binary_func(
            const simd32uint8& a,
            const simd32uint8& b,
            F&& f) {
        simd32uint8 c;
        for (int j = 0; j < 32; j++) {
            c.u8[j] = f(a.u8[j], b.u8[j]);
        }
        return c;
    }
    simd32uint8 operator&(const simd256bit& other) const {
        return binary_func(*this, simd32uint8(other), [](uint8_t a, uint8_t b) {
            return a & b;
        });
    }
    simd32uint8 operator+(const simd32uint8& other) const {
        return binary_func(
                *this, other, [](uint8_t a, uint8_t b) { return a + b; });
    }
    // The very important operation that everything relies on
    // (pshufb semantics: per-16-byte half, idx MSB set selects 0, otherwise
    // the low 4 bits of idx index into that half of *this)
    simd32uint8 lookup_2_lanes(const simd32uint8& idx) const {
        simd32uint8 c;
        for (int j = 0; j < 32; j++) {
            if (idx.u8[j] & 0x80) {
                c.u8[j] = 0;
            } else {
                uint8_t i = idx.u8[j] & 15;
                if (j < 16) {
                    c.u8[j] = u8[i];
                } else {
                    c.u8[j] = u8[16 + i];
                }
            }
        }
        return c;
    }
    // NOTE(review): the AVX2 version has lane0/lane1_as_uint16 at this spot;
    // the original "extract + 0-extend lane" comment appears left over from
    // there, as they are not implemented in this emulation.
    simd32uint8 operator+=(const simd32uint8& other) {
        *this = *this + other;
        return *this;
    }
    // for debugging only
    uint8_t operator[](int i) const {
        return u8[i];
    }
};
// convert with saturation
// careful: this does not cross lanes, so the order is weird
// NOTE(review): this saturates unsigned to 0xff, while the AVX2 version uses
// _mm256_packs_epi16 (signed saturation) -- confirm they are equivalent for
// the value ranges actually used by the callers.
inline simd32uint8 uint16_to_uint8_saturate(
        const simd16uint16& a,
        const simd16uint16& b) {
    simd32uint8 c;
    auto saturate_16_to_8 = [](uint16_t x) { return x >= 256 ? 0xff : x; };
    // output interleaves 8-element groups of a and b, mimicking the
    // within-128-bit-lane packing of the SIMD instruction
    for (int i = 0; i < 8; i++) {
        c.u8[i] = saturate_16_to_8(a.u16[i]);
        c.u8[8 + i] = saturate_16_to_8(b.u16[i]);
        c.u8[16 + i] = saturate_16_to_8(a.u16[8 + i]);
        c.u8[24 + i] = saturate_16_to_8(b.u16[8 + i]);
    }
    return c;
}
/// get most significant bit of each byte
inline uint32_t get_MSBs(const simd32uint8& a) {
    uint32_t res = 0;
    for (int i = 0; i < 32; i++) {
        if (a.u8[i] & 0x80) {
            res |= 1 << i;
        }
    }
    return res;
}
/// use MSB of each byte of mask to select a byte between a and b
/// (MSB set -> take b's byte, otherwise a's byte)
inline simd32uint8 blendv(
        const simd32uint8& a,
        const simd32uint8& b,
        const simd32uint8& mask) {
    simd32uint8 c;
    for (int i = 0; i < 32; i++) {
        if (mask.u8[i] & 0x80) {
            c.u8[i] = b.u8[i];
        } else {
            c.u8[i] = a.u8[i];
        }
    }
    return c;
}
/// vector of 8 unsigned 32-bit integers
/// (scalar emulation of the AVX2 simd8uint32)
struct simd8uint32 : simd256bit {
    simd8uint32() {}
    // broadcast x to all 8 lanes
    explicit simd8uint32(uint32_t x) {
        set1(x);
    }
    explicit simd8uint32(const simd256bit& x) : simd256bit(x) {}
    explicit simd8uint32(const uint8_t* x) : simd256bit((const void*)x) {}
    // format each lane with fmt (which must end with ','); used by hex()/dec()
    std::string elements_to_string(const char* fmt) const {
        char res[1000], *ptr = res;
        for (int i = 0; i < 8; i++) {
            ptr += sprintf(ptr, fmt, u32[i]);
        }
        // strip last ,
        ptr[-1] = 0;
        return std::string(res);
    }
    std::string hex() const {
        return elements_to_string("%08x,");
    }
    std::string dec() const {
        return elements_to_string("%10d,");
    }
    void set1(uint32_t x) {
        for (int i = 0; i < 8; i++) {
            u32[i] = x;
        }
    }
};
/// vector of 8 single-precision floats
/// (scalar emulation of the AVX2 simd8float32)
struct simd8float32 : simd256bit {
    simd8float32() {}
    explicit simd8float32(const simd256bit& x) : simd256bit(x) {}
    // broadcast x to all 8 lanes
    explicit simd8float32(float x) {
        set1(x);
    }
    explicit simd8float32(const float* x) {
        loadu((void*)x);
    }
    void set1(float x) {
        for (int i = 0; i < 8; i++) {
            f32[i] = x;
        }
    }
    // apply f to corresponding lanes of a and b
    template <typename F>
    static simd8float32 binary_func(
            const simd8float32& a,
            const simd8float32& b,
            F&& f) {
        simd8float32 c;
        for (int j = 0; j < 8; j++) {
            c.f32[j] = f(a.f32[j], b.f32[j]);
        }
        return c;
    }
    simd8float32 operator*(const simd8float32& other) const {
        return binary_func(
                *this, other, [](float a, float b) { return a * b; });
    }
    simd8float32 operator+(const simd8float32& other) const {
        return binary_func(
                *this, other, [](float a, float b) { return a + b; });
    }
    simd8float32 operator-(const simd8float32& other) const {
        return binary_func(
                *this, other, [](float a, float b) { return a - b; });
    }
    // comma-separated "%g" rendering of the 8 lanes
    std::string tostring() const {
        char res[1000], *ptr = res;
        for (int i = 0; i < 8; i++) {
            ptr += sprintf(ptr, "%g,", f32[i]);
        }
        // strip last ,
        ptr[-1] = 0;
        return std::string(res);
    }
};
// hadd does not cross lanes
// (same pairwise-sum lane ordering as _mm256_hadd_ps: a-pairs then b-pairs,
// separately in each 128-bit half)
inline simd8float32 hadd(const simd8float32& a, const simd8float32& b) {
    simd8float32 c;
    c.f32[0] = a.f32[0] + a.f32[1];
    c.f32[1] = a.f32[2] + a.f32[3];
    c.f32[2] = b.f32[0] + b.f32[1];
    c.f32[3] = b.f32[2] + b.f32[3];
    c.f32[4] = a.f32[4] + a.f32[5];
    c.f32[5] = a.f32[6] + a.f32[7];
    c.f32[6] = b.f32[4] + b.f32[5];
    c.f32[7] = b.f32[6] + b.f32[7];
    return c;
}
// interleave the low halves of each 128-bit lane (mirrors _mm256_unpacklo_ps)
inline simd8float32 unpacklo(const simd8float32& a, const simd8float32& b) {
    simd8float32 c;
    c.f32[0] = a.f32[0];
    c.f32[1] = b.f32[0];
    c.f32[2] = a.f32[1];
    c.f32[3] = b.f32[1];
    c.f32[4] = a.f32[4];
    c.f32[5] = b.f32[4];
    c.f32[6] = a.f32[5];
    c.f32[7] = b.f32[5];
    return c;
}
// interleave the high halves of each 128-bit lane (mirrors _mm256_unpackhi_ps)
inline simd8float32 unpackhi(const simd8float32& a, const simd8float32& b) {
    simd8float32 c;
    c.f32[0] = a.f32[2];
    c.f32[1] = b.f32[2];
    c.f32[2] = a.f32[3];
    c.f32[3] = b.f32[3];
    c.f32[4] = a.f32[6];
    c.f32[5] = b.f32[6];
    c.f32[6] = a.f32[7];
    c.f32[7] = b.f32[7];
    return c;
}
// compute a * b + c
inline simd8float32 fmadd(
        const simd8float32& a,
        const simd8float32& b,
        const simd8float32& c) {
    simd8float32 res;
    for (int i = 0; i < 8; i++) {
        res.f32[i] = a.f32[i] * b.f32[i] + c.f32[i];
    }
    return res;
}
// NOTE(review): anonymous namespace in a header gives these internal linkage
// in every including TU (upstream faiss layout kept as-is for vendoring).
namespace {
// get even float32's of a and b, interleaved
simd8float32 geteven(const simd8float32& a, const simd8float32& b) {
    simd8float32 c;
    c.f32[0] = a.f32[0];
    c.f32[1] = a.f32[2];
    c.f32[2] = b.f32[0];
    c.f32[3] = b.f32[2];
    c.f32[4] = a.f32[4];
    c.f32[5] = a.f32[6];
    c.f32[6] = b.f32[4];
    c.f32[7] = b.f32[6];
    return c;
}
// get odd float32's of a and b, interleaved
simd8float32 getodd(const simd8float32& a, const simd8float32& b) {
    simd8float32 c;
    c.f32[0] = a.f32[1];
    c.f32[1] = a.f32[3];
    c.f32[2] = b.f32[1];
    c.f32[3] = b.f32[3];
    c.f32[4] = a.f32[5];
    c.f32[5] = a.f32[7];
    c.f32[6] = b.f32[5];
    c.f32[7] = b.f32[7];
    return c;
}
// 3 cycles
// if the lanes are a = [a0 a1] and b = [b0 b1], return [a0 b0]
simd8float32 getlow128(const simd8float32& a, const simd8float32& b) {
    simd8float32 c;
    c.f32[0] = a.f32[0];
    c.f32[1] = a.f32[1];
    c.f32[2] = a.f32[2];
    c.f32[3] = a.f32[3];
    c.f32[4] = b.f32[0];
    c.f32[5] = b.f32[1];
    c.f32[6] = b.f32[2];
    c.f32[7] = b.f32[3];
    return c;
}
// if the lanes are a = [a0 a1] and b = [b0 b1], return [a1 b1]
simd8float32 gethigh128(const simd8float32& a, const simd8float32& b) {
    simd8float32 c;
    c.f32[0] = a.f32[4];
    c.f32[1] = a.f32[5];
    c.f32[2] = a.f32[6];
    c.f32[3] = a.f32[7];
    c.f32[4] = b.f32[4];
    c.f32[5] = b.f32[5];
    c.f32[6] = b.f32[6];
    c.f32[7] = b.f32[7];
    return c;
}
} // namespace
} // namespace faiss

Some files were not shown because too many files have changed in this diff Show More