commit 0ea3f048dc
2024-10-09 16:13:22 +00:00
437 changed files with 44406 additions and 0 deletions

View File

@@ -0,0 +1,4 @@
# ninja log v5
16 775 1714644648144395900 /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/build/temp.linux-x86_64-cpython-39/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/cuda/knn.o 21a2df11b6193e6c
12 11800 1714644659159670600 /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/build/temp.linux-x86_64-cpython-39/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/cpu/knn_cpu.o da04abe8d79e7b32
20 12187 1714644659348420100 /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/build/temp.linux-x86_64-cpython-39/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/vision.o 60d711705a1d5d08

View File

@@ -0,0 +1,35 @@
ninja_required_version = 1.3
cxx = c++
nvcc = /usr/local/cuda-11.8/bin/nvcc
cflags = -pthread -B /home/hofee/miniconda3/envs/gsnet/compiler_compat -Wno-unused-result -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /home/hofee/miniconda3/envs/gsnet/include -I/home/hofee/miniconda3/envs/gsnet/include -fPIC -O2 -isystem /home/hofee/miniconda3/envs/gsnet/include -fPIC -DWITH_CUDA -I/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src -I/home/hofee/miniconda3/envs/gsnet/lib/python3.9/site-packages/torch/include -I/home/hofee/miniconda3/envs/gsnet/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -I/home/hofee/miniconda3/envs/gsnet/lib/python3.9/site-packages/torch/include/TH -I/home/hofee/miniconda3/envs/gsnet/lib/python3.9/site-packages/torch/include/THC -I/usr/local/cuda-11.8/include -I/home/hofee/miniconda3/envs/gsnet/include/python3.9 -c
post_cflags = -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1011"' -DTORCH_EXTENSION_NAME=knn_pytorch -D_GLIBCXX_USE_CXX11_ABI=0 -std=c++17
cuda_cflags = -DWITH_CUDA -I/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src -I/home/hofee/miniconda3/envs/gsnet/lib/python3.9/site-packages/torch/include -I/home/hofee/miniconda3/envs/gsnet/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -I/home/hofee/miniconda3/envs/gsnet/lib/python3.9/site-packages/torch/include/TH -I/home/hofee/miniconda3/envs/gsnet/lib/python3.9/site-packages/torch/include/THC -I/usr/local/cuda-11.8/include -I/home/hofee/miniconda3/envs/gsnet/include/python3.9 -c
cuda_post_cflags = -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DCUDA_HAS_FP16=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1011"' -DTORCH_EXTENSION_NAME=knn_pytorch -D_GLIBCXX_USE_CXX11_ABI=0 -gencode=arch=compute_89,code=compute_89 -gencode=arch=compute_89,code=sm_89 -std=c++17
cuda_dlink_post_cflags =
ldflags =
rule compile
command = $cxx -MMD -MF $out.d $cflags -c $in -o $out $post_cflags
depfile = $out.d
deps = gcc
rule cuda_compile
depfile = $out.d
deps = gcc
command = $nvcc $cuda_cflags -c $in -o $out $cuda_post_cflags
build /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/build/temp.linux-x86_64-cpython-39/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/cpu/knn_cpu.o: compile /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/cpu/knn_cpu.cpp
build /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/build/temp.linux-x86_64-cpython-39/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/cuda/knn.o: cuda_compile /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/cuda/knn.cu
build /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/build/temp.linux-x86_64-cpython-39/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/vision.o: compile /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/vision.cpp

Binary file not shown.

View File

@@ -0,0 +1,17 @@
import unittest
import gc
import operator as op
import functools
import torch
from torch.autograd import Variable, Function
from knn_pytorch import knn_pytorch
# import knn_pytorch
def knn(ref, query, k=1):
    """Compute the k nearest reference points for each query point.

    ref and query are (batch, dim, n) tensors on the same device; the result
    is a (batch, k, n_query) LongTensor of 1-based reference indices filled
    in by the extension.
    """
    device = ref.device
    # the extension expects float32 inputs on a common device
    ref = ref.float().to(device)
    query = query.float().to(device)
    # preallocate the index tensor that the extension fills in place
    inds = torch.empty(query.shape[0], k, query.shape[2], dtype=torch.long, device=device)
    knn_pytorch.knn(ref, query, inds)
    return inds
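A minimal usage sketch, assuming the wrapper above is importable as `knn` and that `ref` and `query` follow the (batch, dim, n_points) layout the extension expects; the returned indices are 1-based, as written by both the CPU and CUDA kernels.

import torch

ref = torch.rand(2, 3, 1000)        # (batch, dim, n_ref)
query = torch.rand(2, 3, 50)        # (batch, dim, n_query)
if torch.cuda.is_available():
    ref, query = ref.cuda(), query.cuda()
idx = knn(ref, query, k=5)          # (batch, k, n_query) LongTensor, 1-based
idx_zero_based = idx - 1            # convert to 0-based for use with torch indexing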

View File

@@ -0,0 +1,6 @@
Metadata-Version: 2.1
Name: knn-pytorch
Version: 0.1
Summary: KNN implementation in PyTorch 1.0, including both CPU and GPU versions
Home-page: https://github.com/foolyc/torchKNN
Author: foolyc

View File

@@ -0,0 +1,66 @@
#!/usr/bin/env python
import glob
import os
import torch
from setuptools import find_packages
from setuptools import setup
from torch.utils.cpp_extension import CUDA_HOME
from torch.utils.cpp_extension import CppExtension
from torch.utils.cpp_extension import CUDAExtension
requirements = ["torch", "torchvision"]
def get_extensions():
this_dir = os.path.dirname(os.path.abspath(__file__))
extensions_dir = os.path.join(this_dir, "src")
main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp"))
source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu"))
sources = main_file + source_cpu
extension = CppExtension
extra_compile_args = {"cxx": []}
define_macros = []
if torch.cuda.is_available() and CUDA_HOME is not None:
extension = CUDAExtension
sources += source_cuda
define_macros += [("WITH_CUDA", None)]
extra_compile_args["nvcc"] = [
"-DCUDA_HAS_FP16=1",
"-D__CUDA_NO_HALF_OPERATORS__",
"-D__CUDA_NO_HALF_CONVERSIONS__",
"-D__CUDA_NO_HALF2_OPERATORS__",
]
sources = [os.path.join(extensions_dir, s) for s in sources]
include_dirs = [extensions_dir]
ext_modules = [
extension(
"knn_pytorch.knn_pytorch",
sources,
include_dirs=include_dirs,
define_macros=define_macros,
extra_compile_args=extra_compile_args,
)
]
return ext_modules
setup(
name="knn_pytorch",
version="0.1",
author="foolyc",
url="https://github.com/foolyc/torchKNN",
description="KNN implement in Pytorch 1.0 including both cpu version and gpu version",
ext_modules=get_extensions(),
cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
)
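A hedged build sketch: this is a standard `torch.utils.cpp_extension` setup script, so an in-place build from the directory containing `setup.py` should produce the `knn_pytorch.knn_pytorch` module; the driver below is an assumed workflow, not taken from the repository.

import subprocess
import sys

# assumed: run from the knn/ directory containing setup.py
subprocess.check_call([sys.executable, "setup.py", "build_ext", "--inplace"])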

View File

@@ -0,0 +1,56 @@
#include "cpu/vision.h"
// Layout: ref_dev is (height x ref_width) and query_dev is (height x query_width), both row-major;
// dist_dev is a (query_width x ref_width) scratch buffer; ind_dev receives (k x query_width) 1-based indices.
void knn_cpu(float* ref_dev, int ref_width, float* query_dev, int query_width,
             int height, int k, float* dist_dev, long* ind_dev, long* ind_buf)
{
// Compute all the squared distances
for(int query_idx = 0;query_idx<query_width;query_idx++)
{
for(int ref_idx = 0;ref_idx < ref_width;ref_idx++)
{
dist_dev[query_idx * ref_width + ref_idx] = 0;
for(int hi=0;hi<height;hi++)
dist_dev[query_idx * ref_width + ref_idx] += (ref_dev[hi * ref_width + ref_idx] - query_dev[hi * query_width + query_idx]) * (ref_dev[hi * ref_width + ref_idx] - query_dev[hi * query_width + query_idx]);
}
}
float temp_value;
long temp_idx;
// Bubble-sort each query's distances and carry the (1-based) indices along with them
for(int query_idx = 0;query_idx<query_width;query_idx++)
{
for(int i = 0;i < ref_width;i++)
{
ind_buf[i] = i + 1;   // neighbor indices are 1-based
}
for(int i = 0;i < ref_width;i++)
for(int j = 0;j < ref_width - i - 1;j++)
{
if(dist_dev[query_idx * ref_width + j] > dist_dev[query_idx * ref_width + j + 1])
{
temp_value = dist_dev[query_idx * ref_width + j];
dist_dev[query_idx * ref_width + j] = dist_dev[query_idx * ref_width + j + 1];
dist_dev[query_idx * ref_width + j + 1] = temp_value;
temp_idx = ind_buf[j];
ind_buf[j] = ind_buf[j + 1];
ind_buf[j + 1] = temp_idx;
}
}
for(int i = 0;i < k;i++)
ind_dev[query_idx + i * query_width] = ind_buf[i];
#if DEBUG
for(int i = 0;i < ref_width;i++)
printf("%d, ", ind_buf[i]);
printf("\n");
#endif
}
}
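For cross-checking the layout above, a short PyTorch sketch of what knn_cpu computes, assuming ref is a row-major (dim, ref_width) matrix, query is (dim, query_width), and the output index matrix is (k, query_width) with 1-based entries; ties may be ordered differently than in the bubble sort.

import torch

def knn_cpu_reference(ref, query, k):
    # ref: (dim, n_ref), query: (dim, n_query), both float tensors
    d2 = ((ref[:, None, :] - query[:, :, None]) ** 2).sum(dim=0)  # (n_query, n_ref) squared distances
    order = d2.argsort(dim=1)[:, :k] + 1                          # k smallest, 1-based
    return order.t().contiguous()                                 # (k, n_query), matching ind_dev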

View File

@@ -0,0 +1,6 @@
#pragma once
#include <torch/extension.h>
void knn_cpu(float* ref_dev, int ref_width,
float* query_dev, int query_width,
int height, int k, float* dist_dev, long* ind_dev, long* ind_buf);

View File

@@ -0,0 +1,269 @@
/** Modified version of knn-CUDA from https://github.com/vincentfpgarcia/kNN-CUDA
* The modifications are
* removed texture memory usage
* removed split query KNN computation
* added feature extraction with bilinear interpolation
*
* Last modified by Christopher B. Choy <chrischoy@ai.stanford.edu> 12/23/2016
*/
// Includes
#include <cstdio>
#include "cuda.h"
#define IDX2D(i, j, dj) (dj * i + j)
#define IDX3D(i, j, k, dj, dk) (IDX2D(IDX2D(i, j, dj), k, dk))
#define BLOCK 512
#define MAX_STREAMS 512
// Constants used by the program
#define BLOCK_DIM 16
#define DEBUG 0
/**
* Computes the distance between two matrix A (reference points) and
* B (query points) containing respectively wA and wB points.
*
* @param A pointer on the matrix A
* @param wA width of the matrix A = number of points in A
* @param B pointer on the matrix B
* @param wB width of the matrix B = number of points in B
* @param dim dimension of points = height of matrices A and B
* @param AB pointer on the matrix containing the wA*wB distances computed
*/
__global__ void cuComputeDistanceGlobal( float* A, int wA,
float* B, int wB, int dim, float* AB){
// Declaration of the shared memory arrays As and Bs used to store the sub-matrix of A and B
__shared__ float shared_A[BLOCK_DIM][BLOCK_DIM];
__shared__ float shared_B[BLOCK_DIM][BLOCK_DIM];
// Sub-matrix of A (begin, step, end) and Sub-matrix of B (begin, step)
__shared__ int begin_A;
__shared__ int begin_B;
__shared__ int step_A;
__shared__ int step_B;
__shared__ int end_A;
// Thread index
int tx = threadIdx.x;
int ty = threadIdx.y;
// Other variables
float tmp;
float ssd = 0;
// Loop parameters
begin_A = BLOCK_DIM * blockIdx.y;
begin_B = BLOCK_DIM * blockIdx.x;
step_A = BLOCK_DIM * wA;
step_B = BLOCK_DIM * wB;
end_A = begin_A + (dim-1) * wA;
// Conditions
int cond0 = (begin_A + tx < wA); // used to write in shared memory
int cond1 = (begin_B + tx < wB); // used to write in shared memory, for computations, and to write to the output matrix
int cond2 = (begin_A + ty < wA); // used for computations and to write to the output matrix
// Loop over all the sub-matrices of A and B required to compute the block sub-matrix
for (int a = begin_A, b = begin_B; a <= end_A; a += step_A, b += step_B) {
// Load the matrices from device memory to shared memory; each thread loads one element of each matrix
if (a/wA + ty < dim){
shared_A[ty][tx] = (cond0)? A[a + wA * ty + tx] : 0;
shared_B[ty][tx] = (cond1)? B[b + wB * ty + tx] : 0;
}
else{
shared_A[ty][tx] = 0;
shared_B[ty][tx] = 0;
}
// Synchronize to make sure the matrices are loaded
__syncthreads();
// Compute the difference between the two matrices; each thread computes one element of the block sub-matrix
if (cond2 && cond1){
for (int k = 0; k < BLOCK_DIM; ++k){
tmp = shared_A[k][ty] - shared_B[k][tx];
ssd += tmp*tmp;
}
}
// Synchronize to make sure that the preceding computation is done before loading two new sub-matrices of A and B in the next iteration
__syncthreads();
}
// Write the block sub-matrix to device memory; each thread writes one element
if (cond2 && cond1)
AB[(begin_A + ty) * wB + begin_B + tx] = ssd;
}
/**
 * Gathers the k smallest distances of each column of the distance matrix at the top of that column.
*
* @param dist distance matrix
* @param ind index matrix
* @param width width of the distance matrix and of the index matrix
* @param height height of the distance matrix and of the index matrix
* @param k number of neighbors to consider
*/
__global__ void cuInsertionSort(float *dist, long *ind, int width, int height, int k){
// Variables
int l, i, j;
float *p_dist;
long *p_ind;
float curr_dist, max_dist;
long curr_row, max_row;
unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
if (xIndex<width){
// Pointer shift, initialization, and max value
p_dist = dist + xIndex;
p_ind = ind + xIndex;
max_dist = p_dist[0];
p_ind[0] = 1;
// Part 1 : sort the k first elements
for (l=1; l<k; l++){
curr_row = l * width;
curr_dist = p_dist[curr_row];
if (curr_dist<max_dist){
i=l-1;
for (int a=0; a<l-1; a++){
if (p_dist[a*width]>curr_dist){
i=a;
break;
}
}
for (j=l; j>i; j--){
p_dist[j*width] = p_dist[(j-1)*width];
p_ind[j*width] = p_ind[(j-1)*width];
}
p_dist[i*width] = curr_dist;
p_ind[i*width] = l+1;
} else {
p_ind[l*width] = l+1;
}
max_dist = p_dist[curr_row];
}
// Part 2 : insert the remaining elements among the k first lines
max_row = (k-1)*width;
for (l=k; l<height; l++){
curr_dist = p_dist[l*width];
if (curr_dist<max_dist){
i=k-1;
for (int a=0; a<k-1; a++){
if (p_dist[a*width]>curr_dist){
i=a;
break;
}
}
for (j=k-1; j>i; j--){
p_dist[j*width] = p_dist[(j-1)*width];
p_ind[j*width] = p_ind[(j-1)*width];
}
p_dist[i*width] = curr_dist;
p_ind[i*width] = l+1;
max_dist = p_dist[max_row];
}
}
}
}
/**
 * Computes the square root of the k first lines (width elements each)
 * of the distance matrix.
*
* @param dist distance matrix
* @param width width of the distance matrix
* @param k number of neighbors to consider
*/
__global__ void cuParallelSqrt(float *dist, int width, int k){
unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
if (xIndex<width && yIndex<k)
dist[yIndex*width + xIndex] = sqrt(dist[yIndex*width + xIndex]);
}
//-----------------------------------------------------------------------------------------------//
// K-th NEAREST NEIGHBORS //
//-----------------------------------------------------------------------------------------------//
/**
 * K nearest neighbor search on device memory
 * - Compute the squared distances between all reference and query points
 * - Sort each query's distances and record the indexes of the k nearest neighbors
 *
 * @param ref_dev    reference points ; device pointer to a dim x ref_nb matrix
 * @param ref_nb     number of reference points ; width of the matrix
 * @param query_dev  query points ; device pointer to a dim x query_nb matrix
 * @param query_nb   number of query points ; width of the matrix
 * @param dim        dimension of points ; height of the matrices
 * @param k          number of neighbors to consider
 * @param dist_dev   device buffer that receives the (squared) distances
 * @param ind_dev    device buffer that receives the indexes of the k nearest neighbors
 * @param stream     CUDA stream on which the kernels are launched
 */
void knn_device(float* ref_dev, int ref_nb, float* query_dev, int query_nb,
int dim, int k, float* dist_dev, long* ind_dev, cudaStream_t stream){
// Grids and threads
dim3 g_16x16(query_nb/16, ref_nb/16, 1);
dim3 t_16x16(16, 16, 1);
if (query_nb%16 != 0) g_16x16.x += 1;
if (ref_nb %16 != 0) g_16x16.y += 1;
//
dim3 g_256x1(query_nb/256, 1, 1);
dim3 t_256x1(256, 1, 1);
if (query_nb%256 != 0) g_256x1.x += 1;
dim3 g_k_16x16(query_nb/16, k/16, 1);
dim3 t_k_16x16(16, 16, 1);
if (query_nb%16 != 0) g_k_16x16.x += 1;
if (k %16 != 0) g_k_16x16.y += 1;
// Kernel 1: Compute all the distances
cuComputeDistanceGlobal<<<g_16x16, t_16x16, 0, stream>>>(ref_dev, ref_nb, query_dev, query_nb, dim, dist_dev);
// Kernel 2: Sort each column
cuInsertionSort<<<g_256x1, t_256x1, 0, stream>>>(dist_dev, ind_dev, query_nb, ref_nb, k);
// Kernel 3: Compute square root of k first elements
// cuParallelSqrt<<<g_k_16x16,t_k_16x16, 0, stream>>>(dist_dev, query_nb, k);
#if DEBUG
unsigned int size_of_float = sizeof(float);
unsigned long size_of_long = sizeof(long);
float* dist_host = new float[query_nb * k];
long* idx_host = new long[query_nb * k];
// Memory copy of output from device to host
cudaMemcpy(&dist_host[0], dist_dev,
query_nb * k *size_of_float, cudaMemcpyDeviceToHost);
cudaMemcpy(&idx_host[0], ind_dev,
query_nb * k * size_of_long, cudaMemcpyDeviceToHost);
for(int i = 0; i < 100; i++){
    printf("IDX[%d]: %d\n", i, (int)idx_host[i]);
}
delete[] dist_host;
delete[] idx_host;
#endif
}
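Because the cuParallelSqrt launch is commented out, knn_device leaves squared distances in dist_dev and writes 1-based indices of the k nearest reference points per query column. A hedged single-batch reference using torch.topk, under the same layout assumptions as the CPU sketch:

import torch

def knn_device_reference(ref, query, k):
    # ref: (dim, n_ref), query: (dim, n_query), float32 tensors
    d2 = torch.cdist(query.t(), ref.t()) ** 2               # (n_query, n_ref) squared distances
    idx = d2.topk(k, dim=1, largest=False).indices + 1      # 1-based, (n_query, k)
    return idx.t().contiguous()                             # (k, n_query), matching ind_dev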

View File

@@ -0,0 +1,9 @@
#pragma once
#include <torch/extension.h>
// #include <THC/THC.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAEvent.h>
void knn_device(float* ref_dev, int ref_width,
float* query_dev, int query_width,
int height, int k, float* dist_dev, long* ind_dev, cudaStream_t stream);

View File

@@ -0,0 +1,75 @@
#pragma once
#include "cpu/vision.h"
#ifdef WITH_CUDA
#include "cuda/vision.h"
// #include <THC/THC.h>
// extern THCState *state;
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAEvent.h>
#endif
int knn(at::Tensor& ref, at::Tensor& query, at::Tensor& idx)
{
// TODO check dimensions
long batch, ref_nb, query_nb, dim, k;
batch = ref.size(0);
dim = ref.size(1);
k = idx.size(1);
ref_nb = ref.size(2);
query_nb = query.size(2);
// float *ref_dev = ref.data<float>();
// float *query_dev = query.data<float>();
// long *idx_dev = idx.data<long>();
float *ref_dev = ref.data_ptr<float>();
float *query_dev = query.data_ptr<float>();
long *idx_dev = idx.data_ptr<long>();
// if (ref.type().is_cuda()) {
if (ref.is_cuda()) {
#ifdef WITH_CUDA
// TODO raise error if not compiled with CUDA
// float *dist_dev = (float*)THCudaMalloc(state, ref_nb * query_nb * sizeof(float));
float *dist_dev = (float*)c10::cuda::CUDACachingAllocator::raw_alloc(ref_nb * query_nb * sizeof(float));
for (int b = 0; b < batch; b++)
{
// knn_device(ref_dev + b * dim * ref_nb, ref_nb, query_dev + b * dim * query_nb, query_nb, dim, k,
// dist_dev, idx_dev + b * k * query_nb, THCState_getCurrentStream(state));
knn_device(ref_dev + b * dim * ref_nb, ref_nb, query_dev + b * dim * query_nb, query_nb, dim, k,
dist_dev, idx_dev + b * k * query_nb, c10::cuda::getCurrentCUDAStream());
}
// THCudaFree(state, dist_dev);
c10::cuda::CUDACachingAllocator::raw_delete(dist_dev);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
printf("error in knn: %s\n", cudaGetErrorString(err));
// THError("aborting");
}
return 1;
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
float *dist_dev = (float*)malloc(ref_nb * query_nb * sizeof(float));
long *ind_buf = (long*)malloc(ref_nb * sizeof(long));
for (int b = 0; b < batch; b++) {
knn_cpu(ref_dev + b * dim * ref_nb, ref_nb, query_dev + b * dim * query_nb, query_nb, dim, k,
dist_dev, idx_dev + b * k * query_nb, ind_buf);
}
free(dist_dev);
free(ind_buf);
return 1;
}
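The dispatcher above implies a shape and dtype contract that it does not itself validate (note the "TODO check dimensions"). A hedged Python-side check, assuming the (batch, dim, n) layout used in the pointer arithmetic:

import torch

def check_knn_args(ref, query, idx):
    # assumed preconditions for knn_pytorch.knn(ref, query, idx)
    assert ref.dtype == torch.float32 and query.dtype == torch.float32
    assert idx.dtype == torch.int64
    assert ref.dim() == query.dim() == idx.dim() == 3
    assert ref.size(0) == query.size(0) == idx.size(0)    # batch
    assert ref.size(1) == query.size(1)                    # dim
    assert idx.size(2) == query.size(2)                    # n_query
    assert ref.is_cuda == query.is_cuda == idx.is_cuda     # same device kind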

View File

@@ -0,0 +1,5 @@
#include "knn.h"
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("knn", &knn, "k-nearest neighbors");
}