success
BIN
baselines/grasping/GSNet/knn/build/temp.linux-x86_64-cpython-39/.ninja_deps
Executable file
Binary file not shown.
4
baselines/grasping/GSNet/knn/build/temp.linux-x86_64-cpython-39/.ninja_log
Executable file
@@ -0,0 +1,4 @@
# ninja log v5
16 775 1714644648144395900 /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/build/temp.linux-x86_64-cpython-39/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/cuda/knn.o 21a2df11b6193e6c
12 11800 1714644659159670600 /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/build/temp.linux-x86_64-cpython-39/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/cpu/knn_cpu.o da04abe8d79e7b32
20 12187 1714644659348420100 /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/build/temp.linux-x86_64-cpython-39/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/vision.o 60d711705a1d5d08
35
baselines/grasping/GSNet/knn/build/temp.linux-x86_64-cpython-39/build.ninja
Executable file
@@ -0,0 +1,35 @@
ninja_required_version = 1.3
cxx = c++
nvcc = /usr/local/cuda-11.8/bin/nvcc

cflags = -pthread -B /home/hofee/miniconda3/envs/gsnet/compiler_compat -Wno-unused-result -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /home/hofee/miniconda3/envs/gsnet/include -I/home/hofee/miniconda3/envs/gsnet/include -fPIC -O2 -isystem /home/hofee/miniconda3/envs/gsnet/include -fPIC -DWITH_CUDA -I/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src -I/home/hofee/miniconda3/envs/gsnet/lib/python3.9/site-packages/torch/include -I/home/hofee/miniconda3/envs/gsnet/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -I/home/hofee/miniconda3/envs/gsnet/lib/python3.9/site-packages/torch/include/TH -I/home/hofee/miniconda3/envs/gsnet/lib/python3.9/site-packages/torch/include/THC -I/usr/local/cuda-11.8/include -I/home/hofee/miniconda3/envs/gsnet/include/python3.9 -c
post_cflags = -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1011"' -DTORCH_EXTENSION_NAME=knn_pytorch -D_GLIBCXX_USE_CXX11_ABI=0 -std=c++17
cuda_cflags = -DWITH_CUDA -I/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src -I/home/hofee/miniconda3/envs/gsnet/lib/python3.9/site-packages/torch/include -I/home/hofee/miniconda3/envs/gsnet/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -I/home/hofee/miniconda3/envs/gsnet/lib/python3.9/site-packages/torch/include/TH -I/home/hofee/miniconda3/envs/gsnet/lib/python3.9/site-packages/torch/include/THC -I/usr/local/cuda-11.8/include -I/home/hofee/miniconda3/envs/gsnet/include/python3.9 -c
cuda_post_cflags = -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DCUDA_HAS_FP16=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1011"' -DTORCH_EXTENSION_NAME=knn_pytorch -D_GLIBCXX_USE_CXX11_ABI=0 -gencode=arch=compute_89,code=compute_89 -gencode=arch=compute_89,code=sm_89 -std=c++17
cuda_dlink_post_cflags =
ldflags =

rule compile
  command = $cxx -MMD -MF $out.d $cflags -c $in -o $out $post_cflags
  depfile = $out.d
  deps = gcc

rule cuda_compile
  depfile = $out.d
  deps = gcc
  command = $nvcc $cuda_cflags -c $in -o $out $cuda_post_cflags

build /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/build/temp.linux-x86_64-cpython-39/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/cpu/knn_cpu.o: compile /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/cpu/knn_cpu.cpp
build /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/build/temp.linux-x86_64-cpython-39/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/cuda/knn.o: cuda_compile /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/cuda/knn.cu
build /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/build/temp.linux-x86_64-cpython-39/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/vision.o: compile /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/vision.cpp
BIN
baselines/grasping/GSNet/knn/dist/knn_pytorch-0.1-py3.9-linux-x86_64.egg
vendored
Executable file
Binary file not shown.
17
baselines/grasping/GSNet/knn/knn_modules.py
Executable file
@@ -0,0 +1,17 @@
import unittest
import gc
import operator as op
import functools
import torch
from torch.autograd import Variable, Function
from knn_pytorch import knn_pytorch
# import knn_pytorch
def knn(ref, query, k=1):
    """ Compute k nearest neighbors for each query point.
    """
    device = ref.device
    ref = ref.float().to(device)
    query = query.float().to(device)
    inds = torch.empty(query.shape[0], k, query.shape[2]).long().to(device)
    knn_pytorch.knn(ref, query, inds)
    return inds
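For orientation (not part of the commit): per the size reads in src/knn.h, the wrapper expects ref and query laid out as (batch, dim, num_points) and fills a (batch, k, num_query) long tensor with 1-based reference indices. A minimal usage sketch, assuming the extension is built and knn_modules is importable:

import torch
from knn_modules import knn

ref = torch.rand(2, 3, 1024).cuda()    # (batch, dim, num_ref) reference points
query = torch.rand(2, 3, 64).cuda()    # (batch, dim, num_query) query points
inds = knn(ref, query, k=8)            # (batch, k, num_query), 1-based indices
neighbors = ref[0, :, inds[0, 0] - 1]  # shift to 0-based before indexing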
6
baselines/grasping/GSNet/knn/knn_pytorch.egg-info/PKG-INFO
Executable file
@@ -0,0 +1,6 @@
Metadata-Version: 2.1
Name: knn-pytorch
Version: 0.1
Summary: KNN implementation in PyTorch, with both CPU and GPU versions
Home-page: https://github.com/foolyc/torchKNN
Author: foolyc
66
baselines/grasping/GSNet/knn/setup.py
Executable file
@@ -0,0 +1,66 @@
#!/usr/bin/env python

import glob
import os

import torch
from setuptools import find_packages
from setuptools import setup
from torch.utils.cpp_extension import CUDA_HOME
from torch.utils.cpp_extension import CppExtension
from torch.utils.cpp_extension import CUDAExtension

requirements = ["torch", "torchvision"]


def get_extensions():
    this_dir = os.path.dirname(os.path.abspath(__file__))
    extensions_dir = os.path.join(this_dir, "src")

    main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
    source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp"))
    source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu"))

    sources = main_file + source_cpu
    extension = CppExtension

    extra_compile_args = {"cxx": []}
    define_macros = []

    if torch.cuda.is_available() and CUDA_HOME is not None:
        extension = CUDAExtension
        sources += source_cuda
        define_macros += [("WITH_CUDA", None)]
        extra_compile_args["nvcc"] = [
            "-DCUDA_HAS_FP16=1",
            "-D__CUDA_NO_HALF_OPERATORS__",
            "-D__CUDA_NO_HALF_CONVERSIONS__",
            "-D__CUDA_NO_HALF2_OPERATORS__",
        ]

    sources = [os.path.join(extensions_dir, s) for s in sources]

    include_dirs = [extensions_dir]

    ext_modules = [
        extension(
            "knn_pytorch.knn_pytorch",
            sources,
            include_dirs=include_dirs,
            define_macros=define_macros,
            extra_compile_args=extra_compile_args,
        )
    ]

    return ext_modules


setup(
    name="knn_pytorch",
    version="0.1",
    author="foolyc",
    url="https://github.com/foolyc/torchKNN",
    description="KNN implementation in PyTorch, with both CPU and GPU versions",
    ext_modules=get_extensions(),
    cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
)
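Build note (a sketch, not from the commit): this is a standard torch.utils.cpp_extension setup script, so it builds through the usual setuptools entry points, and CUDA support is enabled automatically when torch.cuda.is_available() and CUDA_HOME is set. One way to invoke it programmatically:

# Equivalent of running `python setup.py install` inside the knn directory.
import subprocess
import sys

subprocess.check_call([sys.executable, "setup.py", "install"],
                      cwd="baselines/grasping/GSNet/knn")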
56
baselines/grasping/GSNet/knn/src/cpu/knn_cpu.cpp
Executable file
@@ -0,0 +1,56 @@
#include "cpu/vision.h"


void knn_cpu(float* ref_dev, int ref_width, float* query_dev, int query_width,
             int height, int k, float* dist_dev, long* ind_dev, long* ind_buf)
{
    // Compute all the squared distances
    for (int query_idx = 0; query_idx < query_width; query_idx++)
    {
        for (int ref_idx = 0; ref_idx < ref_width; ref_idx++)
        {
            dist_dev[query_idx * ref_width + ref_idx] = 0;
            for (int hi = 0; hi < height; hi++)
                dist_dev[query_idx * ref_width + ref_idx] += (ref_dev[hi * ref_width + ref_idx] - query_dev[hi * query_width + query_idx]) * (ref_dev[hi * ref_width + ref_idx] - query_dev[hi * query_width + query_idx]);
        }
    }

    float temp_value;
    long temp_idx;
    // Bubble-sort each query's distances, tracking the (1-based) indices
    for (int query_idx = 0; query_idx < query_width; query_idx++)
    {
        for (int i = 0; i < ref_width; i++)
        {
            ind_buf[i] = i + 1;
        }
        for (int i = 0; i < ref_width; i++)
            for (int j = 0; j < ref_width - i - 1; j++)
            {
                if (dist_dev[query_idx * ref_width + j] > dist_dev[query_idx * ref_width + j + 1])
                {
                    temp_value = dist_dev[query_idx * ref_width + j];
                    dist_dev[query_idx * ref_width + j] = dist_dev[query_idx * ref_width + j + 1];
                    dist_dev[query_idx * ref_width + j + 1] = temp_value;
                    temp_idx = ind_buf[j];
                    ind_buf[j] = ind_buf[j + 1];
                    ind_buf[j + 1] = temp_idx;
                }
            }

        for (int i = 0; i < k; i++)
            ind_dev[query_idx + i * query_width] = ind_buf[i];
#if DEBUG
        for (int i = 0; i < ref_width; i++)
            printf("%ld, ", ind_buf[i]);
        printf("\n");
#endif
    }
}
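To make the CPU path easier to audit (a reference sketch, not part of the commit): it fills a full (query_width, ref_width) squared-distance matrix, bubble-sorts each query's row in O(ref_width^2), and writes 1-based indices. The same contract in NumPy:

import numpy as np

def knn_cpu_reference(ref, query, k):
    # ref: (dim, ref_nb), query: (dim, query_nb), matching knn_cpu's layout
    d2 = ((ref[:, :, None] - query[:, None, :]) ** 2).sum(axis=0).T  # (query_nb, ref_nb)
    order = np.argsort(d2, axis=1)
    return order[:, :k].T + 1  # (k, query_nb); 1-based, like ind_buf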
6
baselines/grasping/GSNet/knn/src/cpu/vision.h
Executable file
@@ -0,0 +1,6 @@
#pragma once
#include <torch/extension.h>

void knn_cpu(float* ref_dev, int ref_width,
             float* query_dev, int query_width,
             int height, int k, float* dist_dev, long* ind_dev, long* ind_buf);
269
baselines/grasping/GSNet/knn/src/cuda/knn.cu
Executable file
@@ -0,0 +1,269 @@
/** Modified version of knn-CUDA from https://github.com/vincentfpgarcia/kNN-CUDA
  * The modifications are
  *   removed texture memory usage
  *   removed split query KNN computation
  *   added feature extraction with bilinear interpolation
  *
  * Last modified by Christopher B. Choy <chrischoy@ai.stanford.edu> 12/23/2016
  */

// Includes
#include <cstdio>
#include "cuda.h"

#define IDX2D(i, j, dj) (dj * i + j)
#define IDX3D(i, j, k, dj, dk) (IDX2D(IDX2D(i, j, dj), k, dk))

#define BLOCK 512
#define MAX_STREAMS 512

// Constants used by the program
#define BLOCK_DIM 16
#define DEBUG 0


/**
  * Computes the distances between two matrices A (reference points) and
  * B (query points) containing respectively wA and wB points.
  *
  * @param A     pointer on the matrix A
  * @param wA    width of the matrix A = number of points in A
  * @param B     pointer on the matrix B
  * @param wB    width of the matrix B = number of points in B
  * @param dim   dimension of points = height of matrices A and B
  * @param AB    pointer on the matrix containing the wA*wB distances computed
  */
__global__ void cuComputeDistanceGlobal( float* A, int wA,
    float* B, int wB, int dim, float* AB){

  // Declaration of the shared memory arrays As and Bs used to store the sub-matrices of A and B
  __shared__ float shared_A[BLOCK_DIM][BLOCK_DIM];
  __shared__ float shared_B[BLOCK_DIM][BLOCK_DIM];

  // Sub-matrix of A (begin, step, end) and sub-matrix of B (begin, step)
  __shared__ int begin_A;
  __shared__ int begin_B;
  __shared__ int step_A;
  __shared__ int step_B;
  __shared__ int end_A;

  // Thread index
  int tx = threadIdx.x;
  int ty = threadIdx.y;

  // Other variables
  float tmp;
  float ssd = 0;

  // Loop parameters
  begin_A = BLOCK_DIM * blockIdx.y;
  begin_B = BLOCK_DIM * blockIdx.x;
  step_A  = BLOCK_DIM * wA;
  step_B  = BLOCK_DIM * wB;
  end_A   = begin_A + (dim-1) * wA;

  // Conditions
  int cond0 = (begin_A + tx < wA); // used to write in shared memory
  int cond1 = (begin_B + tx < wB); // used to write in shared memory, for computations, and to write to the output matrix
  int cond2 = (begin_A + ty < wA); // used for computations and to write to the output matrix

  // Loop over all the sub-matrices of A and B required to compute the block sub-matrix
  for (int a = begin_A, b = begin_B; a <= end_A; a += step_A, b += step_B) {
    // Load the matrices from device memory to shared memory; each thread loads one element of each matrix
    if (a/wA + ty < dim){
      shared_A[ty][tx] = (cond0)? A[a + wA * ty + tx] : 0;
      shared_B[ty][tx] = (cond1)? B[b + wB * ty + tx] : 0;
    }
    else{
      shared_A[ty][tx] = 0;
      shared_B[ty][tx] = 0;
    }

    // Synchronize to make sure the matrices are loaded
    __syncthreads();

    // Compute the difference between the two matrices; each thread computes one element of the block sub-matrix
    if (cond2 && cond1){
      for (int k = 0; k < BLOCK_DIM; ++k){
        tmp = shared_A[k][ty] - shared_B[k][tx];
        ssd += tmp*tmp;
      }
    }

    // Synchronize to make sure that the preceding computation is done before loading two new sub-matrices of A and B in the next iteration
    __syncthreads();
  }

  // Write the block sub-matrix to device memory; each thread writes one element
  if (cond2 && cond1)
    AB[(begin_A + ty) * wB + begin_B + tx] = ssd;
}


/**
  * Gathers the k smallest distances of each column of the distance matrix at the top.
  *
  * @param dist     distance matrix
  * @param ind      index matrix
  * @param width    width of the distance matrix and of the index matrix
  * @param height   height of the distance matrix and of the index matrix
  * @param k        number of neighbors to consider
  */
__global__ void cuInsertionSort(float *dist, long *ind, int width, int height, int k){

  // Variables
  int l, i, j;
  float *p_dist;
  long *p_ind;
  float curr_dist, max_dist;
  long curr_row, max_row;
  unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;

  if (xIndex<width){
    // Pointer shift, initialization, and max value
    p_dist = dist + xIndex;
    p_ind  = ind  + xIndex;
    max_dist = p_dist[0];
    p_ind[0] = 1;

    // Part 1: sort the first k elements
    for (l=1; l<k; l++){
      curr_row = l * width;
      curr_dist = p_dist[curr_row];
      if (curr_dist<max_dist){
        i=l-1;
        for (int a=0; a<l-1; a++){
          if (p_dist[a*width]>curr_dist){
            i=a;
            break;
          }
        }
        for (j=l; j>i; j--){
          p_dist[j*width] = p_dist[(j-1)*width];
          p_ind[j*width]  = p_ind[(j-1)*width];
        }
        p_dist[i*width] = curr_dist;
        p_ind[i*width]  = l+1;
      } else {
        p_ind[l*width] = l+1;
      }
      max_dist = p_dist[curr_row];
    }

    // Part 2: insert the remaining elements into the first k lines
    max_row = (k-1)*width;
    for (l=k; l<height; l++){
      curr_dist = p_dist[l*width];
      if (curr_dist<max_dist){
        i=k-1;
        for (int a=0; a<k-1; a++){
          if (p_dist[a*width]>curr_dist){
            i=a;
            break;
          }
        }
        for (j=k-1; j>i; j--){
          p_dist[j*width] = p_dist[(j-1)*width];
          p_ind[j*width]  = p_ind[(j-1)*width];
        }
        p_dist[i*width] = curr_dist;
        p_ind[i*width]  = l+1;
        max_dist = p_dist[max_row];
      }
    }
  }
}


/**
  * Computes the square root of the first line (the first width elements)
  * of the distance matrix.
  *
  * @param dist    distance matrix
  * @param width   width of the distance matrix
  * @param k       number of neighbors to consider
  */
__global__ void cuParallelSqrt(float *dist, int width, int k){
  unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
  if (xIndex<width && yIndex<k)
    dist[yIndex*width + xIndex] = sqrt(dist[yIndex*width + xIndex]);
}


//-----------------------------------------------------------------------------------------------//
//                                   K-th NEAREST NEIGHBORS                                       //
//-----------------------------------------------------------------------------------------------//

/**
  * K nearest neighbor algorithm
  * - Initialize CUDA
  * - Allocate device memory
  * - Copy point sets (reference and query points) from host to device memory
  * - Compute the distances + indexes to the k nearest neighbors for each query point
  * - Copy distances from device to host memory
  *
  * @param ref_host      reference points; pointer to linear matrix
  * @param ref_nb        number of reference points; width of the matrix
  * @param query_host    query points; pointer to linear matrix
  * @param query_nb      number of query points; width of the matrix
  * @param dim           dimension of points; height of the matrices
  * @param k             number of neighbors to consider
  * @param dist_host     distances to the k nearest neighbors; pointer to linear matrix
  * @param ind_host      indexes of the k nearest neighbors; pointer to linear matrix
  */
void knn_device(float* ref_dev, int ref_nb, float* query_dev, int query_nb,
  int dim, int k, float* dist_dev, long* ind_dev, cudaStream_t stream){

  // Grids and threads
  dim3 g_16x16(query_nb/16, ref_nb/16, 1);
  dim3 t_16x16(16, 16, 1);
  if (query_nb%16 != 0) g_16x16.x += 1;
  if (ref_nb  %16 != 0) g_16x16.y += 1;
  //
  dim3 g_256x1(query_nb/256, 1, 1);
  dim3 t_256x1(256, 1, 1);
  if (query_nb%256 != 0) g_256x1.x += 1;

  dim3 g_k_16x16(query_nb/16, k/16, 1);
  dim3 t_k_16x16(16, 16, 1);
  if (query_nb%16 != 0) g_k_16x16.x += 1;
  if (k       %16 != 0) g_k_16x16.y += 1;

  // Kernel 1: Compute all the distances
  cuComputeDistanceGlobal<<<g_16x16, t_16x16, 0, stream>>>(ref_dev, ref_nb, query_dev, query_nb, dim, dist_dev);

  // Kernel 2: Sort each column
  cuInsertionSort<<<g_256x1, t_256x1, 0, stream>>>(dist_dev, ind_dev, query_nb, ref_nb, k);

  // Kernel 3: Compute the square root of the k first elements (disabled)
  // cuParallelSqrt<<<g_k_16x16, t_k_16x16, 0, stream>>>(dist_dev, query_nb, k);

#if DEBUG
  unsigned int size_of_float = sizeof(float);
  unsigned long size_of_long = sizeof(long);

  float* dist_host = new float[query_nb * k];
  long*  idx_host  = new long[query_nb * k];

  // Memory copy of output from device to host
  cudaMemcpy(&dist_host[0], dist_dev,
    query_nb * k * size_of_float, cudaMemcpyDeviceToHost);

  cudaMemcpy(&idx_host[0], ind_dev,
    query_nb * k * size_of_long, cudaMemcpyDeviceToHost);

  int i = 0;
  for(i = 0; i < 100; i++){
    printf("IDX[%d]: %d\n", i, (int)idx_host[i]);
  }
#endif
}
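Restating the contract of knn_device (a sketch, not part of the commit): kernel 1 writes a (ref_nb, query_nb) squared-distance matrix, kernel 2 insertion-sorts each query column keeping the k smallest with 1-based indices, and since the sqrt kernel is commented out the distances remain squared. A compact PyTorch model of the same computation:

import torch

def knn_device_reference(ref, query, k):
    # ref: (dim, ref_nb), query: (dim, query_nb); returns (k, query_nb)
    d2 = torch.cdist(ref.t()[None], query.t()[None]).squeeze(0) ** 2
    _, idx = torch.topk(d2, k, dim=0, largest=False)  # k smallest per column
    return idx + 1  # 1-based, matching cuInsertionSort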
9
baselines/grasping/GSNet/knn/src/cuda/vision.h
Executable file
@@ -0,0 +1,9 @@
#pragma once
#include <torch/extension.h>
// #include <THC/THC.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAEvent.h>

void knn_device(float* ref_dev, int ref_width,
                float* query_dev, int query_width,
                int height, int k, float* dist_dev, long* ind_dev, cudaStream_t stream);
75
baselines/grasping/GSNet/knn/src/knn.h
Executable file
@@ -0,0 +1,75 @@
#pragma once
#include "cpu/vision.h"

#ifdef WITH_CUDA
#include "cuda/vision.h"
// #include <THC/THC.h>
// extern THCState *state;
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAEvent.h>
#endif


int knn(at::Tensor& ref, at::Tensor& query, at::Tensor& idx)
{
  // TODO check dimensions
  long batch, ref_nb, query_nb, dim, k;
  batch = ref.size(0);
  dim = ref.size(1);
  k = idx.size(1);
  ref_nb = ref.size(2);
  query_nb = query.size(2);

  // float *ref_dev = ref.data<float>();
  // float *query_dev = query.data<float>();
  // long *idx_dev = idx.data<long>();
  float *ref_dev = ref.data_ptr<float>();
  float *query_dev = query.data_ptr<float>();
  long *idx_dev = idx.data_ptr<long>();

  // if (ref.type().is_cuda()) {
  if (ref.is_cuda()) {
#ifdef WITH_CUDA
    // TODO raise error if not compiled with CUDA
    // float *dist_dev = (float*)THCudaMalloc(state, ref_nb * query_nb * sizeof(float));
    float *dist_dev = (float*)c10::cuda::CUDACachingAllocator::raw_alloc(ref_nb * query_nb * sizeof(float));

    for (int b = 0; b < batch; b++)
    {
      // knn_device(ref_dev + b * dim * ref_nb, ref_nb, query_dev + b * dim * query_nb, query_nb, dim, k,
      //   dist_dev, idx_dev + b * k * query_nb, THCState_getCurrentStream(state));
      knn_device(ref_dev + b * dim * ref_nb, ref_nb, query_dev + b * dim * query_nb, query_nb, dim, k,
        dist_dev, idx_dev + b * k * query_nb, c10::cuda::getCurrentCUDAStream());
    }
    // THCudaFree(state, dist_dev);
    c10::cuda::CUDACachingAllocator::raw_delete(dist_dev);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
      printf("error in knn: %s\n", cudaGetErrorString(err));
      // THError("aborting");
    }
    return 1;
#else
    AT_ERROR("Not compiled with GPU support");
#endif
  }

  float *dist_dev = (float*)malloc(ref_nb * query_nb * sizeof(float));
  long *ind_buf = (long*)malloc(ref_nb * sizeof(long));
  for (int b = 0; b < batch; b++) {
    knn_cpu(ref_dev + b * dim * ref_nb, ref_nb, query_dev + b * dim * query_nb, query_nb, dim, k,
      dist_dev, idx_dev + b * k * query_nb, ind_buf);
  }

  free(dist_dev);
  free(ind_buf);

  return 1;
}
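Since knn() infers every dimension from tensor sizes and never validates them (the TODO above), callers must get the layout exactly right. A shape checklist as a sketch (CPU path shown; the names are illustrative):

import torch
from knn_pytorch import knn_pytorch

batch, dim, ref_nb, query_nb, k = 2, 3, 128, 32, 4
ref = torch.rand(batch, dim, ref_nb)                     # ref.size(2) -> ref_nb
query = torch.rand(batch, dim, query_nb)                 # query.size(2) -> query_nb
idx = torch.empty(batch, k, query_nb, dtype=torch.long)  # idx.size(1) -> k
knn_pytorch.knn(ref, query, idx)  # returns 1; idx now holds 1-based indices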
5
baselines/grasping/GSNet/knn/src/vision.cpp
Executable file
@@ -0,0 +1,5 @@
#include "knn.h"

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("knn", &knn, "k-nearest neighbors");
}