commit 0ea3f048dc
2024-10-09 16:13:22 +00:00
437 changed files with 44406 additions and 0 deletions

View File

@@ -0,0 +1,4 @@
# ninja log v5
16 775 1714644648144395900 /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/build/temp.linux-x86_64-cpython-39/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/cuda/knn.o 21a2df11b6193e6c
12 11800 1714644659159670600 /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/build/temp.linux-x86_64-cpython-39/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/cpu/knn_cpu.o da04abe8d79e7b32
20 12187 1714644659348420100 /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/build/temp.linux-x86_64-cpython-39/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/vision.o 60d711705a1d5d08

View File

@@ -0,0 +1,35 @@
ninja_required_version = 1.3
cxx = c++
nvcc = /usr/local/cuda-11.8/bin/nvcc
cflags = -pthread -B /home/hofee/miniconda3/envs/gsnet/compiler_compat -Wno-unused-result -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /home/hofee/miniconda3/envs/gsnet/include -I/home/hofee/miniconda3/envs/gsnet/include -fPIC -O2 -isystem /home/hofee/miniconda3/envs/gsnet/include -fPIC -DWITH_CUDA -I/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src -I/home/hofee/miniconda3/envs/gsnet/lib/python3.9/site-packages/torch/include -I/home/hofee/miniconda3/envs/gsnet/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -I/home/hofee/miniconda3/envs/gsnet/lib/python3.9/site-packages/torch/include/TH -I/home/hofee/miniconda3/envs/gsnet/lib/python3.9/site-packages/torch/include/THC -I/usr/local/cuda-11.8/include -I/home/hofee/miniconda3/envs/gsnet/include/python3.9 -c
post_cflags = -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1011"' -DTORCH_EXTENSION_NAME=knn_pytorch -D_GLIBCXX_USE_CXX11_ABI=0 -std=c++17
cuda_cflags = -DWITH_CUDA -I/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src -I/home/hofee/miniconda3/envs/gsnet/lib/python3.9/site-packages/torch/include -I/home/hofee/miniconda3/envs/gsnet/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -I/home/hofee/miniconda3/envs/gsnet/lib/python3.9/site-packages/torch/include/TH -I/home/hofee/miniconda3/envs/gsnet/lib/python3.9/site-packages/torch/include/THC -I/usr/local/cuda-11.8/include -I/home/hofee/miniconda3/envs/gsnet/include/python3.9 -c
cuda_post_cflags = -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DCUDA_HAS_FP16=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1011"' -DTORCH_EXTENSION_NAME=knn_pytorch -D_GLIBCXX_USE_CXX11_ABI=0 -gencode=arch=compute_89,code=compute_89 -gencode=arch=compute_89,code=sm_89 -std=c++17
cuda_dlink_post_cflags =
ldflags =
rule compile
command = $cxx -MMD -MF $out.d $cflags -c $in -o $out $post_cflags
depfile = $out.d
deps = gcc
rule cuda_compile
depfile = $out.d
deps = gcc
command = $nvcc $cuda_cflags -c $in -o $out $cuda_post_cflags
build /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/build/temp.linux-x86_64-cpython-39/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/cpu/knn_cpu.o: compile /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/cpu/knn_cpu.cpp
build /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/build/temp.linux-x86_64-cpython-39/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/cuda/knn.o: cuda_compile /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/cuda/knn.cu
build /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/build/temp.linux-x86_64-cpython-39/mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/vision.o: compile /mnt/h/BaiduSyncdisk/workspace/ws_active_pose/project/GS-Net/GS-Net/knn/src/vision.cpp

Binary file not shown.

View File

@@ -0,0 +1,17 @@
import unittest
import gc
import operator as op
import functools
import torch
from torch.autograd import Variable, Function
from knn_pytorch import knn_pytorch
# import knn_pytorch
def knn(ref, query, k=1):
    """Compute the k nearest reference points for each query point.

    ref and query are (batch, dim, n) tensors on the same device; the result
    is a (batch, k, n_query) LongTensor of 1-based reference indices filled
    in by the extension.
    """
    device = ref.device
    # the extension expects float32 inputs on a common device
    ref = ref.float().to(device)
    query = query.float().to(device)
    # preallocate the index tensor that the extension fills in place
    inds = torch.empty(query.shape[0], k, query.shape[2], dtype=torch.long, device=device)
    knn_pytorch.knn(ref, query, inds)
    return inds
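A minimal usage sketch, assuming the wrapper above is importable as `knn` and that `ref` and `query` follow the (batch, dim, n_points) layout the extension expects; the returned indices are 1-based, as written by both the CPU and CUDA kernels.

import torch

ref = torch.rand(2, 3, 1000)        # (batch, dim, n_ref)
query = torch.rand(2, 3, 50)        # (batch, dim, n_query)
if torch.cuda.is_available():
    ref, query = ref.cuda(), query.cuda()
idx = knn(ref, query, k=5)          # (batch, k, n_query) LongTensor, 1-based
idx_zero_based = idx - 1            # convert to 0-based for use with torch indexing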

View File

@@ -0,0 +1,6 @@
Metadata-Version: 2.1
Name: knn-pytorch
Version: 0.1
Summary: KNN implementation in PyTorch 1.0, including both CPU and GPU versions
Home-page: https://github.com/foolyc/torchKNN
Author: foolyc

View File

@@ -0,0 +1,66 @@
#!/usr/bin/env python
import glob
import os
import torch
from setuptools import find_packages
from setuptools import setup
from torch.utils.cpp_extension import CUDA_HOME
from torch.utils.cpp_extension import CppExtension
from torch.utils.cpp_extension import CUDAExtension
requirements = ["torch", "torchvision"]
def get_extensions():
this_dir = os.path.dirname(os.path.abspath(__file__))
extensions_dir = os.path.join(this_dir, "src")
main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp"))
source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu"))
sources = main_file + source_cpu
extension = CppExtension
extra_compile_args = {"cxx": []}
define_macros = []
if torch.cuda.is_available() and CUDA_HOME is not None:
extension = CUDAExtension
sources += source_cuda
define_macros += [("WITH_CUDA", None)]
extra_compile_args["nvcc"] = [
"-DCUDA_HAS_FP16=1",
"-D__CUDA_NO_HALF_OPERATORS__",
"-D__CUDA_NO_HALF_CONVERSIONS__",
"-D__CUDA_NO_HALF2_OPERATORS__",
]
sources = [os.path.join(extensions_dir, s) for s in sources]
include_dirs = [extensions_dir]
ext_modules = [
extension(
"knn_pytorch.knn_pytorch",
sources,
include_dirs=include_dirs,
define_macros=define_macros,
extra_compile_args=extra_compile_args,
)
]
return ext_modules
setup(
name="knn_pytorch",
version="0.1",
author="foolyc",
url="https://github.com/foolyc/torchKNN",
description="KNN implement in Pytorch 1.0 including both cpu version and gpu version",
ext_modules=get_extensions(),
cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
)
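A hedged build sketch: this is a standard `torch.utils.cpp_extension` setup script, so an in-place build from the directory containing `setup.py` should produce the `knn_pytorch.knn_pytorch` module; the driver below is an assumed workflow, not taken from the repository.

import subprocess
import sys

# assumed: run from the knn/ directory containing setup.py
subprocess.check_call([sys.executable, "setup.py", "build_ext", "--inplace"])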

View File

@@ -0,0 +1,56 @@
#include "cpu/vision.h"
// Layout: ref_dev is (height x ref_width) and query_dev is (height x query_width), both row-major;
// dist_dev is a (query_width x ref_width) scratch buffer; ind_dev receives (k x query_width) 1-based indices.
void knn_cpu(float* ref_dev, int ref_width, float* query_dev, int query_width,
             int height, int k, float* dist_dev, long* ind_dev, long* ind_buf)
{
// Compute all the squared distances
for(int query_idx = 0;query_idx<query_width;query_idx++)
{
for(int ref_idx = 0;ref_idx < ref_width;ref_idx++)
{
dist_dev[query_idx * ref_width + ref_idx] = 0;
for(int hi=0;hi<height;hi++)
dist_dev[query_idx * ref_width + ref_idx] += (ref_dev[hi * ref_width + ref_idx] - query_dev[hi * query_width + query_idx]) * (ref_dev[hi * ref_width + ref_idx] - query_dev[hi * query_width + query_idx]);
}
}
float temp_value;
long temp_idx;
// Bubble-sort each query's distances and carry the (1-based) indices along with them
for(int query_idx = 0;query_idx<query_width;query_idx++)
{
for(int i = 0;i < ref_width;i++)
{
ind_buf[i] = i + 1;   // neighbor indices are 1-based
}
for(int i = 0;i < ref_width;i++)
for(int j = 0;j < ref_width - i - 1;j++)
{
if(dist_dev[query_idx * ref_width + j] > dist_dev[query_idx * ref_width + j + 1])
{
temp_value = dist_dev[query_idx * ref_width + j];
dist_dev[query_idx * ref_width + j] = dist_dev[query_idx * ref_width + j + 1];
dist_dev[query_idx * ref_width + j + 1] = temp_value;
temp_idx = ind_buf[j];
ind_buf[j] = ind_buf[j + 1];
ind_buf[j + 1] = temp_idx;
}
}
for(int i = 0;i < k;i++)
ind_dev[query_idx + i * query_width] = ind_buf[i];
#if DEBUG
for(int i = 0;i < ref_width;i++)
printf("%d, ", ind_buf[i]);
printf("\n");
#endif
}
}
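For cross-checking the layout above, a short PyTorch sketch of what knn_cpu computes, assuming ref is a row-major (dim, ref_width) matrix, query is (dim, query_width), and the output index matrix is (k, query_width) with 1-based entries; ties may be ordered differently than in the bubble sort.

import torch

def knn_cpu_reference(ref, query, k):
    # ref: (dim, n_ref), query: (dim, n_query), both float tensors
    d2 = ((ref[:, None, :] - query[:, :, None]) ** 2).sum(dim=0)  # (n_query, n_ref) squared distances
    order = d2.argsort(dim=1)[:, :k] + 1                          # k smallest, 1-based
    return order.t().contiguous()                                 # (k, n_query), matching ind_dev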

View File

@@ -0,0 +1,6 @@
#pragma once
#include <torch/extension.h>
void knn_cpu(float* ref_dev, int ref_width,
float* query_dev, int query_width,
int height, int k, float* dist_dev, long* ind_dev, long* ind_buf);

View File

@@ -0,0 +1,269 @@
/** Modified version of knn-CUDA from https://github.com/vincentfpgarcia/kNN-CUDA
* The modifications are
* removed texture memory usage
* removed split query KNN computation
* added feature extraction with bilinear interpolation
*
* Last modified by Christopher B. Choy <chrischoy@ai.stanford.edu> 12/23/2016
*/
// Includes
#include <cstdio>
#include "cuda.h"
#define IDX2D(i, j, dj) (dj * i + j)
#define IDX3D(i, j, k, dj, dk) (IDX2D(IDX2D(i, j, dj), k, dk))
#define BLOCK 512
#define MAX_STREAMS 512
// Constants used by the program
#define BLOCK_DIM 16
#define DEBUG 0
/**
* Computes the distance between two matrix A (reference points) and
* B (query points) containing respectively wA and wB points.
*
* @param A pointer on the matrix A
* @param wA width of the matrix A = number of points in A
* @param B pointer on the matrix B
* @param wB width of the matrix B = number of points in B
* @param dim dimension of points = height of matrices A and B
* @param AB pointer on the matrix containing the wA*wB distances computed
*/
__global__ void cuComputeDistanceGlobal( float* A, int wA,
float* B, int wB, int dim, float* AB){
// Declaration of the shared memory arrays As and Bs used to store the sub-matrix of A and B
__shared__ float shared_A[BLOCK_DIM][BLOCK_DIM];
__shared__ float shared_B[BLOCK_DIM][BLOCK_DIM];
// Sub-matrix of A (begin, step, end) and Sub-matrix of B (begin, step)
__shared__ int begin_A;
__shared__ int begin_B;
__shared__ int step_A;
__shared__ int step_B;
__shared__ int end_A;
// Thread index
int tx = threadIdx.x;
int ty = threadIdx.y;
// Other variables
float tmp;
float ssd = 0;
// Loop parameters
begin_A = BLOCK_DIM * blockIdx.y;
begin_B = BLOCK_DIM * blockIdx.x;
step_A = BLOCK_DIM * wA;
step_B = BLOCK_DIM * wB;
end_A = begin_A + (dim-1) * wA;
// Conditions
int cond0 = (begin_A + tx < wA); // used to write in shared memory
int cond1 = (begin_B + tx < wB); // used to write in shared memory, for computations, and to write to the output matrix
int cond2 = (begin_A + ty < wA); // used for computations and to write to the output matrix
// Loop over all the sub-matrices of A and B required to compute the block sub-matrix
for (int a = begin_A, b = begin_B; a <= end_A; a += step_A, b += step_B) {
// Load the matrices from device memory to shared memory; each thread loads one element of each matrix
if (a/wA + ty < dim){
shared_A[ty][tx] = (cond0)? A[a + wA * ty + tx] : 0;
shared_B[ty][tx] = (cond1)? B[b + wB * ty + tx] : 0;
}
else{
shared_A[ty][tx] = 0;
shared_B[ty][tx] = 0;
}
// Synchronize to make sure the matrices are loaded
__syncthreads();
// Compute the difference between the two matrices; each thread computes one element of the block sub-matrix
if (cond2 && cond1){
for (int k = 0; k < BLOCK_DIM; ++k){
tmp = shared_A[k][ty] - shared_B[k][tx];
ssd += tmp*tmp;
}
}
// Synchronize to make sure that the preceding computation is done before loading two new sub-matrices of A and B in the next iteration
__syncthreads();
}
// Write the block sub-matrix to device memory; each thread writes one element
if (cond2 && cond1)
AB[(begin_A + ty) * wB + begin_B + tx] = ssd;
}
/**
 * Gathers the k smallest distances of each column of the distance matrix at the top of that column.
*
* @param dist distance matrix
* @param ind index matrix
* @param width width of the distance matrix and of the index matrix
* @param height height of the distance matrix and of the index matrix
* @param k number of neighbors to consider
*/
__global__ void cuInsertionSort(float *dist, long *ind, int width, int height, int k){
// Variables
int l, i, j;
float *p_dist;
long *p_ind;
float curr_dist, max_dist;
long curr_row, max_row;
unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
if (xIndex<width){
// Pointer shift, initialization, and max value
p_dist = dist + xIndex;
p_ind = ind + xIndex;
max_dist = p_dist[0];
p_ind[0] = 1;
// Part 1 : sort the k first elements
for (l=1; l<k; l++){
curr_row = l * width;
curr_dist = p_dist[curr_row];
if (curr_dist<max_dist){
i=l-1;
for (int a=0; a<l-1; a++){
if (p_dist[a*width]>curr_dist){
i=a;
break;
}
}
for (j=l; j>i; j--){
p_dist[j*width] = p_dist[(j-1)*width];
p_ind[j*width] = p_ind[(j-1)*width];
}
p_dist[i*width] = curr_dist;
p_ind[i*width] = l+1;
} else {
p_ind[l*width] = l+1;
}
max_dist = p_dist[curr_row];
}
// Part 2 : insert the remaining elements among the k first lines
max_row = (k-1)*width;
for (l=k; l<height; l++){
curr_dist = p_dist[l*width];
if (curr_dist<max_dist){
i=k-1;
for (int a=0; a<k-1; a++){
if (p_dist[a*width]>curr_dist){
i=a;
break;
}
}
for (j=k-1; j>i; j--){
p_dist[j*width] = p_dist[(j-1)*width];
p_ind[j*width] = p_ind[(j-1)*width];
}
p_dist[i*width] = curr_dist;
p_ind[i*width] = l+1;
max_dist = p_dist[max_row];
}
}
}
}
/**
 * Computes the square root of the k first lines (width elements each)
 * of the distance matrix.
*
* @param dist distance matrix
* @param width width of the distance matrix
* @param k number of neighbors to consider
*/
__global__ void cuParallelSqrt(float *dist, int width, int k){
unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
if (xIndex<width && yIndex<k)
dist[yIndex*width + xIndex] = sqrt(dist[yIndex*width + xIndex]);
}
//-----------------------------------------------------------------------------------------------//
// K-th NEAREST NEIGHBORS //
//-----------------------------------------------------------------------------------------------//
/**
 * K nearest neighbor search on device memory
 * - Compute the squared distances between all reference and query points
 * - Sort each query's distances and record the indexes of the k nearest neighbors
 *
 * @param ref_dev    reference points ; device pointer to a dim x ref_nb matrix
 * @param ref_nb     number of reference points ; width of the matrix
 * @param query_dev  query points ; device pointer to a dim x query_nb matrix
 * @param query_nb   number of query points ; width of the matrix
 * @param dim        dimension of points ; height of the matrices
 * @param k          number of neighbors to consider
 * @param dist_dev   device buffer that receives the (squared) distances
 * @param ind_dev    device buffer that receives the indexes of the k nearest neighbors
 * @param stream     CUDA stream on which the kernels are launched
 */
void knn_device(float* ref_dev, int ref_nb, float* query_dev, int query_nb,
int dim, int k, float* dist_dev, long* ind_dev, cudaStream_t stream){
// Grids and threads
dim3 g_16x16(query_nb/16, ref_nb/16, 1);
dim3 t_16x16(16, 16, 1);
if (query_nb%16 != 0) g_16x16.x += 1;
if (ref_nb %16 != 0) g_16x16.y += 1;
//
dim3 g_256x1(query_nb/256, 1, 1);
dim3 t_256x1(256, 1, 1);
if (query_nb%256 != 0) g_256x1.x += 1;
dim3 g_k_16x16(query_nb/16, k/16, 1);
dim3 t_k_16x16(16, 16, 1);
if (query_nb%16 != 0) g_k_16x16.x += 1;
if (k %16 != 0) g_k_16x16.y += 1;
// Kernel 1: Compute all the distances
cuComputeDistanceGlobal<<<g_16x16, t_16x16, 0, stream>>>(ref_dev, ref_nb, query_dev, query_nb, dim, dist_dev);
// Kernel 2: Sort each column
cuInsertionSort<<<g_256x1, t_256x1, 0, stream>>>(dist_dev, ind_dev, query_nb, ref_nb, k);
// Kernel 3: Compute square root of k first elements
// cuParallelSqrt<<<g_k_16x16,t_k_16x16, 0, stream>>>(dist_dev, query_nb, k);
#if DEBUG
unsigned int size_of_float = sizeof(float);
unsigned long size_of_long = sizeof(long);
float* dist_host = new float[query_nb * k];
long* idx_host = new long[query_nb * k];
// Memory copy of output from device to host
cudaMemcpy(&dist_host[0], dist_dev,
query_nb * k *size_of_float, cudaMemcpyDeviceToHost);
cudaMemcpy(&idx_host[0], ind_dev,
query_nb * k * size_of_long, cudaMemcpyDeviceToHost);
for(int i = 0; i < 100; i++){
    printf("IDX[%d]: %d\n", i, (int)idx_host[i]);
}
delete[] dist_host;
delete[] idx_host;
#endif
}
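Because the cuParallelSqrt launch is commented out, knn_device leaves squared distances in dist_dev and writes 1-based indices of the k nearest reference points per query column. A hedged single-batch reference using torch.topk, under the same layout assumptions as the CPU sketch:

import torch

def knn_device_reference(ref, query, k):
    # ref: (dim, n_ref), query: (dim, n_query), float32 tensors
    d2 = torch.cdist(query.t(), ref.t()) ** 2               # (n_query, n_ref) squared distances
    idx = d2.topk(k, dim=1, largest=False).indices + 1      # 1-based, (n_query, k)
    return idx.t().contiguous()                             # (k, n_query), matching ind_dev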

View File

@@ -0,0 +1,9 @@
#pragma once
#include <torch/extension.h>
// #include <THC/THC.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAEvent.h>
void knn_device(float* ref_dev, int ref_width,
float* query_dev, int query_width,
int height, int k, float* dist_dev, long* ind_dev, cudaStream_t stream);

View File

@@ -0,0 +1,75 @@
#pragma once
#include "cpu/vision.h"
#ifdef WITH_CUDA
#include "cuda/vision.h"
// #include <THC/THC.h>
// extern THCState *state;
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAEvent.h>
#endif
int knn(at::Tensor& ref, at::Tensor& query, at::Tensor& idx)
{
// TODO check dimensions
long batch, ref_nb, query_nb, dim, k;
batch = ref.size(0);
dim = ref.size(1);
k = idx.size(1);
ref_nb = ref.size(2);
query_nb = query.size(2);
// float *ref_dev = ref.data<float>();
// float *query_dev = query.data<float>();
// long *idx_dev = idx.data<long>();
float *ref_dev = ref.data_ptr<float>();
float *query_dev = query.data_ptr<float>();
long *idx_dev = idx.data_ptr<long>();
// if (ref.type().is_cuda()) {
if (ref.is_cuda()) {
#ifdef WITH_CUDA
// TODO raise error if not compiled with CUDA
// float *dist_dev = (float*)THCudaMalloc(state, ref_nb * query_nb * sizeof(float));
float *dist_dev = (float*)c10::cuda::CUDACachingAllocator::raw_alloc(ref_nb * query_nb * sizeof(float));
for (int b = 0; b < batch; b++)
{
// knn_device(ref_dev + b * dim * ref_nb, ref_nb, query_dev + b * dim * query_nb, query_nb, dim, k,
// dist_dev, idx_dev + b * k * query_nb, THCState_getCurrentStream(state));
knn_device(ref_dev + b * dim * ref_nb, ref_nb, query_dev + b * dim * query_nb, query_nb, dim, k,
dist_dev, idx_dev + b * k * query_nb, c10::cuda::getCurrentCUDAStream());
}
// THCudaFree(state, dist_dev);
c10::cuda::CUDACachingAllocator::raw_delete(dist_dev);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
printf("error in knn: %s\n", cudaGetErrorString(err));
// THError("aborting");
}
return 1;
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
float *dist_dev = (float*)malloc(ref_nb * query_nb * sizeof(float));
long *ind_buf = (long*)malloc(ref_nb * sizeof(long));
for (int b = 0; b < batch; b++) {
knn_cpu(ref_dev + b * dim * ref_nb, ref_nb, query_dev + b * dim * query_nb, query_nb, dim, k,
dist_dev, idx_dev + b * k * query_nb, ind_buf);
}
free(dist_dev);
free(ind_buf);
return 1;
}
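The dispatcher above implies a shape and dtype contract that it does not itself validate (note the "TODO check dimensions"). A hedged Python-side check, assuming the (batch, dim, n) layout used in the pointer arithmetic:

import torch

def check_knn_args(ref, query, idx):
    # assumed preconditions for knn_pytorch.knn(ref, query, idx)
    assert ref.dtype == torch.float32 and query.dtype == torch.float32
    assert idx.dtype == torch.int64
    assert ref.dim() == query.dim() == idx.dim() == 3
    assert ref.size(0) == query.size(0) == idx.size(0)    # batch
    assert ref.size(1) == query.size(1)                    # dim
    assert idx.size(2) == query.size(2)                    # n_query
    assert ref.is_cuda == query.is_cuda == idx.is_cuda     # same device kind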

View File

@@ -0,0 +1,5 @@
#include "knn.h"
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("knn", &knn, "k-nearest neighbors");
}