diff --git a/app_split.py b/app_split.py
index 4ad462f..86e26d0 100644
--- a/app_split.py
+++ b/app_split.py
@@ -1,9 +1,9 @@
 from PytorchBoot.application import PytorchBootApplication
-from runners.data_splitor import DataSplitor
+from runners.data_spliter import DataSpliter
 
 
 @PytorchBootApplication("split")
 class DataSplitApp:
     @staticmethod
     def start():
-        DataSplitor(r"configs\split_dataset_config.yaml").run()
+        DataSpliter(r"configs\split_dataset_config.yaml").run()
\ No newline at end of file
diff --git a/configs/train_config.yaml b/configs/train_config.yaml
index 4def879..39ffc69 100644
--- a/configs/train_config.yaml
+++ b/configs/train_config.yaml
@@ -8,17 +8,70 @@ runner:
   experiment:
     name: debug
     root_dir: "experiments"
-
+    use_checkpoint: False
+    epoch: -1 # -1 stands for last epoch
+    max_epochs: 5
+    save_checkpoint_interval: 1
+    test_first: False
+
+  train:
+    optimizer:
+      type: Adam
+      lr: 0.0001
+    losses:
+      - mse_loss
+    dataset: OmniObject3d_train
+  test:
+    frequency: 3 # test frequency
     dataset_list:
       - OmniObject3d_train
+
+  pipeline: nbv_reconstruction_pipeline
 
 datasets:
   OmniObject3d_train:
     root_dir: "C:\\Document\\Local Project\\nbv_rec\\data\\sample"
     split_file: "C:\\Document\\Local Project\\nbv_rec\\data\\OmniObject3d_train.txt"
+    ratio: 1.0
+    batch_size: 1
+    num_workers: 12
+    pts_num: 2048
 
   OmniObject3d_test:
     root_dir: "C:\\Document\\Local Project\\nbv_rec\\data\\sample"
     split_file: "C:\\Document\\Local Project\\nbv_rec\\data\\OmniObject3d_test.txt"
+    eval_list:
+      - pose_diff
+    ratio: 1.0
+    batch_size: 1
+    num_workers: 1
+    pts_num: 2048
+
+module:
+
+  pointnet_encoder:
+    in_dim: 3
+    out_dim: 1024
+    global_feat: True
+    feature_transform: False
+
+  transformer_seq_encoder:
+    pts_embed_dim: 1024
+    pose_embed_dim: 256
+    num_heads: 4
+    ffn_dim: 256
+    num_layers: 3
+    output_dim: 2048
+
+  gf_view_finder:
+    regression_head: Rx_Ry_and_T
+    pose_mode: rot_matrix
+    per_point_feature: False
+    sample_mode: ode
+    sampling_steps: 500
+    sde_mode: ve
+
+  pose_encoder:
+    pose_dim: 9
+    output_dim: 256
\ No newline at end of file
diff --git a/core/dataset.py b/core/dataset.py
index 1337452..8756563 100644
--- a/core/dataset.py
+++ b/core/dataset.py
@@ -1,6 +1,7 @@
 import numpy as np
 from PytorchBoot.dataset import BaseDataset
 import PytorchBoot.stereotype as stereotype
+import torch
 import sys
 sys.path.append(r"C:\Document\Local Project\nbv_rec\nbv_reconstruction")
 
@@ -18,7 +19,7 @@ class NBVReconstructionDataset(BaseDataset):
         self.split_file_path = config["split_file"]
         self.scene_name_list = self.load_scene_name_list()
         self.datalist = self.get_datalist()
-        self.pts_num = 1024
+        self.pts_num = config["pts_num"]
 
     def load_scene_name_list(self):
         scene_name_list = []
@@ -76,13 +77,9 @@ class NBVReconstructionDataset(BaseDataset):
         nbv_idx, nbv_coverage_rate = nbv[0], nbv[1]
         nbv_path = DataLoadUtil.get_path(self.root_dir, scene_name, nbv_idx)
-        nbv_depth = DataLoadUtil.load_depth(nbv_path)
         cam_info = DataLoadUtil.load_cam_info(nbv_path)
-        nbv_mask = DataLoadUtil.load_seg(nbv_path)
         best_frame_to_world = cam_info["cam_to_world"]
         best_to_1_pose = np.dot(np.linalg.inv(first_frame_to_world), best_frame_to_world)
-        best_target_point_cloud = DataLoadUtil.get_target_point_cloud(nbv_depth, cam_info["cam_intrinsic"], best_to_1_pose, nbv_mask)["points_world"]
-        downsampled_best_target_point_cloud = PtsUtil.random_downsample_point_cloud(best_target_point_cloud, self.pts_num)
         best_to_1_6d = PoseUtil.matrix_to_rotation_6d_numpy(np.asarray(best_to_1_pose[:3,:3]))
         best_to_1_trans = best_to_1_pose[:3,3]
         best_to_1_9d = np.concatenate([best_to_1_6d, best_to_1_trans], axis=0)
@@ -91,7 +88,6 @@ class NBVReconstructionDataset(BaseDataset):
             "scanned_pts": np.asarray(scanned_views_pts,dtype=np.float32),
             "scanned_coverage_rate": np.asarray(scanned_coverages_rate,dtype=np.float32),
             "scanned_n_to_1_pose_9d": np.asarray(scanned_n_to_1_pose,dtype=np.float32),
-            "best_pts": np.asarray(downsampled_best_target_point_cloud,dtype=np.float32),
             "best_coverage_rate": nbv_coverage_rate,
             "best_to_1_pose_9d": best_to_1_9d,
             "max_coverage_rate": max_coverage_rate,
@@ -102,6 +98,27 @@ class NBVReconstructionDataset(BaseDataset):
 
     def __len__(self):
         return len(self.datalist)
+
+    def get_collate_fn(self):
+        # Keep variable-length fields as plain lists; the sequence encoder
+        # pads them later, so nothing is stacked here.
+        def collate_fn(batch):
+            scanned_pts = [item['scanned_pts'] for item in batch]
+            scanned_n_to_1_pose_9d = [item['scanned_n_to_1_pose_9d'] for item in batch]
+            rest = {}
+            for key in batch[0].keys():
+                if key in ['scanned_pts', 'scanned_n_to_1_pose_9d']:
+                    continue
+                if isinstance(batch[0][key], torch.Tensor):
+                    rest[key] = torch.stack([item[key] for item in batch])
+                else:
+                    rest[key] = [item[key] for item in batch]
+            return {
+                'scanned_pts': scanned_pts,
+                'scanned_n_to_1_pose_9d': scanned_n_to_1_pose_9d,
+                **rest
+            }
+        return collate_fn
 
 if __name__ == "__main__":
     import torch
@@ -111,9 +128,10 @@ if __name__ == "__main__":
     config = {
         "root_dir": "C:\\Document\\Local Project\\nbv_rec\\data\\sample",
         "split_file": "C:\\Document\\Local Project\\nbv_rec\\data\\OmniObject3d_train.txt",
-        "ratio": 0.05,
-        "batch_size": 1,
+        "ratio": 0.5,
+        "batch_size": 2,
         "num_workers": 0,
+        "pts_num": 2048
     }
     ds = NBVReconstructionDataset(config)
     print(len(ds))
@@ -126,11 +144,18 @@ if __name__ == "__main__":
         for pts in data["scanned_pts"][0]:
             #np.savetxt(f"pts_{cnt}.txt", pts)
             cnt+=1
-        best_pts = data["best_pts"][0]
         #np.savetxt("best_pts.txt", best_pts)
         for key, value in data.items():
             if isinstance(value, torch.Tensor):
                 print(key, ":", value.shape)
+            else:
+                print(key, ":", len(value))
+            if key == "scanned_n_to_1_pose_9d":
+                for val in value:
+                    print(val.shape)
+            if key == "scanned_pts":
+                for val in value:
+                    print(val.shape)
         print()
\ No newline at end of file
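Note: `PoseUtil.matrix_to_rotation_6d_numpy` is not part of this patch, so its exact convention is an assumption here. A minimal sketch of the 9D pose built in `__getitem__`, using the continuous 6D rotation representation of Zhou et al. (CVPR 2019) and assuming a first-two-rows convention:

import numpy as np

# Hypothetical stand-in for PoseUtil.matrix_to_rotation_6d_numpy: keep the
# first two rows of R; the third row is recoverable by Gram-Schmidt, so the
# mapping stays continuous (unlike Euler angles or quaternions).
def matrix_to_rotation_6d(R: np.ndarray) -> np.ndarray:
    return R[:2, :].reshape(6)

best_to_1_pose = np.eye(4)                        # placeholder relative pose
r6 = matrix_to_rotation_6d(best_to_1_pose[:3, :3])
t = best_to_1_pose[:3, 3]
pose_9d = np.concatenate([r6, t], axis=0)         # shape (9,), like best_to_1_9d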
diff --git a/modules/pose_encoder.py b/modules/pose_encoder.py
index aeeeb85..40b67fd 100644
--- a/modules/pose_encoder.py
+++ b/modules/pose_encoder.py
@@ -7,12 +7,13 @@ class PoseEncoder(nn.Module):
         super(PoseEncoder, self).__init__()
         self.config = config
         pose_dim = config["pose_dim"]
+        out_dim = config["output_dim"]
         self.act = nn.ReLU(True)
 
         self.pose_encoder = nn.Sequential(
-            nn.Linear(pose_dim, 256),
+            nn.Linear(pose_dim, out_dim),
             self.act,
-            nn.Linear(256, 256),
+            nn.Linear(out_dim, out_dim),
             self.act,
         )
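For reference, a minimal usage sketch of the reworked `PoseEncoder` (config keys mirror `train_config.yaml`; the inner `Sequential` is called directly because the module's forward method lies outside this patch):

import torch
from modules.pose_encoder import PoseEncoder

# Encode a batch of 9D relative poses (6D rotation + translation) into
# fixed-size embeddings.
encoder = PoseEncoder({"pose_dim": 9, "output_dim": 256})
poses = torch.randn(4, 9)                 # four pose_9d vectors
embeddings = encoder.pose_encoder(poses)  # -> torch.Size([4, 256])
print(embeddings.shape)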
diff --git a/modules/transformer_seq_encoder.py b/modules/transformer_seq_encoder.py
index ba50f7d..79f151c 100644
--- a/modules/transformer_seq_encoder.py
+++ b/modules/transformer_seq_encoder.py
@@ -1,62 +1,78 @@
 import torch
 from torch import nn
-
+from torch.nn.utils.rnn import pad_sequence
 import PytorchBoot.stereotype as stereotype
+
 @stereotype.module("transformer_seq_encoder")
 class TransformerSequenceEncoder(nn.Module):
     def __init__(self, config):
         super(TransformerSequenceEncoder, self).__init__()
         self.config = config
-        embed_dim = config['pts_embed_dim'] + config['pose_embed_dim']
-        self.positional_encoding = nn.Parameter(torch.zeros(1, config['max_seq_len'], embed_dim))
-        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=config['num_heads'], dim_feedforward=config['ffn_dim'])
-        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=config['num_layers'])
-        self.fc = nn.Linear(embed_dim, config['output_dim'])
+        embed_dim = config["pts_embed_dim"] + config["pose_embed_dim"]
+        encoder_layer = nn.TransformerEncoderLayer(
+            d_model=embed_dim,
+            nhead=config["num_heads"],
+            dim_feedforward=config["ffn_dim"],
+            batch_first=True,
+        )
+        self.transformer_encoder = nn.TransformerEncoder(
+            encoder_layer, num_layers=config["num_layers"]
+        )
+        self.fc = nn.Linear(embed_dim, config["output_dim"])
 
     def encode_sequence(self, pts_embedding_list_batch, pose_embedding_list_batch):
-        batch_size = len(pts_embedding_list_batch)
+        # Combine per-view features and record each sequence's true length
         combined_features_batch = []
-
-        for i in range(batch_size):
-            combined_features = [torch.cat((pts_embed, pose_embed), dim=-1)
-                                 for pts_embed, pose_embed in zip(pts_embedding_list_batch[i][:-1], pose_embedding_list_batch[i][:-1])]
-            combined_features_batch.append(torch.stack(combined_features))
-
-        combined_tensor = torch.stack(combined_features_batch)  # Shape: [batch_size, seq_len-1, embed_dim]
-
-        # Adjust positional encoding to match batch size
-        pos_encoding = self.positional_encoding[:, :combined_tensor.size(1), :].repeat(batch_size, 1, 1)
-        combined_tensor = combined_tensor + pos_encoding
+        lengths = []
+        for pts_embedding_list, pose_embedding_list in zip(pts_embedding_list_batch, pose_embedding_list_batch):
+            combined_features = [
+                torch.cat((pts_embed, pose_embed), dim=-1)
+                for pts_embed, pose_embed in zip(pts_embedding_list, pose_embedding_list)
+            ]
+            combined_features_batch.append(torch.stack(combined_features))
+            lengths.append(len(combined_features))
+
+        combined_tensor = pad_sequence(combined_features_batch, batch_first=True)  # Shape: [batch_size, max_seq_len, embed_dim]
+
+        # Padding mask: True marks padded positions the encoder must ignore
+        max_len = max(lengths)
+        padding_mask = torch.tensor(
+            [[0] * length + [1] * (max_len - length) for length in lengths],
+            dtype=torch.bool,
+            device=combined_tensor.device,
+        )
 
         # Transformer encoding
-        transformer_output = self.transformer_encoder(combined_tensor)
+        transformer_output = self.transformer_encoder(combined_tensor, src_key_padding_mask=padding_mask)
 
-        # Mean pooling
-        final_feature = transformer_output.mean(dim=1)
+        # Masked mean pooling (exclude padded positions from the average)
+        valid = (~padding_mask).unsqueeze(-1).to(transformer_output.dtype)
+        final_feature = (transformer_output * valid).sum(dim=1) / valid.sum(dim=1)
 
         # Fully connected layer
         final_output = self.fc(final_feature)
 
         return final_output
 
+
 if __name__ == "__main__":
     config = {
-        'pts_embed_dim': 1024,   # dimension of each point-cloud embedding
-        'pose_embed_dim': 256,   # dimension of each pose embedding
-        'num_heads': 4,          # number of attention heads
-        'ffn_dim': 256,          # feed-forward network dimension
-        'num_layers': 3,         # number of Transformer encoder layers
-        'max_seq_len': 10,       # maximum sequence length
-        'output_dim': 2048,      # output feature dimension
+        "pts_embed_dim": 1024,
+        "pose_embed_dim": 256,
+        "num_heads": 4,
+        "ffn_dim": 256,
+        "num_layers": 3,
+        "output_dim": 2048,
     }
 
     encoder = TransformerSequenceEncoder(config)
-    seq_len = 5
+    seq_len = [5, 8, 9, 4]
     batch_size = 4
-
-    pts_embedding_list_batch = [torch.randn(seq_len, config['pts_embed_dim']) for _ in range(batch_size)]
-    pose_embedding_list_batch = [torch.randn(seq_len, config['pose_embed_dim']) for _ in range(batch_size)]
-    output_feature = encoder.encode_sequence(pts_embedding_list_batch, pose_embedding_list_batch)
+
+    pts_embedding_list_batch = [
+        torch.randn(seq_len[idx], config["pts_embed_dim"]) for idx in range(batch_size)
+    ]
+    pose_embedding_list_batch = [
+        torch.randn(seq_len[idx], config["pose_embed_dim"]) for idx in range(batch_size)
+    ]
+    output_feature = encoder.encode_sequence(
+        pts_embedding_list_batch, pose_embedding_list_batch
+    )
     print("Encoded Feature:", output_feature)
     print("Feature Shape:", output_feature.shape)
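For reference, a minimal sketch of the padding scheme `encode_sequence` now relies on: `pad_sequence` right-pads shorter sequences with zeros, and a boolean `src_key_padding_mask` (True = ignore) tells the transformer which positions are padding:

import torch
from torch.nn.utils.rnn import pad_sequence

seqs = [torch.randn(5, 8), torch.randn(3, 8)]  # two variable-length sequences
padded = pad_sequence(seqs, batch_first=True)  # -> shape (2, 5, 8), zero-padded
lengths = torch.tensor([5, 3])

# True marks positions the encoder should ignore.
mask = torch.arange(padded.size(1))[None, :] >= lengths[:, None]
print(mask)
# tensor([[False, False, False, False, False],
#         [False, False, False,  True,  True]])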
diff --git a/runners/data_splitor.py b/runners/data_spliter.py
similarity index 94%
rename from runners/data_splitor.py
rename to runners/data_spliter.py
index eb4a661..12796c7 100644
--- a/runners/data_splitor.py
+++ b/runners/data_spliter.py
@@ -6,8 +6,8 @@ from PytorchBoot.utils import Log
 import PytorchBoot.stereotype as stereotype
 from PytorchBoot.status import status_manager
 
-@stereotype.runner("data_splitor")
-class DataSplitor(Runner):
+@stereotype.runner("data_spliter")
+class DataSpliter(Runner):
     def __init__(self, config):
         super().__init__(config)
         self.load_experiment("data_split")
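Note: after a rename like `DataSplitor` -> `DataSpliter`, a quick repo-wide search is a cheap guard against stale references to the old names; this sketch assumes `git` is on PATH:

import subprocess

# git grep exits non-zero when nothing matches, so don't use check=True.
result = subprocess.run(
    ["git", "grep", "-n", "-E", "DataSplitor|data_splitor"],
    capture_output=True, text=True,
)
assert result.stdout == "", f"stale references remain:\n{result.stdout}"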