diff --git a/README.md b/README.md
index af762ca..13b3ca6 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,4 @@
 # Roadmap
-* Change simulation scenes
 * Interface with GS-Net
 
 # Updated installation steps fo my PC environment
diff --git a/cfg/active_grasp.yaml b/cfg/active_grasp.yaml
index dfe9b55..471a5d1 100644
--- a/cfg/active_grasp.yaml
+++ b/cfg/active_grasp.yaml
@@ -2,7 +2,7 @@ bt_sim:
   gui: True
   gripper_force: 10
   # scene: random
-  scene: $(find active_grasp)/cfg/sim/challenging_scene_1.yaml
+  scene: $(find active_grasp)/cfg/sim/challenging_scene_2.yaml
 
 hw:
   roi_calib_file: $(find active_grasp)/cfg/hw/T_base_tag.txt
diff --git a/cfg/sim/challenging_scene_1.yaml b/cfg/sim/challenging_scene_1.yaml
index 9b7ce54..9181f11 100644
--- a/cfg/sim/challenging_scene_1.yaml
+++ b/cfg/sim/challenging_scene_1.yaml
@@ -2,10 +2,10 @@ center: [0.5, 0.0, 0.25]
 q: [0.0, -1.39, 0.0, -2.36, 0.0, 1.57, 0.79]
 objects:
 - object_id: ycb/006_mustard_bottle
-  xyz: [0.0, 0.0, 1.6]
+  xyz: [0.0, 0.0, 0.03]
   rpy: [0, 0, 0]
   scale: 0.8
-- object_id: active_perception/box2
-  xyz: [0.0, 0.0, 1.3]
+- object_id: active_perception/box
+  xyz: [0.0, 0.0, 0.0]
   rpy: [0, 0, 0]
-  scale: 0.85
+  scale: 0.8
diff --git a/cfg/sim/challenging_scene_2.yaml b/cfg/sim/challenging_scene_2.yaml
new file mode 100644
index 0000000..c35912c
--- /dev/null
+++ b/cfg/sim/challenging_scene_2.yaml
@@ -0,0 +1,11 @@
+center: [0.5, 0.0, 0.25]
+q: [0.0, -1.39, 0.0, -2.36, 0.0, 1.57, 0.79]
+objects:
+- object_id: ycb/006_mustard_bottle
+  xyz: [0.0, 0.0, 0.03]
+  rpy: [0, 0, 0]
+  scale: 0.8
+- object_id: active_perception/box2
+  xyz: [0.19, 0.0, 0.0]
+  rpy: [0, 0, 0]
+  scale: 1.2
diff --git a/src/active_grasp/__init__.py b/src/active_grasp/__init__.py
index 5833f63..9e5127e 100644
--- a/src/active_grasp/__init__.py
+++ b/src/active_grasp/__init__.py
@@ -8,5 +8,5 @@ register("top-view", TopView)
 register("top-trajectory", TopTrajectory)
 register("fixed-trajectory", FixedTrajectory)
 register("nbv", NextBestView)
-register("ap-multi-view", ActivePerceptionMultiViewPolicy)
+# register("ap-multi-view", ActivePerceptionMultiViewPolicy)
 register("ap-single-view", ActivePerceptionSingleViewPolicy)
\ No newline at end of file
diff --git a/src/active_grasp/active_perception_policy.py b/src/active_grasp/active_perception_policy.py
index ab3ec3c..b05599b 100644
--- a/src/active_grasp/active_perception_policy.py
+++ b/src/active_grasp/active_perception_policy.py
@@ -8,9 +8,9 @@ from .active_perception_demo import APInferenceEngine
 from robot_helpers.spatial import Transform
 import torch
 import torch.nn.functional as F
-
-
+import requests
 import matplotlib.pyplot as plt
+from vgn.grasp import ParallelJawGrasp
 
 
 class RealTime3DVisualizer:
@@ -51,146 +51,152 @@ class RealTime3DVisualizer:
         plt.pause(0.001)
 
 
-class ActivePerceptionMultiViewPolicy(MultiViewPolicy):
-    def __init__(self):
-        super().__init__()
-        self.max_views = rospy.get_param("ap_grasp/max_views")
-        self.ap_config_path = rospy.get_param("ap_grasp/ap_config_path")
-        self.max_inference_num = rospy.get_param("ap_grasp/max_inference_num")
-        self.ap_inference_engine = APInferenceEngine(self.ap_config_path)
-        self.pcdvis = RealTime3DVisualizer()
+# class ActivePerceptionMultiViewPolicy(MultiViewPolicy):
+#     def __init__(self):
+#         super().__init__()
+#         self.max_views = rospy.get_param("ap_grasp/max_views")
+#         self.ap_config_path = rospy.get_param("ap_grasp/ap_config_path")
+#         self.max_inference_num = rospy.get_param("ap_grasp/max_inference_num")
+#         self.ap_inference_engine = APInferenceEngine(self.ap_config_path)
+#         self.pcdvis = RealTime3DVisualizer()
 
-    def update(self, img, seg, target_id, x, q):
-        if len(self.views) > self.max_views or self.best_grasp_prediction_is_stable():
-            self.done = True
-        else:
-            with Timer("state_update"):
-                self.integrate(img, x, q)
+#     def update(self, img, seg, target_id, x, q):
+#         if len(self.views) > self.max_views or self.best_grasp_prediction_is_stable():
+#             self.done = True
+#         else:
+#             with Timer("state_update"):
+#                 self.integrate(img, x, q)
 
-            # When policy hasn't produced an available grasp
-            c = 0
-            while(c < self.max_inference_num):
-                # Inference with our model
-                target_points, scene_points = self.depth_image_to_ap_input(img, seg, target_id)
-                ap_input = {'target_pts': target_points,
-                            'scene_pts': scene_points}
-                ap_output = self.ap_inference_engine.inference(ap_input)
-                c += 1
-                delta_rot_6d = ap_output['estimated_delta_rot_6d']
+#             # When policy hasn't produced an available grasp
+#             c = 0
+#             while(c < self.max_inference_num):
+#                 # Inference with our model
+#                 target_points, scene_points = self.depth_image_to_ap_input(img, seg, target_id)
+#                 ap_input = {'target_pts': target_points,
+#                             'scene_pts': scene_points}
+#                 ap_output = self.ap_inference_engine.inference(ap_input)
+#                 c += 1
+#                 delta_rot_6d = ap_output['estimated_delta_rot_6d']
 
-                current_cam_pose = torch.from_numpy(x.as_matrix()).float().to("cuda:0")
-                est_delta_rot_mat = self.rotation_6d_to_matrix_tensor_batch(delta_rot_6d)[0]
-                look_at_center = torch.from_numpy(self.bbox.center).float().to("cuda:0")
-                nbv_tensor = self.get_transformed_mat(current_cam_pose,
-                                                      est_delta_rot_mat,
-                                                      look_at_center)
-                nbv_numpy = nbv_tensor.cpu().numpy()
-                nbv_transform = Transform.from_matrix(nbv_numpy)
-                x_d = nbv_transform
+#                 current_cam_pose = torch.from_numpy(x.as_matrix()).float().to("cuda:0")
+#                 est_delta_rot_mat = self.rotation_6d_to_matrix_tensor_batch(delta_rot_6d)[0]
+#                 look_at_center = torch.from_numpy(self.bbox.center).float().to("cuda:0")
+#                 nbv_tensor = self.get_transformed_mat(current_cam_pose,
+#                                                       est_delta_rot_mat,
+#                                                       look_at_center)
+#                 nbv_numpy = nbv_tensor.cpu().numpy()
+#                 nbv_transform = Transform.from_matrix(nbv_numpy)
+#                 x_d = nbv_transform
 
-                # Check if this pose available
-                if(self.solve_cam_ik(self.q0, x_d)):
-                    self.x_d = x_d
-                    self.updated = True
-                    print("Found an NBV!")
-                    break
+#                 # Check if this pose available
+#                 if(self.solve_cam_ik(self.q0, x_d)):
+#                     self.x_d = x_d
+#                     self.updated = True
+#                     print("Found an NBV!")
+#                     break
 
-    def vis_cam_pose(self, x):
-        # Integrate
-        self.views.append(x)
-        self.vis.path(self.base_frame, self.intrinsic, self.views)
+#     def vis_cam_pose(self, x):
+#         # Integrate
+#         self.views.append(x)
+#         self.vis.path(self.base_frame, self.intrinsic, self.views)
 
-    def vis_scene_cloud(self, img, x):
-        self.tsdf.integrate(img, self.intrinsic, x.inv() * self.T_base_task)
-        scene_cloud = self.tsdf.get_scene_cloud()
-        self.vis.scene_cloud(self.task_frame, np.asarray(scene_cloud.points))
+#     def vis_scene_cloud(self, img, x):
+#         self.tsdf.integrate(img, self.intrinsic, x.inv() * self.T_base_task)
+#         scene_cloud = self.tsdf.get_scene_cloud()
+#         self.vis.scene_cloud(self.task_frame, np.asarray(scene_cloud.points))
 
-    def rotation_6d_to_matrix_tensor_batch(self, d6: torch.Tensor) -> torch.Tensor:
-        a1, a2 = d6[..., :3], d6[..., 3:]
-        b1 = F.normalize(a1, dim=-1)
-        b2 = a2 - (b1 * a2).sum(-1, keepdim=True) * b1
-        b2 = F.normalize(b2, dim=-1)
-        b3 = torch.cross(b1, b2, dim=-1)
-        return torch.stack((b1, b2, b3), dim=-2)
+#     def rotation_6d_to_matrix_tensor_batch(self, d6: torch.Tensor) -> torch.Tensor:
+#         a1, a2 = d6[..., :3], d6[..., 3:]
+#         b1 = F.normalize(a1, dim=-1)
+#         b2 = a2 - (b1 * a2).sum(-1, keepdim=True) * b1
+#         b2 = F.normalize(b2, dim=-1)
+#         b3 = torch.cross(b1, b2, dim=-1)
+#         return torch.stack((b1, b2, b3), dim=-2)
 
-    def get_transformed_mat(self, src_mat, delta_rot, target_center_w):
-        src_rot = src_mat[:3, :3]
-        dst_rot = src_rot @ delta_rot.T
-        dst_mat = torch.eye(4).to(dst_rot.device)
-        dst_mat[:3, :3] = dst_rot
-        distance = torch.norm(target_center_w - src_mat[:3, 3])
-        z_axis_camera = dst_rot[:3, 2].reshape(-1)
-        new_camera_position_w = target_center_w - distance * z_axis_camera
-        dst_mat[:3, 3] = new_camera_position_w
-        return dst_mat
+#     def get_transformed_mat(self, src_mat, delta_rot, target_center_w):
+#         src_rot = src_mat[:3, :3]
+#         dst_rot = src_rot @ delta_rot.T
+#         dst_mat = torch.eye(4).to(dst_rot.device)
+#         dst_mat[:3, :3] = dst_rot
+#         distance = torch.norm(target_center_w - src_mat[:3, 3])
+#         z_axis_camera = dst_rot[:3, 2].reshape(-1)
+#         new_camera_position_w = target_center_w - distance * z_axis_camera
+#         dst_mat[:3, 3] = new_camera_position_w
+#         return dst_mat
 
-    def depth_image_to_ap_input(self, depth_img, seg_img, target_id):
-        target_points = []
-        scene_points = []
+#     def depth_image_to_ap_input(self, depth_img, seg_img, target_id):
+#         target_points = []
+#         scene_points = []
 
-        K = self.intrinsic.K
-        depth_shape = depth_img.shape
-        seg_shape = seg_img.shape
-        if(depth_shape == seg_shape):
-            img_shape = depth_shape
-        else:
-            print("Depth image shape and segmentation image shape are not the same")
-            return None
+#         K = self.intrinsic.K
+#         depth_shape = depth_img.shape
+#         seg_shape = seg_img.shape
+#         if(depth_shape == seg_shape):
+#             img_shape = depth_shape
+#         else:
+#             print("Depth image shape and segmentation image shape are not the same")
+#             return None
 
-        # Convert depth image to 3D points
-        u_indices , v_indices = np.meshgrid(np.arange(img_shape[1]), np.arange(img_shape[0]))
-        x_factors = (u_indices - K[0, 2]) / K[0, 0]
-        y_factors = (v_indices - K[1, 2]) / K[1, 1]
-        z_mat = depth_img
-        x_mat = x_factors * z_mat
-        y_mat = y_factors * z_mat
-        for i in range(img_shape[0]):
-            for j in range(img_shape[1]):
-                seg_id = seg_img[i, j]
-                x = x_mat[i][j]
-                y = y_mat[i][j]
-                z = z_mat[i][j]
-                if(int(seg_id) == int(target_id)):
-                    # This pixel belongs to the target object to be grasped
-                    target_points.append([x,y,z])
-                else:
-                    # This pixel belongs to the scene
-                    scene_points.append([x,y,z])
+#         # Convert depth image to 3D points
+#         u_indices , v_indices = np.meshgrid(np.arange(img_shape[1]), np.arange(img_shape[0]))
+#         x_factors = (u_indices - K[0, 2]) / K[0, 0]
+#         y_factors = (v_indices - K[1, 2]) / K[1, 1]
+#         z_mat = depth_img
+#         x_mat = x_factors * z_mat
+#         y_mat = y_factors * z_mat
+#         for i in range(img_shape[0]):
+#             for j in range(img_shape[1]):
+#                 seg_id = seg_img[i, j]
+#                 x = x_mat[i][j]
+#                 y = y_mat[i][j]
+#                 z = z_mat[i][j]
+#                 if(int(seg_id) == int(target_id)):
+#                     # This pixel belongs to the target object to be grasped
+#                     target_points.append([x,y,z])
+#                 else:
+#                     # This pixel belongs to the scene
+#                     scene_points.append([x,y,z])
 
-        target_points = np.asarray(target_points)
-        target_points = target_points.reshape(1, target_points.shape[0], 3)
-        # self.pcdvis.update_points(target_points)
-        target_points = torch.from_numpy(target_points).float().to("cuda:0")
-        scene_points = np.asarray(scene_points)
-        scene_points = scene_points.reshape(1, scene_points.shape[0], 3)
-        scene_points = torch.from_numpy(scene_points).float().to("cuda:0")
+#         target_points = np.asarray(target_points)
+#         target_points = target_points.reshape(1, target_points.shape[0], 3)
+#         # self.pcdvis.update_points(target_points)
+#         target_points = torch.from_numpy(target_points).float().to("cuda:0")
+#         scene_points = np.asarray(scene_points)
+#         scene_points = scene_points.reshape(1, scene_points.shape[0], 3)
+#         scene_points = torch.from_numpy(scene_points).float().to("cuda:0")
 
-        return target_points, scene_points
+#         return target_points, scene_points
 
-    def best_grasp_prediction_is_stable(self):
-        if self.best_grasp:
-            t = (self.T_task_base * self.best_grasp.pose).translation
-            i, j, k = (t / self.tsdf.voxel_size).astype(int)
-            qs = self.qual_hist[:, i, j, k]
-            if np.count_nonzero(qs) == self.T and np.mean(qs) > 0.9:
-                return True
-        return False
+#     def best_grasp_prediction_is_stable(self):
+#         if self.best_grasp:
+#             t = (self.T_task_base * self.best_grasp.pose).translation
+#             i, j, k = (t / self.tsdf.voxel_size).astype(int)
+#             qs = self.qual_hist[:, i, j, k]
+#             if np.count_nonzero(qs) == self.T and np.mean(qs) > 0.9:
+#                 return True
+#         return False
 
 
 class ActivePerceptionSingleViewPolicy(SingleViewPolicy):
-    def __init__(self):
+    def __init__(self, flask_base_url="http://127.0.0.1:5000"):
         super().__init__()
         self.max_views = rospy.get_param("ap_grasp/max_views")
         self.ap_config_path = rospy.get_param("ap_grasp/ap_config_path")
        self.ap_inference_engine = APInferenceEngine(self.ap_config_path)
         self.pcdvis = RealTime3DVisualizer()
         self.updated = False
+        self._base_url = flask_base_url
 
+    def request_grasping_pose(self, data):
+        response = requests.post(f"{self._base_url}/get_gsnet_grasp", json=data)
+        return response.json()
+
+
     def update(self, img, seg, target_id, x, q):
         # Visualize scene cloud
         self.vis_scene_cloud(img, x)
@@ -201,8 +207,8 @@ class ActivePerceptionSingleViewPolicy(SingleViewPolicy):
         # When policy hasn't produced an available grasp
         while(self.updated == False):
             # Inference with our model
-            target_points, scene_points = self.depth_image_to_ap_input(img, seg, target_id)
-            ap_input = {'target_pts': target_points,
+            self.target_points, scene_points = self.depth_image_to_ap_input(img, seg, target_id)
+            ap_input = {'target_pts': self.target_points,
                         'scene_pts': scene_points}
             ap_output = self.ap_inference_engine.inference(ap_input)
             delta_rot_6d = ap_output['estimated_delta_rot_6d']
@@ -210,7 +216,7 @@
 
             current_cam_pose = torch.from_numpy(x.as_matrix()).float().to("cuda:0")
             est_delta_rot_mat = self.rotation_6d_to_matrix_tensor_batch(delta_rot_6d)[0]
-            target_points_np = target_points.cpu().numpy()[0,:,:]
+            target_points_np = self.target_points.cpu().numpy()[0,:,:]
             central_point_of_target = np.mean(target_points_np, axis=0)
             look_at_center = torch.from_numpy(central_point_of_target).float().to("cuda:0")
             # Convert look_at_center's reference frame to arm frame
@@ -234,9 +240,18 @@ class ActivePerceptionSingleViewPolicy(SingleViewPolicy):
                 self.updated = True
                 print("Found an NBV!")
                 return
-        # Policy has produced an available grasp
+        # Policy has produced an available nbv
         if(self.updated == True):
-            self.generate_grasp(q)
+            data = np.asarray(self.target_points.cpu().numpy())[0].tolist()
+            best_grasp_T = np.asarray(self.request_grasping_pose(data))
+            current_cam_pose = torch.from_numpy(x.as_matrix()).float().to("cuda:0")
+            best_grasp_T = current_cam_pose.cpu().numpy() @ best_grasp_T
+            pose = Transform.from_matrix(best_grasp_T)
+            width = 0.1
+            grasp = ParallelJawGrasp(pose, width)
+            self.best_grasp = grasp
+            self.vis.grasp(self.base_frame, self.best_grasp, 0.9)
+            # self.generate_grasp(q)
             self.done = True
 
     def deactivate(self):
@@ -253,6 +268,19 @@ class ActivePerceptionSingleViewPolicy(SingleViewPolicy):
         scene_cloud = self.tsdf.get_scene_cloud()
         self.vis.scene_cloud(self.task_frame, np.asarray(scene_cloud.points))
 
+    def generate_gsnet_grasp(self, q):
+        tsdf_grid = self.tsdf.get_grid()
+        out = self.vgn.predict(tsdf_grid)
+        self.vis.quality(self.task_frame, self.tsdf.voxel_size, out.qual, 0.9)
+        grasps, qualities = self.filter_grasps(out, q)
+
+        if len(grasps) > 0:
+            self.best_grasp, quality = self.select_best_grasp(grasps, qualities)
+            self.vis.grasp(self.base_frame, self.best_grasp, quality)
+        else:
+            self.best_grasp = None
+            self.vis.clear_grasp()
+
     def generate_grasp(self, q):
         tsdf_grid = self.tsdf.get_grid()
         out = self.vgn.predict(tsdf_grid)
diff --git a/src/active_grasp/controller.py b/src/active_grasp/controller.py
index e10a256..9338865 100644
--- a/src/active_grasp/controller.py
+++ b/src/active_grasp/controller.py
@@ -107,7 +107,7 @@ class GraspController:
         self.view_sphere = ViewHalfSphere(bbox, self.min_z_dist)
         self.policy.activate(bbox, self.view_sphere)
         timer = rospy.Timer(rospy.Duration(1.0 / self.control_rate), self.send_vel_cmd)
-        r = rospy.Rate(self.policy_rate)
+        r = rospy.Rate(self.control_rate)
 
         if(self.policy.policy_type=="single_view"):
             while not self.policy.done:
@@ -127,6 +127,9 @@ class GraspController:
                     # Arrived
                    moving_to_The_target = False
                r.sleep()
+            # sleep 3s
+            for i in range(self.control_rate*3):
+                r.sleep()
         elif(self.policy.policy_type=="multi_view"):
             while not self.policy.done:
                 depth_img, seg_image, pose, q = self.get_state()
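Note on the new GS-Net interface: request_grasping_pose() above POSTs the target point cloud (a plain list of [x, y, z] points in the current camera frame) to a /get_gsnet_grasp endpoint, then composes the reply with the camera pose (current_cam_pose @ best_grasp_T), so the service is expected to return a 4x4 grasp pose in the camera frame as a nested list. The Flask service itself is not part of this diff; the sketch below only illustrates the assumed request/response contract, and run_gsnet() is a hypothetical placeholder for the actual GS-Net inference.

# Minimal sketch of the Flask service assumed by request_grasping_pose().
# run_gsnet() is a hypothetical placeholder, not code from this repository.
import numpy as np
from flask import Flask, request, jsonify

app = Flask(__name__)


def run_gsnet(points):
    # Placeholder: a real implementation would run GS-Net on the (N, 3)
    # target point cloud and return the best 4x4 grasp pose in camera frame.
    return np.eye(4)


@app.route("/get_gsnet_grasp", methods=["POST"])
def get_gsnet_grasp():
    # The client sends the target points as a JSON list of [x, y, z] values
    # expressed in the current camera frame.
    points = np.asarray(request.get_json(), dtype=np.float32)
    grasp_T_cam = run_gsnet(points)
    # The client wraps the response in np.asarray() and multiplies it by the
    # camera pose, so a nested 4x4 list is returned here.
    return jsonify(grasp_T_cam.tolist())


if __name__ == "__main__":
    app.run(host="127.0.0.1", port=5000)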