RoboPoint Model

RoboPoint is a specialized multimodal model for robotic perception and understanding. With InferX, you can run RoboPoint on any device using the same API, which makes it well suited to robotics applications across different hardware platforms.

Features

  • Multimodal Understanding: Combines vision and language for comprehensive scene understanding
  • Keypoint Detection: Identifies manipulation points for robotic grasping
  • Cross-Platform: Same code works on Jetson, GPU, or CPU
  • Real-time Performance: Optimized for robotics applications
  • Natural Language: Understands human instructions in natural language

Installation

RoboPoint is included with InferX:

pip install git+https://github.com/exla-ai/InferX.git

Basic Usage

from inferx.models.robopoint import robopoint
import cv2

# Initialize the model
model = robopoint()

# Load an image
image = cv2.imread("path/to/your/image.jpg")

# Run inference with text instruction
results = model.inference(
    image=image,
    text_instruction="Find grasping points on the cup"
)

# Process results
print(f"Detected keypoints: {len(results['keypoints'])}")
for i, keypoint in enumerate(results['keypoints']):
    print(f"Keypoint {i}: ({keypoint['x']}, {keypoint['y']}) - confidence: {keypoint['confidence']}")

Advanced Usage

Interactive Robot Control

from inferx.models.robopoint import robopoint
import cv2

# Initialize model
model = robopoint()

# Capture from camera
cap = cv2.VideoCapture(0)

while True:
    ret, frame = cap.read()
    if not ret:
        break
    
    # Get user instruction
    instruction = input("Enter instruction (or 'quit'): ")
    if instruction.lower() == 'quit':
        break
    
    # Run inference
    results = model.inference(
        image=frame,
        text_instruction=instruction
    )
    
    # Visualize keypoints
    for keypoint in results['keypoints']:
        cv2.circle(frame, 
                  (int(keypoint['x']), int(keypoint['y'])), 
                  5, (0, 255, 0), -1)
        cv2.putText(frame, f"{keypoint['confidence']:.2f}", 
                   (int(keypoint['x']), int(keypoint['y']-10)),
                   cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
    
    cv2.imshow('RoboPoint', frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()
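
In practice you may only want to draw confident keypoints. The fragment below is a drop-in replacement for the visualization loop in the script above; the 0.5 threshold is an arbitrary example value, not an InferX default.

    # Arbitrary example threshold; tune for your setup
    CONF_THRESHOLD = 0.5

    for keypoint in results['keypoints']:
        if keypoint['confidence'] < CONF_THRESHOLD:
            continue  # skip low-confidence detections
        cv2.circle(frame,
                  (int(keypoint['x']), int(keypoint['y'])),
                  5, (0, 255, 0), -1)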

Batch Processing for Dataset Creation

from inferx.models.robopoint import robopoint
import cv2
import os
import json

model = robopoint()

# Process multiple images with different tasks
tasks = [
    "Find grasping points on objects",
    "Identify manipulation areas",
    "Locate safe handling points"
]

results_dataset = []

for filename in os.listdir("robot_images/"):
    if filename.endswith(('.jpg', '.png')):
        image_path = f"robot_images/{filename}"
        image = cv2.imread(image_path)
        
        for task in tasks:
            results = model.inference(
                image=image,
                text_instruction=task
            )
            
            # Store results
            results_dataset.append({
                'image': filename,
                'task': task,
                'keypoints': results['keypoints']
            })

# Save dataset
with open('robot_keypoints_dataset.json', 'w') as f:
    json.dump(results_dataset, f, indent=2)
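
Once saved, the JSON dataset can be reloaded for inspection. The sketch below is a simple sanity check (counting entries per task); it is not part of InferX.

import json
from collections import Counter

with open('robot_keypoints_dataset.json') as f:
    dataset = json.load(f)

# Count how many image/task entries were annotated per instruction
counts = Counter(entry['task'] for entry in dataset)
for task, n in counts.items():
    print(f"{task}: {n} entries")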

Example Instructions

RoboPoint understands various natural language instructions:

  • Grasping: “Find the best grasping points on the bottle”
  • Manipulation: “Identify areas where I can safely manipulate this object”
  • Spatial Understanding: “Show me empty spaces on the table”
  • Object-specific: “Find spots on the rightmost plate where I can place items”

Performance

InferX optimizes RoboPoint for your hardware:

Hardware           Inference Time    Memory Usage
Jetson AGX Orin    ~150ms            ~3GB
RTX 4090           ~60ms             ~4GB
Intel i7 CPU       ~800ms            ~2GB
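
These figures are approximate. To measure latency on your own hardware, a minimal timing sketch like the one below can be used; the image path and instruction are placeholders.

import time
import cv2
from inferx.models.robopoint import robopoint

model = robopoint()
image = cv2.imread("path/to/your/image.jpg")  # replace with a real test image

# Warm-up call so one-time initialization cost is not counted
model.inference(image=image, text_instruction="Find grasping points on objects")

start = time.perf_counter()
results = model.inference(image=image, text_instruction="Find grasping points on objects")
elapsed_ms = (time.perf_counter() - start) * 1000
print(f"Inference time: {elapsed_ms:.1f} ms, keypoints: {len(results['keypoints'])}")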

Response Format

{
    'keypoints': [
        {
            'x': float,           # X coordinate
            'y': float,           # Y coordinate
            'confidence': float,  # Confidence score (0-1)
            'type': str          # Keypoint type (grasp, manipulation, etc.)
        }
    ],
    'regions': [
        {
            'bbox': [x1, y1, x2, y2],  # Bounding box
            'confidence': float,
            'description': str
        }
    ]
}
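
The earlier examples only read the keypoints entry. The sketch below also visualizes the regions entries, using the field names from the format above; the image path and instruction are placeholders.

import cv2
from inferx.models.robopoint import robopoint

model = robopoint()
image = cv2.imread("path/to/your/image.jpg")
results = model.inference(image=image, text_instruction="Identify manipulation areas")

# Draw each region's bounding box and description
for region in results['regions']:
    x1, y1, x2, y2 = map(int, region['bbox'])
    cv2.rectangle(image, (x1, y1), (x2, y2), (255, 0, 0), 2)
    label = f"{region['description']} ({region['confidence']:.2f})"
    cv2.putText(image, label, (x1, max(y1 - 5, 0)),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1)

cv2.imwrite("regions_visualization.jpg", image)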

Integration with Robot Frameworks

ROS Integration

#!/usr/bin/env python3
import rospy
from sensor_msgs.msg import Image
from cv_bridge import CvBridge
from inferx.models.robopoint import robopoint

class RoboPointNode:
    def __init__(self):
        rospy.init_node('robopoint_node')
        self.bridge = CvBridge()
        self.model = robopoint()
        
        # Subscribe to camera topic
        self.image_sub = rospy.Subscriber('/camera/image_raw', Image, self.image_callback)
        
        # Publisher for keypoints. KeyPointArray is a placeholder for a custom
        # message type defined in your own package; see the PoseArray sketch
        # below for an alternative that uses only standard messages.
        self.keypoint_pub = rospy.Publisher('/robopoint/keypoints', KeyPointArray, queue_size=1)
    
    def image_callback(self, msg):
        # Convert ROS image to OpenCV
        cv_image = self.bridge.imgmsg_to_cv2(msg, "bgr8")
        
        # Get instruction from parameter server
        instruction = rospy.get_param('~instruction', 'Find grasping points')
        
        # Run inference
        results = self.model.inference(
            image=cv_image,
            text_instruction=instruction
        )
        
        # Publish results
        self.publish_keypoints(results['keypoints'])
    
    def publish_keypoints(self, keypoints):
        # Convert to ROS message and publish
        # Implementation depends on your message type
        pass

if __name__ == '__main__':
    node = RoboPointNode()
    rospy.spin()
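
The KeyPointArray message above is a placeholder. If you do not want to define a custom message, a minimal sketch using the standard geometry_msgs/PoseArray type (an assumption, not part of InferX) could replace the publisher and the stub method:

from geometry_msgs.msg import Pose, PoseArray

# In __init__, publish a PoseArray instead of the custom message:
#   self.keypoint_pub = rospy.Publisher('/robopoint/keypoints', PoseArray, queue_size=1)

def publish_keypoints(self, keypoints):
    msg = PoseArray()
    msg.header.stamp = rospy.Time.now()
    msg.header.frame_id = 'camera'  # assumed frame id; set to your camera frame
    for kp in keypoints:
        pose = Pose()
        pose.position.x = float(kp['x'])  # image-plane coordinates (pixels)
        pose.position.y = float(kp['y'])
        pose.orientation.w = 1.0
        msg.poses.append(pose)
    self.keypoint_pub.publish(msg)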

Hardware Detection

When the model is initialized, InferX detects the available hardware automatically and prints a status banner, for example:

✨ InferX - RoboPoint Model ✨
🔍 Device Detected: AGX_ORIN
⠏ [0.8s] Loading RoboPoint model
✓ [1.0s] Ready for robotic perception

Example Applications

  • Pick and Place: Identify optimal grasping points for robotic arms (see the selection sketch after this list)
  • Object Manipulation: Find safe areas to push, pull, or rotate objects
  • Scene Understanding: Understand spatial relationships for navigation
  • Human-Robot Interaction: Interpret natural language commands
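
For pick and place, a common pattern is to take the highest-confidence grasp keypoint and hand its pixel coordinates to your motion stack. The sketch below only makes that selection; converting pixels to robot coordinates requires your own camera calibration and is not shown. The image path and instruction are placeholders.

import cv2
from inferx.models.robopoint import robopoint

model = robopoint()
image = cv2.imread("path/to/your/image.jpg")
results = model.inference(image=image, text_instruction="Find grasping points on the bottle")

# Prefer keypoints labelled as grasps; fall back to all keypoints otherwise
grasp_points = [kp for kp in results['keypoints'] if kp.get('type') == 'grasp']
candidates = grasp_points or results['keypoints']

if candidates:
    best = max(candidates, key=lambda kp: kp['confidence'])
    print(f"Best grasp target (pixels): ({best['x']:.1f}, {best['y']:.1f}), "
          f"confidence {best['confidence']:.2f}")
else:
    print("No keypoints returned for this instruction")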

Next Steps
