Custom Models with InferX

InferX provides powerful tools to optimize and deploy your own custom models across different hardware platforms. Whether you have a PyTorch, TensorFlow, or ONNX model, InferX can help you achieve optimal performance on any device.

Optimization Pipeline

from inferx.optimize import optimize_model

# Optimize your custom model
optimized_model = optimize_model(
    model_path="path/to/your/model.pth",
    target_device="auto",  # Auto-detects your hardware
    optimization_level="aggressive"
)

# Use the optimized model
result = optimized_model.inference(input_data)

Supported Model Formats

InferX supports multiple model formats and frameworks:

PyTorch Models

import torch
from inferx.optimize import optimize_model

# Load your PyTorch model
model = torch.load("my_model.pth")

# Optimize for current hardware
optimized = optimize_model(
    model=model,
    input_shape=(1, 3, 224, 224),
    optimization_level="balanced"
)

# Deploy with automatic hardware detection
result = optimized.inference(input_tensor)

TensorFlow/Keras Models

import tensorflow as tf
from inferx.optimize import optimize_model

# Load TensorFlow model
model = tf.keras.models.load_model("my_model.h5")

# Optimize for edge deployment
optimized = optimize_model(
    model=model,
    target_device="jetson",
    precision="fp16"  # Use half precision for speed
)

ONNX Models

from inferx.optimize import optimize_model

# Optimize ONNX model
optimized = optimize_model(
    model_path="model.onnx",
    target_device="gpu",
    batch_size=4
)

Hardware-Specific Optimizations

Jetson Optimization

# Optimize specifically for Jetson devices
jetson_model = optimize_model(
    model_path="model.pth",
    target_device="jetson",
    optimization_config={
        "precision": "fp16",
        "max_workspace_size": "1GB",
        "dla_cores": 2,  # Use Deep Learning Accelerator
        "enable_tensorrt": True
    }
)

GPU Optimization

# Optimize for high-end GPUs
gpu_model = optimize_model(
    model_path="model.pth", 
    target_device="gpu",
    optimization_config={
        "precision": "mixed",  # Mixed precision training
        "batch_size": 32,
        "enable_tensorrt": True,
        "enable_cudnn": True
    }
)

CPU Optimization

# Optimize for CPU deployment
cpu_model = optimize_model(
    model_path="model.pth",
    target_device="cpu",
    optimization_config={
        "num_threads": 8,
        "enable_mkldnn": True,
        "quantization": "int8"
    }
)

Advanced Optimization Features

Dynamic Batch Size

# Support variable batch sizes
model = optimize_model(
    model_path="model.pth",
    input_shape=[(1, 3, 224, 224), (4, 3, 224, 224), (8, 3, 224, 224)],
    dynamic_axes={'input': {0: 'batch_size'}}
)

# Use with different batch sizes
single_result = model.inference(single_image)
batch_result = model.inference(batch_images)

Quantization

# Apply quantization for smaller models
quantized_model = optimize_model(
    model_path="large_model.pth",
    quantization_config={
        "method": "dynamic",  # or "static", "qat"
        "precision": "int8",
        "calibration_dataset": calibration_data
    }
)

print(f"Model size reduced by {quantized_model.compression_ratio:.2f}x")

Pruning

# Apply model pruning to reduce parameters
pruned_model = optimize_model(
    model_path="model.pth",
    pruning_config={
        "method": "magnitude",
        "sparsity": 0.5,  # Remove 50% of parameters
        "structured": False
    }
)

Performance Monitoring

Benchmarking

from inferx.benchmark import benchmark_model

# Comprehensive performance analysis
results = benchmark_model(
    model=optimized_model,
    input_shape=(1, 3, 224, 224),
    num_runs=1000,
    warmup_runs=50
)

print(f"Average inference time: {results['avg_time']:.2f}ms")
print(f"Throughput: {results['throughput']:.1f} FPS")
print(f"Memory usage: {results['memory_mb']:.1f}MB")

Real-time Monitoring

import time
import psutil

class ModelMonitor:
    def __init__(self, model):
        self.model = model
        self.metrics = []
    
    def inference_with_monitoring(self, input_data):
        start_time = time.time()
        psutil.cpu_percent()  # prime the counter; the next call measures usage over the inference interval
        memory_before = psutil.virtual_memory().percent
        
        # Run inference
        result = self.model.inference(input_data)
        
        end_time = time.time()
        cpu_during = psutil.cpu_percent()  # average CPU utilization since the call above
        memory_after = psutil.virtual_memory().percent
        
        # Log metrics
        metrics = {
            'inference_time': end_time - start_time,
            'cpu_usage': cpu_during,
            'memory_delta': memory_after - memory_before,
            'timestamp': time.time()
        }
        
        self.metrics.append(metrics)
        return result, metrics

# Usage
monitor = ModelMonitor(optimized_model)
result, perf = monitor.inference_with_monitoring(test_input)
print(f"Inference took {perf['inference_time']*1000:.2f}ms")

Example: Complete Custom Model Pipeline

1. Model Training (External)

# Your existing training code
import torch
import torch.nn as nn
import torchvision

class CustomClassifier(nn.Module):
    def __init__(self, num_classes=10):
        super().__init__()
        self.backbone = torchvision.models.resnet18(pretrained=True)
        self.backbone.fc = nn.Linear(512, num_classes)
    
    def forward(self, x):
        return self.backbone(x)

# Train your model
model = CustomClassifier(num_classes=10)
# ... training code ...
torch.save(model.state_dict(), "custom_classifier.pth")

2. InferX Optimization

import torch
from inferx.optimize import optimize_model

# Load and optimize the trained model
model_state = torch.load("custom_classifier.pth", map_location="cpu")
model = CustomClassifier(num_classes=10)
model.load_state_dict(model_state)
model.eval()  # switch to inference mode before optimizing

# Optimize for deployment
optimized_model = optimize_model(
    model=model,
    input_shape=(1, 3, 224, 224),
    target_device="auto",
    optimization_config={
        "precision": "fp16",
        "enable_tensorrt": True,
        "optimization_level": "aggressive"
    }
)

# Save optimized model
optimized_model.save("custom_classifier_optimized.inferx")

3. Deployment

from inferx.models import load_model
import cv2
import numpy as np

# Load optimized model
model = load_model("custom_classifier_optimized.inferx")

# Preprocess function
def preprocess_image(image_path):
    image = cv2.imread(image_path)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # OpenCV loads BGR; the model expects RGB
    image = cv2.resize(image, (224, 224))
    image = image.astype(np.float32) / 255.0
    # Apply the same normalization used during training (e.g. ImageNet mean/std) here if the model expects it
    image = np.transpose(image, (2, 0, 1))  # HWC to CHW
    image = np.expand_dims(image, axis=0)   # Add batch dimension
    return image

# Run inference
image = preprocess_image("test_image.jpg")
result = model.inference(image)

print(f"Predicted class: {result['predicted_class']}")
print(f"Confidence: {result['confidence']:.3f}")

Production Deployment

Docker Container

FROM inferx/runtime:latest

# Copy your optimized model
COPY custom_classifier_optimized.inferx /app/model/

# Copy application code
COPY app.py /app/

# Install dependencies
RUN pip install opencv-python numpy

# Expose port
EXPOSE 8080

# Run application
CMD ["python", "/app/app.py"]

Kubernetes Deployment

apiVersion: apps/v1
kind: Deployment
metadata:
  name: custom-model-service
spec:
  replicas: 3
  selector:
    matchLabels:
      app: custom-model
  template:
    metadata:
      labels:
        app: custom-model
    spec:
      containers:
      - name: inferx-model
        image: your-registry/custom-model:latest
        resources:
          requests:
            memory: "512Mi"
            cpu: "250m"
          limits:
            memory: "2Gi"
            cpu: "1"
        env:
        - name: INFERX_DEVICE
          value: "auto"
        - name: INFERX_BATCH_SIZE
          value: "4"

Best Practices

1. Model Validation

# Always validate optimized model accuracy
import torch

def validate_optimized_model(original_model, optimized_model, test_data):
    original_outputs = []
    optimized_outputs = []
    
    for batch in test_data:
        with torch.no_grad():  # no gradients needed during validation
            orig_out = original_model(batch)
        opt_out = optimized_model.inference(batch)
        
        original_outputs.append(orig_out)
        optimized_outputs.append(opt_out)
    
    # Calculate accuracy difference (compare_accuracy is a user-supplied helper)
    accuracy_diff = compare_accuracy(original_outputs, optimized_outputs)
    print(f"Accuracy difference: {accuracy_diff:.2%}")
    
    return accuracy_diff < 0.01  # Less than 1% difference

is_valid = validate_optimized_model(original, optimized, validation_data)

2. Progressive Optimization

# Start with conservative optimization and keep it as the default
conservative_model = optimize_model(
    model=model,
    optimization_level="conservative"
)
final_model = conservative_model

# If accuracy holds up, try a more aggressive optimization
# (validate_model stands in for your own accuracy check, e.g. the helper above)
if validate_model(conservative_model):
    aggressive_model = optimize_model(
        model=model,
        optimization_level="aggressive"
    )
    
    if validate_model(aggressive_model):
        # Keep the aggressive optimization only if it still validates
        final_model = aggressive_model

3. Hardware-Specific Testing

# Test on multiple hardware configurations
from inferx.optimize import optimize_model
from inferx.benchmark import benchmark_model

def test_across_hardware(model_path):
    devices = ["cpu", "gpu", "jetson"]
    results = {}
    
    for device in devices:
        try:
            optimized = optimize_model(
                model_path=model_path,
                target_device=device
            )
            
            benchmark = benchmark_model(optimized)
            results[device] = benchmark
            
        except Exception as e:
            print(f"Failed to optimize for {device}: {e}")
            results[device] = None
    
    return results

performance_results = test_across_hardware("my_model.pth")

Troubleshooting

Common Issues

  1. Memory Issues: Reduce the batch size or use a lower-memory precision such as fp16 or int8
  2. Accuracy Loss: Use less aggressive quantization or pruning (see the sketch below)
  3. Slow Inference: Check that hardware-specific optimizations (TensorRT, cuDNN, MKL-DNN, DLA) are enabled
  4. Compatibility Issues: Verify the model format and input shapes match what the model expects
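
For accuracy loss in particular, one practical pattern is to re-run the optimization with gentler settings and re-check accuracy. The sketch below reuses the options and the validate_optimized_model helper shown earlier on this page; the specific fallback configuration is illustrative.

from inferx.optimize import optimize_model

# Fallback sketch: if int8 quantization costs too much accuracy, retry with a
# milder fp16-only configuration. original and validation_data are the objects
# used with validate_optimized_model in the Best Practices section above.
candidate = optimize_model(
    model_path="model.pth",
    quantization_config={"method": "dynamic", "precision": "int8"}
)

if not validate_optimized_model(original, candidate, validation_data):
    candidate = optimize_model(
        model_path="model.pth",
        optimization_config={"precision": "fp16"}
    )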

Debug Mode

# Enable debug mode for detailed optimization logs
optimized_model = optimize_model(
    model_path="model.pth",
    debug=True,
    verbose=True
)

# Check optimization report
print(optimized_model.optimization_report)

Next Steps
