Skip to main content

Documentation Index

Fetch the complete documentation index at: https://docs.exla.ai/llms.txt

Use this file to discover all available pages before exploring further.

Custom Models with InferX

InferX provides powerful tools to optimize and deploy your own custom models across different hardware platforms. Whether you have a PyTorch, TensorFlow, or ONNX model, InferX can help you achieve optimal performance on any device.

Optimization Pipeline

from inferx.optimize import optimize_model

# Optimize your custom model
optimized_model = optimize_model(
    model_path="path/to/your/model.pth",
    target_device="auto",  # Auto-detects your hardware
    optimization_level="aggressive"
)

# Use the optimized model
result = optimized_model.inference(input_data)

Supported Model Formats

InferX supports multiple model formats and frameworks:

PyTorch Models

import torch
from inferx.optimize import optimize_model

# Load your PyTorch model.
# The loaded object is handed straight to optimize_model as the model, so the
# checkpoint must contain a whole pickled module rather than a state_dict.
# PyTorch >= 2.6 defaults torch.load to weights_only=True, which rejects such
# checkpoints, hence the explicit weights_only=False.
# SECURITY: unpickling can execute arbitrary code -- only load files you trust.
model = torch.load("my_model.pth", weights_only=False)

# Switch dropout / batch-norm layers to inference behaviour before optimizing
model.eval()

# Optimize for current hardware
optimized = optimize_model(
    model=model,
    input_shape=(1, 3, 224, 224),
    optimization_level="balanced"
)

# Deploy with automatic hardware detection
result = optimized.inference(input_tensor)

TensorFlow/Keras Models

import tensorflow as tf
from inferx.optimize import optimize_model

# Load TensorFlow model
model = tf.keras.models.load_model("my_model.h5")

# Optimize for edge deployment
optimized = optimize_model(
    model=model,
    target_device="jetson",
    precision="fp16"  # Use half precision for speed
)

ONNX Models

from inferx.optimize import optimize_model

# Optimize ONNX model
optimized = optimize_model(
    model_path="model.onnx",
    target_device="gpu",
    batch_size=4
)

Hardware-Specific Optimizations

Jetson Optimization

# Optimize specifically for Jetson devices
jetson_model = optimize_model(
    model_path="model.pth",
    target_device="jetson",
    optimization_config={
        "precision": "fp16",
        "max_workspace_size": "1GB",
        "dla_cores": 2,  # Use Deep Learning Accelerator
        "enable_tensorrt": True
    }
)

GPU Optimization

# Optimize for high-end GPUs
gpu_model = optimize_model(
    model_path="model.pth", 
    target_device="gpu",
    optimization_config={
        "precision": "mixed",  # Mixed-precision inference (no training happens here)
        "batch_size": 32,
        "enable_tensorrt": True,
        "enable_cudnn": True
    }
)

CPU Optimization

# Optimize for CPU deployment
cpu_model = optimize_model(
    model_path="model.pth",
    target_device="cpu",
    optimization_config={
        "num_threads": 8,
        "enable_mkldnn": True,
        "quantization": "int8"
    }
)

Advanced Optimization Features

Dynamic Batch Size

# Support variable batch sizes
model = optimize_model(
    model_path="model.pth",
    input_shape=[(1, 3, 224, 224), (4, 3, 224, 224), (8, 3, 224, 224)],
    dynamic_axes={'input': {0: 'batch_size'}}
)

# Use with different batch sizes
single_result = model.inference(single_image)
batch_result = model.inference(batch_images)

Quantization

# Apply quantization for smaller models
quantized_model = optimize_model(
    model_path="large_model.pth",
    quantization_config={
        "method": "dynamic",  # or "static", "qat"
        "precision": "int8",
        "calibration_dataset": calibration_data
    }
)

print(f"Model size reduced by {quantized_model.compression_ratio:.2f}x")

Pruning

# Apply model pruning to reduce parameters
pruned_model = optimize_model(
    model_path="model.pth",
    pruning_config={
        "method": "magnitude",
        "sparsity": 0.5,  # Remove 50% of parameters
        "structured": False
    }
)

Performance Monitoring

Benchmarking

from inferx.benchmark import benchmark_model

# Comprehensive performance analysis
results = benchmark_model(
    model=optimized_model,
    input_shape=(1, 3, 224, 224),
    num_runs=1000,
    warmup_runs=50
)

print(f"Average inference time: {results['avg_time']:.2f}ms")
print(f"Throughput: {results['throughput']:.1f} FPS")
print(f"Memory usage: {results['memory_mb']:.1f}MB")

Real-time Monitoring

import time
import psutil

class ModelMonitor:
    """Wrap a model and record simple performance metrics for each inference.

    Every call to ``inference_with_monitoring`` appends one metrics dict to
    ``self.metrics``.
    """

    def __init__(self, model):
        # Any object that exposes ``inference(input_data)``.
        self.model = model
        # Per-call metrics dicts, in call order.
        self.metrics = []

    def inference_with_monitoring(self, input_data):
        """Run ``self.model.inference(input_data)`` and measure the call.

        Args:
            input_data: Passed through unchanged to ``self.model.inference``.

        Returns:
            A ``(result, metrics)`` tuple. ``metrics`` contains:
            ``inference_time`` (elapsed seconds), ``cpu_usage`` (system-wide
            CPU percent over the duration of the call), ``memory_delta``
            (change in system memory usage, in percentage points) and
            ``timestamp`` (Unix time at which the metrics were recorded).
        """
        # Without an interval, psutil.cpu_percent() reports utilisation since
        # the previous call. This call only resets that measurement window;
        # its return value is not meaningful and is discarded.
        psutil.cpu_percent()
        memory_before = psutil.virtual_memory().percent

        # perf_counter is monotonic and high resolution; time.time() can jump
        # when the system clock is adjusted.
        start_time = time.perf_counter()

        # Run inference
        result = self.model.inference(input_data)

        elapsed = time.perf_counter() - start_time
        # Utilisation accumulated since the reset above, i.e. during inference.
        cpu_usage = psutil.cpu_percent()
        memory_after = psutil.virtual_memory().percent

        # Log metrics
        metrics = {
            'inference_time': elapsed,
            'cpu_usage': cpu_usage,
            'memory_delta': memory_after - memory_before,
            'timestamp': time.time()
        }

        self.metrics.append(metrics)
        return result, metrics

# Usage
monitor = ModelMonitor(optimized_model)
result, perf = monitor.inference_with_monitoring(test_input)
print(f"Inference took {perf['inference_time']*1000:.2f}ms")

Example: Complete Custom Model Pipeline

1. Model Training (External)

# Your existing training code
import torch
import torch.nn as nn
import torchvision

class CustomClassifier(nn.Module):
    """ResNet-18 backbone whose final layer is replaced by a new linear head.

    Args:
        num_classes: Number of output units of the replacement head.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        # NOTE: torchvision >= 0.13 deprecates ``pretrained=True`` in favour of
        # the ``weights=`` argument; switch once older versions need no support.
        self.backbone = torchvision.models.resnet18(pretrained=True)
        # Size the new head from the layer it replaces instead of hard-coding
        # the feature width.
        in_features = self.backbone.fc.in_features
        self.backbone.fc = nn.Linear(in_features, num_classes)

    def forward(self, x):
        """Return the backbone's output for input batch ``x``."""
        return self.backbone(x)

# Train your model
model = CustomClassifier(num_classes=10)
# ... training code ...
torch.save(model.state_dict(), "custom_classifier.pth")

2. InferX Optimization

import torch
from inferx.optimize import optimize_model

# Load and optimize the trained model.
# map_location="cpu" lets the checkpoint load on machines that lack the GPU it
# was saved from.
model_state = torch.load("custom_classifier.pth", map_location="cpu")
model = CustomClassifier(num_classes=10)
model.load_state_dict(model_state)

# Put batch-norm / dropout layers into inference behaviour before optimizing;
# a freshly constructed nn.Module starts in training mode.
model.eval()

# Optimize for deployment
optimized_model = optimize_model(
    model=model,
    input_shape=(1, 3, 224, 224),
    target_device="auto",
    optimization_config={
        "precision": "fp16",
        "enable_tensorrt": True,
        "optimization_level": "aggressive"
    }
)

# Save optimized model
optimized_model.save("custom_classifier_optimized.inferx")

3. Deployment

from inferx.models import load_model
import cv2
import numpy as np

# Load optimized model
model = load_model("custom_classifier_optimized.inferx")

# Preprocess function
def preprocess_image(image_path):
    """Load an image file and turn it into a float32 batch of one in NCHW layout.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A ``numpy.ndarray`` of shape ``(1, 3, 224, 224)`` and dtype float32,
        scaled by 1/255.

    Raises:
        ValueError: If OpenCV cannot read the file (missing, unreadable or
            in an unsupported format).
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports failure by returning None rather than raising,
        # which would otherwise surface as an obscure error inside cv2.resize.
        raise ValueError(f"Could not read image: {image_path!r}")

    # NOTE(review): OpenCV returns channels in BGR order and this function
    # keeps that order -- confirm it matches what the model was trained on.
    image = cv2.resize(image, (224, 224))
    image = image.astype(np.float32) / 255.0
    image = np.transpose(image, (2, 0, 1))  # HWC to CHW
    image = np.expand_dims(image, axis=0)   # Add batch dimension
    return image

# Run inference
image = preprocess_image("test_image.jpg")
result = model.inference(image)

print(f"Predicted class: {result['predicted_class']}")
print(f"Confidence: {result['confidence']:.3f}")

Production Deployment

Docker Container

# NOTE: pin a specific runtime tag instead of "latest" for reproducible builds
FROM inferx/runtime:latest

# Install dependencies first so this layer stays cached when only the model or
# the application code changes.
# opencv-python-headless omits the GUI libraries (e.g. libGL) that are usually
# absent from container images; --no-cache-dir keeps pip's download cache out
# of the image.
RUN pip install --no-cache-dir opencv-python-headless numpy

# Copy your optimized model
COPY custom_classifier_optimized.inferx /app/model/

# Copy application code
COPY app.py /app/

# Expose port
EXPOSE 8080

# Run application
CMD ["python", "/app/app.py"]

Kubernetes Deployment

apiVersion: apps/v1
kind: Deployment
metadata:
  name: custom-model-service
spec:
  replicas: 3
  selector:
    matchLabels:
      app: custom-model
  template:
    metadata:
      labels:
        app: custom-model
    spec:
      containers:
      - name: inferx-model
        # Pin a specific image tag instead of "latest" for repeatable rollouts
        image: your-registry/custom-model:latest
        # Port the container image exposes (EXPOSE 8080)
        ports:
        - containerPort: 8080
        resources:
          requests:
            memory: "512Mi"
            cpu: "250m"
          limits:
            memory: "2Gi"
            cpu: "1"
        env:
        - name: INFERX_DEVICE
          value: "auto"
        - name: INFERX_BATCH_SIZE
          value: "4"

Best Practices

1. Model Validation

# Always validate optimized model accuracy
def validate_optimized_model(original_model, optimized_model, test_data, tolerance=0.01):
    """Check that an optimized model stays close to the original's accuracy.

    Each batch is run through both models, the collected outputs are handed to
    ``compare_accuracy``, and the resulting difference is printed.

    Args:
        original_model: Callable invoked as ``original_model(batch)``.
        optimized_model: Object invoked as ``optimized_model.inference(batch)``.
        test_data: Iterable of batches; it is iterated exactly once.
        tolerance: Largest acceptable accuracy difference (default 0.01,
            i.e. 1%).

    Returns:
        True when the accuracy difference is below ``tolerance``.

    Note:
        ``compare_accuracy`` is not defined here and must be provided by the
        surrounding code. NOTE(review): if it can return a signed value,
        confirm which sign means "worse" -- only the upper bound is checked.
    """
    original_outputs = []
    optimized_outputs = []

    # Single pass so one-shot iterables (generators, data loaders) work and
    # both models always see the same batch.
    for batch in test_data:
        original_outputs.append(original_model(batch))
        optimized_outputs.append(optimized_model.inference(batch))

    # Calculate accuracy difference
    accuracy_diff = compare_accuracy(original_outputs, optimized_outputs)
    print(f"Accuracy difference: {accuracy_diff:.2%}")

    return accuracy_diff < tolerance

is_valid = validate_optimized_model(original, optimized, validation_data)

2. Progressive Optimization

# Start with conservative optimization
conservative_model = optimize_model(
    model=model,
    optimization_level="conservative"
)

# Validate against the unoptimized model with validate_optimized_model
# (defined under "Model Validation").
if not validate_optimized_model(model, conservative_model, validation_data):
    # Without this branch final_model would be left undefined when even the
    # mildest optimization level fails validation.
    raise RuntimeError("Conservative optimization failed validation")

# Conservative is known-good; keep it unless the aggressive build also passes
final_model = conservative_model

# If performance is good, try more aggressive optimization
aggressive_model = optimize_model(
    model=model,
    optimization_level="aggressive"
)

if validate_optimized_model(model, aggressive_model, validation_data):
    # Use aggressive optimization
    final_model = aggressive_model

3. Hardware-Specific Testing

# Test on multiple hardware configurations
def test_across_hardware(model_path, devices=("cpu", "gpu", "jetson")):
    """Optimize and benchmark one model for several target devices.

    Args:
        model_path: Passed to ``optimize_model`` as ``model_path``.
        devices: Target device names to try, in order. Defaults to the
            original fixed list ``("cpu", "gpu", "jetson")``.

    Returns:
        A dict mapping each device name to the value returned by
        ``benchmark_model``, or to ``None`` when optimizing or benchmarking
        for that device raised an exception.
    """
    results = {}

    for device in devices:
        try:
            optimized = optimize_model(
                model_path=model_path,
                target_device=device
            )
            results[device] = benchmark_model(optimized)
        except Exception as e:
            # Deliberately broad: this is a best-effort sweep, and hardware
            # that is unavailable on this machine must not abort the others.
            print(f"Failed to optimize for {device}: {e}")
            results[device] = None

    return results

performance_results = test_across_hardware("my_model.pth")

Troubleshooting

Common Issues

  1. Memory Issues: Reduce the batch size or use a lower precision (fp16/int8); gradient checkpointing only helps if memory runs out during training
  2. Accuracy Loss: Use less aggressive quantization or pruning
  3. Slow Inference: Check if hardware-specific optimizations are enabled
  4. Compatibility Issues: Verify model format and input shapes

Debug Mode

# Enable debug mode for detailed optimization logs
optimized_model = optimize_model(
    model_path="model.pth",
    debug=True,
    verbose=True
)

# Check optimization report
print(optimized_model.optimization_report)

Next Steps