How to optimize and deploy your own models with InferX
InferX provides tools for optimizing and deploying your own models across different hardware platforms. Whether you start from a PyTorch, TensorFlow, or ONNX model, InferX can tune it for the hardware it will run on, from CPUs and server GPUs to Jetson edge devices.
from inferx.optimize import optimize_model

# Optimize your custom model
optimized_model = optimize_model(
    model_path="path/to/your/model.pth",
    target_device="auto",            # Auto-detects your hardware
    optimization_level="aggressive"
)

# Use the optimized model
result = optimized_model.inference(input_data)
InferX supports multiple model formats and frameworks:
import torch
from inferx.optimize import optimize_model

# Load your PyTorch model
model = torch.load("my_model.pth")

# Optimize for current hardware
optimized = optimize_model(
    model=model,
    input_shape=(1, 3, 224, 224),
    optimization_level="balanced"
)

# Deploy with automatic hardware detection
result = optimized.inference(input_tensor)
import tensorflow as tf
from inferx.optimize import optimize_model

# Load TensorFlow model
model = tf.keras.models.load_model("my_model.h5")

# Optimize for edge deployment
optimized = optimize_model(
    model=model,
    target_device="jetson",
    precision="fp16"  # Use half precision for speed
)
from inferx.optimize import optimize_model

# Optimize ONNX model
optimized = optimize_model(
    model_path="model.onnx",
    target_device="gpu",
    batch_size=4
)
# Optimize specifically for Jetson devices
jetson_model = optimize_model(
    model_path="model.pth",
    target_device="jetson",
    optimization_config={
        "precision": "fp16",
        "max_workspace_size": "1GB",
        "dla_cores": 2,  # Use the Deep Learning Accelerator
        "enable_tensorrt": True
    }
)
# Optimize for high-end GPUs
gpu_model = optimize_model(
    model_path="model.pth",
    target_device="gpu",
    optimization_config={
        "precision": "mixed",  # Mixed-precision inference
        "batch_size": 32,
        "enable_tensorrt": True,
        "enable_cudnn": True
    }
)
# Optimize for CPU deployment
cpu_model = optimize_model(
    model_path="model.pth",
    target_device="cpu",
    optimization_config={
        "num_threads": 8,
        "enable_mkldnn": True,
        "quantization": "int8"
    }
)
# Support variable batch sizes
model = optimize_model(
    model_path="model.pth",
    input_shape=[(1, 3, 224, 224), (4, 3, 224, 224), (8, 3, 224, 224)],
    dynamic_axes={'input': {0: 'batch_size'}}
)

# Use with different batch sizes
single_result = model.inference(single_image)
batch_result = model.inference(batch_images)
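For reference, the single_image and batch_images inputs used above are not defined in this guide; placeholder inputs with matching shapes could be constructed like this (illustrative only, in practice they come from your own preprocessing pipeline):

import numpy as np

# Placeholder inputs matching the (N, 3, 224, 224) shapes declared above
single_image = np.random.rand(1, 3, 224, 224).astype(np.float32)
batch_images = np.random.rand(8, 3, 224, 224).astype(np.float32)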
# Apply quantization for smaller models
# (calibration_data should be a small, representative sample of real inputs)
quantized_model = optimize_model(
    model_path="large_model.pth",
    quantization_config={
        "method": "dynamic",  # or "static", "qat"
        "precision": "int8",
        "calibration_dataset": calibration_data
    }
)

print(f"Model size reduced by {quantized_model.compression_ratio:.2f}x")
# Apply model pruning to reduce parameters
pruned_model = optimize_model(
    model_path="model.pth",
    pruning_config={
        "method": "magnitude",
        "sparsity": 0.5,  # Remove 50% of parameters
        "structured": False
    }
)
from inferx.benchmark import benchmark_model

# Comprehensive performance analysis
results = benchmark_model(
    model=optimized_model,
    input_shape=(1, 3, 224, 224),
    num_runs=1000,
    warmup_runs=50
)

print(f"Average inference time: {results['avg_time']:.2f}ms")
print(f"Throughput: {results['throughput']:.1f} FPS")
print(f"Memory usage: {results['memory_mb']:.1f}MB")
import time
import psutil

class ModelMonitor:
    def __init__(self, model):
        self.model = model
        self.metrics = []

    def inference_with_monitoring(self, input_data):
        start_time = time.time()
        cpu_before = psutil.cpu_percent()
        memory_before = psutil.virtual_memory().percent

        # Run inference
        result = self.model.inference(input_data)

        end_time = time.time()
        cpu_after = psutil.cpu_percent()
        memory_after = psutil.virtual_memory().percent

        # Log metrics
        metrics = {
            'inference_time': end_time - start_time,
            'cpu_usage': cpu_after - cpu_before,
            'memory_delta': memory_after - memory_before,
            'timestamp': time.time()
        }
        self.metrics.append(metrics)

        return result, metrics

# Usage
monitor = ModelMonitor(optimized_model)
result, perf = monitor.inference_with_monitoring(test_input)
print(f"Inference took {perf['inference_time']*1000:.2f}ms")
# Your existing training code
import torch
import torch.nn as nn
import torchvision

class CustomClassifier(nn.Module):
    def __init__(self, num_classes=10):
        super().__init__()
        self.backbone = torchvision.models.resnet18(pretrained=True)
        self.backbone.fc = nn.Linear(512, num_classes)

    def forward(self, x):
        return self.backbone(x)

# Train your model
model = CustomClassifier(num_classes=10)
# ... training code ...
torch.save(model.state_dict(), "custom_classifier.pth")
from inferx.optimize import optimize_model

# Load and optimize the trained model
model_state = torch.load("custom_classifier.pth")
model = CustomClassifier(num_classes=10)
model.load_state_dict(model_state)
model.eval()  # Switch to inference mode before optimizing

# Optimize for deployment
optimized_model = optimize_model(
    model=model,
    input_shape=(1, 3, 224, 224),
    target_device="auto",
    optimization_config={
        "precision": "fp16",
        "enable_tensorrt": True,
        "optimization_level": "aggressive"
    }
)

# Save optimized model
optimized_model.save("custom_classifier_optimized.inferx")
from inferx.models import load_model
import cv2
import numpy as np

# Load optimized model
model = load_model("custom_classifier_optimized.inferx")

# Preprocess function
def preprocess_image(image_path):
    image = cv2.imread(image_path)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # OpenCV loads BGR; convert to RGB for the torchvision-pretrained backbone
    image = cv2.resize(image, (224, 224))
    image = image.astype(np.float32) / 255.0
    image = np.transpose(image, (2, 0, 1))  # HWC to CHW
    image = np.expand_dims(image, axis=0)   # Add batch dimension
    return image

# Run inference
image = preprocess_image("test_image.jpg")
result = model.inference(image)

print(f"Predicted class: {result['predicted_class']}")
print(f"Confidence: {result['confidence']:.3f}")
FROM inferx/runtime:latest
# Copy your optimized model
COPY custom_classifier_optimized.inferx /app/model/
# Copy application code
COPY app.py /app/
# Install dependencies
RUN pip install opencv-python numpy
# Expose port
EXPOSE 8080
# Run application
CMD ["python", "/app/app.py"]
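The Dockerfile copies an app.py that is not shown in this guide. A minimal sketch of what that serving script could look like, assuming a Flask HTTP endpoint (Flask is an assumption here and would need to be added to the pip install line); the preprocessing mirrors preprocess_image from earlier:

# app.py (hypothetical) - serve the optimized model over HTTP
import cv2
import numpy as np
from flask import Flask, jsonify, request

from inferx.models import load_model

app = Flask(__name__)
model = load_model("/app/model/custom_classifier_optimized.inferx")

def preprocess(image_bytes):
    # Decode the uploaded image and apply the same steps as preprocess_image above
    image = cv2.imdecode(np.frombuffer(image_bytes, np.uint8), cv2.IMREAD_COLOR)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image = cv2.resize(image, (224, 224))
    image = image.astype(np.float32) / 255.0
    image = np.transpose(image, (2, 0, 1))  # HWC to CHW
    return np.expand_dims(image, axis=0)    # Add batch dimension

@app.route("/predict", methods=["POST"])
def predict():
    image = preprocess(request.files["image"].read())
    result = model.inference(image)
    return jsonify(result)

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=8080)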
apiVersion: apps/v1
kind: Deployment
metadata:
  name: custom-model-service
spec:
  replicas: 3
  selector:
    matchLabels:
      app: custom-model
  template:
    metadata:
      labels:
        app: custom-model
    spec:
      containers:
      - name: inferx-model
        image: your-registry/custom-model:latest
        resources:
          requests:
            memory: "512Mi"
            cpu: "250m"
          limits:
            memory: "2Gi"
            cpu: "1"
        env:
        - name: INFERX_DEVICE
          value: "auto"
        - name: INFERX_BATCH_SIZE
          value: "4"
# Always validate optimized model accuracy
def validate_optimized_model(original_model, optimized_model, test_data):
    original_outputs = []
    optimized_outputs = []

    for batch in test_data:
        orig_out = original_model(batch)
        opt_out = optimized_model.inference(batch)
        original_outputs.append(orig_out)
        optimized_outputs.append(opt_out)

    # Calculate accuracy difference
    accuracy_diff = compare_accuracy(original_outputs, optimized_outputs)
    print(f"Accuracy difference: {accuracy_diff:.2%}")
    return accuracy_diff < 0.01  # Less than 1% difference

is_valid = validate_optimized_model(original, optimized, validation_data)
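compare_accuracy is not defined in this guide; one possible implementation, assuming both models return arrays of class scores (convert any torch tensors with .detach().cpu().numpy() first) and using top-1 disagreement as the accuracy difference:

import numpy as np

def compare_accuracy(original_outputs, optimized_outputs):
    # Fraction of samples where the two models disagree on the top-1 class
    mismatches = 0
    total = 0
    for orig, opt in zip(original_outputs, optimized_outputs):
        orig_pred = np.argmax(np.asarray(orig), axis=-1)
        opt_pred = np.argmax(np.asarray(opt), axis=-1)
        mismatches += int(np.sum(orig_pred != opt_pred))
        total += orig_pred.size
    return mismatches / max(total, 1)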
# Start with conservative optimization
conservative_model = optimize_model(
    model=model,
    optimization_level="conservative"
)

# If performance is good, try more aggressive optimization
if validate_model(conservative_model):
    aggressive_model = optimize_model(
        model=model,
        optimization_level="aggressive"
    )
    if validate_model(aggressive_model):
        # Use aggressive optimization
        final_model = aggressive_model
    else:
        # Fall back to conservative
        final_model = conservative_model
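validate_model above is a user-supplied check; one hypothetical definition is a thin wrapper around validate_optimized_model from the previous example:

def validate_model(candidate_model):
    # Reuse the accuracy check defined earlier, comparing the candidate
    # against the original (unoptimized) model on held-out validation data
    return validate_optimized_model(original, candidate_model, validation_data)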
# Test on multiple hardware configurations
def test_across_hardware(model_path):
    devices = ["cpu", "gpu", "jetson"]
    results = {}

    for device in devices:
        try:
            optimized = optimize_model(
                model_path=model_path,
                target_device=device
            )
            benchmark = benchmark_model(optimized)
            results[device] = benchmark
        except Exception as e:
            print(f"Failed to optimize for {device}: {e}")
            results[device] = None

    return results

performance_results = test_across_hardware("my_model.pth")
# Enable debug mode for detailed optimization logs
optimized_model = optimize_model(
    model_path="model.pth",
    debug=True,
    verbose=True
)

# Check optimization report
print(optimized_model.optimization_report)