CLIP (Contrastive Language-Image Pretraining) is a multimodal model that connects text and images. With InferX, you can run CLIP on any device using the same API, whether it's a Jetson, a GPU server, or a CPU-only system.
```python
from inferx.models.clip import clip
import json

# Initialize the model (automatically detects your hardware)
model = clip()

# Run inference
results = model.inference(
    image_paths=["path/to/image1.jpg", "path/to/image2.jpg"],
    text_queries=["a photo of a dog", "a photo of a cat", "a photo of a bird"]
)

# Print results
print(json.dumps(results, indent=2))
```
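The exact shape of `results` can vary between InferX releases, so inspect the printed JSON first. As a rough sketch, assuming each entry exposes the image path and a per-query similarity score under hypothetical `"image"` and `"scores"` keys, you could pick the best-matching query per image like this:

```python
# Illustrative post-processing sketch -- not the documented InferX schema.
# Assumes each result entry carries the image path and a mapping from
# text query to similarity score; check json.dumps(results, indent=2)
# for the actual structure on your version before relying on these keys.
for entry in results:
    scores = entry["scores"]          # assumed key
    best_query = max(scores, key=scores.get)
    print(f"{entry['image']}: {best_query} ({scores[best_query]:.3f})")
```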
You can pass `image_paths` either as a Python list of paths or as the path to a text file that lists one image per line:

```python
from inferx.models.clip import clip

# Process a list of images
images = [
    "path/to/image1.jpg",
    "path/to/image2.jpg",
    "path/to/image3.jpg"
]

# Or load images from a text file (one path per line)
images = "path/to/image_list.txt"

model = clip()
results = model.inference(
    image_paths=images,
    text_queries=["query1", "query2", "query3"]
)
```
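The text-file option expects nothing more than one image path per line. A minimal sketch for producing such a file from a Python list (the `image_list.txt` name is just the placeholder reused from the example above):

```python
# Write one image path per line; the resulting file can then be passed
# directly as image_paths="path/to/image_list.txt".
paths = ["path/to/image1.jpg", "path/to/image2.jpg", "path/to/image3.jpg"]
with open("path/to/image_list.txt", "w") as f:
    f.write("\n".join(paths) + "\n")
```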
To process every image in a directory, build the list of paths first and pass it to the same `inference` call:

```python
from inferx.models.clip import clip
import os

# Initialize model
model = clip()

# Process directory of images
image_directory = "path/to/images/"
image_paths = [
    os.path.join(image_directory, f)
    for f in os.listdir(image_directory)
    if f.endswith(('.jpg', '.png', '.jpeg'))
]

text_queries = [
    "a photo of a dog",
    "a photo of a cat",
    "a landscape photo",
    "a person walking"
]

results = model.inference(
    image_paths=image_paths,
    text_queries=text_queries
)
```
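Note that the `endswith` filter above is case-sensitive, so files such as `photo.JPG` would be skipped. A small variation using the standard-library `pathlib` module that matches extensions case-insensitively and returns the paths in a stable order:

```python
from pathlib import Path

image_directory = Path("path/to/images/")
valid_exts = {".jpg", ".jpeg", ".png"}

# Case-insensitive extension check; sorted for a deterministic order
image_paths = sorted(
    str(p) for p in image_directory.iterdir()
    if p.suffix.lower() in valid_exts
)
```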