The Edge Computing Challenge

Deploying computer vision models on edge devices—Raspberry Pi, NVIDIA Jetson, smartphones—requires balancing accuracy, speed, and power consumption. You're working with constrained compute, limited memory, and often battery power, yet users expect real-time performance at 30+ FPS.

This guide shares practical techniques I've learned deploying object detection models to hundreds of edge devices across retail, manufacturing, and security applications.

Model Selection for Edge Deployment

Architecture Trade-offs

  • YOLOv8 Nano: 6MB, 37.3 mAP50-95, 45 FPS on Jetson Nano
  • YOLOv8 Small: 22MB, 44.9 mAP50-95, 28 FPS on Jetson Nano
  • MobileNet SSD: 19MB, 35.2 mAP50, 30 FPS on Raspberry Pi 4
  • EfficientDet-Lite: 5.5MB, 33.5 mAP50, optimized for mobile

For most edge applications, YOLOv8 Nano provides the best balance. Its compact size and single-stage architecture make it ideal for resource-constrained environments.

Python
from ultralytics import YOLO

# Load YOLOv8 nano model
model = YOLO('yolov8n.pt')

# Validate performance
results = model.val(data='coco.yaml')
print(f"mAP50-95: {results.box.map:.3f}")
print(f"mAP50: {results.box.map50:.3f}")

# Export for edge deployment
model.export(format='onnx', simplify=True)
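
A quick sanity check on the exported file is worth the thirty seconds. Here's a minimal sketch using ONNX Runtime; the input name 'images' and the (1, 84, 8400) output shape match a default YOLOv8n export, but verify them for your own model:

Python
import numpy as np
import onnxruntime as ort

# Load the exported model and look up its input name
session = ort.InferenceSession('yolov8n.onnx', providers=['CPUExecutionProvider'])
input_name = session.get_inputs()[0].name  # 'images' for default YOLOv8 exports

# Push a dummy 640x640 frame through to confirm the graph runs
dummy = np.random.rand(1, 3, 640, 640).astype(np.float32)
outputs = session.run(None, {input_name: dummy})
print(f"Output shape: {outputs[0].shape}")  # (1, 84, 8400) for YOLOv8n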

Quantization: INT8 for 4x Speedup

Quantization reduces model precision from FP32 to INT8, cutting model size by roughly 75% (each weight shrinks from 4 bytes to 1) and speeding up inference 2-4x with minimal accuracy loss (typically <2% mAP drop).

Post-Training Quantization

Python
import cv2
import numpy as np
from ultralytics import YOLO
from onnxruntime.quantization import quantize_dynamic, QuantType

# Export to ONNX first
model = YOLO('yolov8n.pt')
model.export(format='onnx')

# Dynamic quantization (easiest method)
quantize_dynamic(
    model_input='yolov8n.onnx',
    model_output='yolov8n_int8.onnx',
    weight_type=QuantType.QInt8
)

# For better results: Static quantization with calibration data
from onnxruntime.quantization import quantize_static, CalibrationDataReader

class ImageDataReader(CalibrationDataReader):
    def __init__(self, calibration_images):
        self.images = calibration_images
        self.current = 0
    
    def get_next(self):
        if self.current >= len(self.images):
            return None
        img = self.preprocess(self.images[self.current])
        self.current += 1
        return {"images": img}  # 'images' is the default YOLOv8 ONNX input name
    
    def rewind(self):
        self.current = 0
    
    def preprocess(self, image):
        """Match inference preprocessing: RGB, 640x640, normalized NCHW."""
        img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        img = cv2.resize(img, (640, 640)).astype(np.float32) / 255.0
        return np.expand_dims(np.transpose(img, (2, 0, 1)), axis=0)

# Use representative calibration data (a few hundred images from your domain)
calibration_reader = ImageDataReader(calibration_images)

quantize_static(
    model_input='yolov8n.onnx',
    model_output='yolov8n_int8_static.onnx',
    calibration_data_reader=calibration_reader
)
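
A quick check that the 75% size claim holds for your model (the paths match the files written above):

Python
import os

# Compare FP32 and INT8 model sizes on disk
fp32_mb = os.path.getsize('yolov8n.onnx') / 1e6
int8_mb = os.path.getsize('yolov8n_int8.onnx') / 1e6
print(f"FP32: {fp32_mb:.1f} MB, INT8: {int8_mb:.1f} MB "
      f"({100 * (1 - int8_mb / fp32_mb):.0f}% smaller)")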

TensorRT Optimization for NVIDIA Devices

NVIDIA TensorRT provides dramatic speedups on Jetson devices through layer fusion, kernel auto-tuning, and precision calibration.

Python
import cv2
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # creates and activates a CUDA context on import

class TRTDetector:
    """TensorRT inference wrapper for YOLO."""
    
    def __init__(self, engine_path):
        # Load TensorRT engine
        self.logger = trt.Logger(trt.Logger.WARNING)
        with open(engine_path, 'rb') as f:
            runtime = trt.Runtime(self.logger)
            self.engine = runtime.deserialize_cuda_engine(f.read())
        
        self.context = self.engine.create_execution_context()
        
        # Allocate buffers
        self.allocate_buffers()
    
    def allocate_buffers(self):
        """Allocate GPU memory for input/output."""
        self.inputs = []
        self.outputs = []
        self.bindings = []
        
        for i in range(self.engine.num_io_tensors):
            tensor_name = self.engine.get_tensor_name(i)
            size = trt.volume(self.engine.get_tensor_shape(tensor_name))
            dtype = trt.nptype(self.engine.get_tensor_dtype(tensor_name))
            
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            self.bindings.append(int(device_mem))
            
            if self.engine.get_tensor_mode(tensor_name) == trt.TensorIOMode.INPUT:
                self.inputs.append({'host': host_mem, 'device': device_mem})
            else:
                self.outputs.append({'host': host_mem, 'device': device_mem})
    
    def infer(self, image):
        """Run inference on image."""
        # Preprocess
        input_data = self.preprocess(image)
        
        # Copy to GPU
        np.copyto(self.inputs[0]['host'], input_data.ravel())
        cuda.memcpy_htod(self.inputs[0]['device'], self.inputs[0]['host'])
        
        # Execute
        self.context.execute_v2(bindings=self.bindings)
        
        # Copy from GPU
        cuda.memcpy_dtoh(self.outputs[0]['host'], self.outputs[0]['device'])
        
        # Postprocess
        return self.postprocess(self.outputs[0]['host'])
    
    def preprocess(self, image):
        """Convert BGR to RGB, resize, and normalize to NCHW float32."""
        rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # OpenCV frames are BGR; YOLO expects RGB
        resized = cv2.resize(rgb, (640, 640))
        normalized = resized.astype(np.float32) / 255.0
        transposed = np.transpose(normalized, (2, 0, 1))  # HWC -> CHW
        batched = np.expand_dims(transposed, axis=0)
        return np.ascontiguousarray(batched)
    
    def postprocess(self, output):
        """Filter detections by confidence."""
        # The host buffer comes back flat; reshape assuming an end-to-end
        # engine whose rows are (x1, y1, x2, y2, score, class). A raw YOLOv8
        # export emits (1, 84, 8400) instead and needs decoding plus NMS.
        output = output.reshape(-1, 6)
        boxes = output[:, :4]
        scores = output[:, 4]
        classes = output[:, 5]
        
        # Filter by confidence
        mask = scores > 0.5
        return {
            'boxes': boxes[mask],
            'scores': scores[mask],
            'classes': classes[mask]
        }

# Convert ONNX to TensorRT
def build_trt_engine(onnx_path, engine_path, fp16=True):
    """Build TensorRT engine from ONNX model."""
    logger = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(logger)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    )
    parser = trt.OnnxParser(network, logger)
    
    # Parse ONNX
    with open(onnx_path, 'rb') as model:
        if not parser.parse(model.read()):
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            return None
    
    # Build config
    config = builder.create_builder_config()
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)  # 1GB
    
    # Enable FP16 for Jetson
    if fp16 and builder.platform_has_fast_fp16:
        config.set_flag(trt.BuilderFlag.FP16)
    
    # Build and save (build_serialized_network returns the serialized engine)
    serialized_engine = builder.build_serialized_network(network, config)
    with open(engine_path, 'wb') as f:
        f.write(serialized_engine)
    
    return serialized_engine

# Usage
build_trt_engine('yolov8n.onnx', 'yolov8n.trt', fp16=True)
detector = TRTDetector('yolov8n.trt')

Multi-threaded Real-time Pipeline

Separate capture, inference, and display into threads for maximum throughput:

Python
import cv2
import numpy as np
import threading
import queue
import time
from collections import deque

class RealtimeDetector:
    """Multi-threaded real-time object detection."""
    
    def __init__(self, model_path, camera_id=0):
        self.model = TRTDetector(model_path)
        self.camera_id = camera_id
        
        # Thread-safe queues
        self.frame_queue = queue.Queue(maxsize=2)
        self.result_queue = queue.Queue(maxsize=2)
        
        # FPS tracking
        self.fps_buffer = deque(maxlen=30)
        self.running = False
    
    def capture_thread(self):
        """Capture frames from camera."""
        cap = cv2.VideoCapture(self.camera_id)
        cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)  # Minimize lag
        
        while self.running:
            ret, frame = cap.read()
            if not ret:
                continue
            
            # Drop old frames
            if self.frame_queue.full():
                try:
                    self.frame_queue.get_nowait()
                except queue.Empty:
                    pass
            
            self.frame_queue.put(frame)
        
        cap.release()
    
    def inference_thread(self):
        """Run detection on frames.
        
        Note: pycuda.autoinit binds the CUDA context to the importing
        thread; when inference runs in a worker thread like this one,
        push/pop that context (or create one here) so TensorRT calls work.
        """
        while self.running:
            try:
                frame = self.frame_queue.get(timeout=1)
            except queue.Empty:
                continue
            
            # Detect objects
            start = time.time()
            detections = self.model.infer(frame)
            inference_time = time.time() - start
            
            # Calculate FPS
            fps = 1.0 / inference_time
            self.fps_buffer.append(fps)
            
            # Store result
            result = {
                'frame': frame,
                'detections': detections,
                'fps': np.mean(self.fps_buffer)
            }
            
            if self.result_queue.full():
                try:
                    self.result_queue.get_nowait()
                except queue.Empty:
                    pass
            
            self.result_queue.put(result)
    
    def display_thread(self):
        """Display results."""
        while self.running:
            try:
                result = self.result_queue.get(timeout=1)
            except queue.Empty:
                continue
            
            # Draw detections
            frame = self.draw_boxes(
                result['frame'],
                result['detections'],
                result['fps']
            )
            
            cv2.imshow('Detection', frame)
            
            if cv2.waitKey(1) & 0xFF == ord('q'):
                self.running = False
        
        cv2.destroyAllWindows()
    
    def draw_boxes(self, frame, detections, fps):
        """Draw bounding boxes on frame."""
        for box, score, cls in zip(
            detections['boxes'],
            detections['scores'],
            detections['classes']
        ):
            x1, y1, x2, y2 = map(int, box)
            
            # Draw box
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            
            # Draw label
            label = f"Class {int(cls)}: {score:.2f}"
            cv2.putText(frame, label, (x1, y1 - 10),
                       cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
        
        # Draw FPS
        cv2.putText(frame, f"FPS: {fps:.1f}", (10, 30),
                   cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
        
        return frame
    
    def run(self):
        """Start detection pipeline."""
        self.running = True
        
        # Start threads
        threads = [
            threading.Thread(target=self.capture_thread),
            threading.Thread(target=self.inference_thread),
            threading.Thread(target=self.display_thread)
        ]
        
        for t in threads:
            t.start()
        
        for t in threads:
            t.join()

# Run detector
detector = RealtimeDetector('yolov8n.trt', camera_id=0)
detector.run()

Optimization Techniques

💡 Performance Tips

  • Input resolution: Use 416x416 instead of 640x640 for 2.4x speedup
  • Confidence threshold: Increase to 0.5+ to reduce post-processing
  • NMS threshold: 0.45 balances accuracy and speed
  • Max detections: Limit to 100 objects per frame
  • Frame skipping: Process every 2nd frame for roughly 2x throughput
  • ROI processing: Detect only in regions of interest (both sketched below)
  • Batch processing: Process multiple frames together on GPU
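
The last two tips combine naturally. A minimal sketch, assuming the TRTDetector class from the TensorRT section; FRAME_STRIDE and the ROI coordinates are placeholders to tune for your scene:

Python
import cv2

detector = TRTDetector('yolov8n.trt')  # wrapper from the TensorRT section
cap = cv2.VideoCapture(0)

FRAME_STRIDE = 2            # run inference on every 2nd frame
ROI = (100, 100, 540, 440)  # (x1, y1, x2, y2) crop to detect in

frame_count = 0
last_detections = None

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    frame_count += 1
    
    # Frame skipping: detect on every FRAME_STRIDE-th frame and reuse
    # the previous detections in between
    if frame_count % FRAME_STRIDE == 0:
        # ROI processing: crop before inference to cut compute
        x1, y1, x2, y2 = ROI
        last_detections = detector.infer(frame[y1:y2, x1:x2])
        # Boxes are in (resized) crop coordinates; map them back to the
        # full frame before drawing

cap.release()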

Power Management for Battery Devices
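
Fanless and battery-powered devices throttle under sustained load, so it pays to step quality down yourself before the OS clamps the clocks. The class below reads the Linux sysfs thermal zone (present on Jetson and Raspberry Pi) and picks a performance mode from the temperature: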

Python
class AdaptiveDetector:
    """Adapt performance based on thermal/power state."""
    
    def __init__(self):
        self.modes = {
            'high': {'resolution': 640, 'fps': 30, 'confidence': 0.5},
            'medium': {'resolution': 416, 'fps': 20, 'confidence': 0.6},
            'low': {'resolution': 320, 'fps': 10, 'confidence': 0.7}
        }
        self.current_mode = 'high'
    
    def check_thermal(self):
        """Read device temperature (Linux sysfs; available on Jetson and Pi)."""
        try:
            with open('/sys/class/thermal/thermal_zone0/temp') as f:
                temp = float(f.read()) / 1000.0
            return temp
        except OSError:
            return 50.0  # Sensible default when no thermal sensor is exposed
    
    def adjust_performance(self):
        """Switch modes based on temperature."""
        temp = self.check_thermal()
        
        if temp > 80:
            self.current_mode = 'low'
        elif temp > 70:
            self.current_mode = 'medium'
        else:
            self.current_mode = 'high'
        
        return self.modes[self.current_mode]
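
One way to wire this into a detection loop (a sketch; polling sysfs once per second is plenty, and the ellipsis stands in for the actual detection code):

Python
import time

adaptive = AdaptiveDetector()
settings = adaptive.modes['high']
last_check = 0.0

while True:
    # Re-check the thermal state about once per second
    if time.time() - last_check > 1.0:
        settings = adaptive.adjust_performance()
        last_check = time.time()
    
    # Apply the current mode: resize input to settings['resolution'],
    # cap the loop at settings['fps'], filter at settings['confidence']
    ...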

Benchmarking Your Deployment
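
Always benchmark on the target device rather than your workstation, and look at percentiles as well as the mean: for live video, a stable P95 latency matters more than a fast average.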

Python
import time
import numpy as np

def benchmark_model(model, num_runs=100):
    """Benchmark model performance."""
    # Warmup with a dummy frame to stabilize clocks and lazy allocations.
    # infer() preprocesses internally, so pass an HWC image, not a tensor.
    dummy_input = np.random.randint(0, 256, (640, 640, 3), dtype=np.uint8)
    for _ in range(10):
        _ = model.infer(dummy_input)
    
    # Benchmark with a monotonic, high-resolution clock
    times = []
    for _ in range(num_runs):
        start = time.perf_counter()
        _ = model.infer(dummy_input)
        times.append(time.perf_counter() - start)
    
    times = np.array(times) * 1000  # to ms
    
    print(f"Mean: {np.mean(times):.2f} ms")
    print(f"Std: {np.std(times):.2f} ms")
    print(f"P50: {np.percentile(times, 50):.2f} ms")
    print(f"P95: {np.percentile(times, 95):.2f} ms")
    print(f"FPS: {1000 / np.mean(times):.1f}")

# Compare optimizations (assumes infer()-style wrappers for each backend)
print("FP32 ONNX:")
benchmark_model(onnx_model)

print("\nINT8 Quantized:")
benchmark_model(quantized_model)

print("\nTensorRT FP16:")
benchmark_model(trt_model)

Conclusion

Edge deployment requires systematic optimization: choose lightweight architectures, apply quantization, leverage hardware acceleration with TensorRT, and implement efficient pipelines.

The key is balancing the accuracy-speed-power triangle for your specific application. Not every use case needs 60 FPS at 640x640—often 30 FPS at 416x416 hits the sweet spot.

Resources

  • Ultralytics YOLOv8: https://docs.ultralytics.com/
  • NVIDIA TensorRT: https://docs.nvidia.com/deeplearning/tensorrt/
  • ONNX Runtime: https://onnxruntime.ai/
  • Jetson AI Courses: https://developer.nvidia.com/embedded/learn/jetson-ai-certification