The Edge Computing Challenge
Deploying computer vision models on edge devices—Raspberry Pi, NVIDIA Jetson, smartphones—requires balancing accuracy, speed, and power consumption. You're working with constrained compute, limited memory, and often battery power, yet users expect real-time performance at 30+ FPS.
This guide shares practical techniques I've learned deploying object detection models to hundreds of edge devices across retail, manufacturing, and security applications.
Model Selection for Edge Deployment
Architecture Trade-offs
- YOLOv8 Nano: 6MB, 37.3 mAP50-95 (COCO), 45 FPS on Jetson Nano
- YOLOv8 Small: 22MB, 44.9 mAP50-95 (COCO), 28 FPS on Jetson Nano
- MobileNet SSD: 19MB, 35.2 mAP50, 30 FPS on Raspberry Pi 4
- EfficientDet-Lite: 5.5MB, 33.5 mAP50, optimized for mobile
For most edge applications, YOLOv8 Nano provides the best balance. Its compact size and single-stage architecture make it ideal for resource-constrained environments.
from ultralytics import YOLO
# Load YOLOv8 nano model
model = YOLO('yolov8n.pt')
# Validate performance
results = model.val(data='coco.yaml')
print(f"mAP50-95: {results.box.map:.3f}")
print(f"mAP50: {results.box.map50:.3f}")
# Export for edge deployment
model.export(format='onnx', simplify=True)
Quantization: INT8 for a 2-4x Speedup
Quantization reduces model precision from FP32 to INT8, cutting model size by 75% and speeding up inference 2-4x with minimal accuracy loss (typically <2% mAP drop).
Post-Training Quantization
from ultralytics import YOLO
from onnxruntime.quantization import quantize_dynamic, QuantType

# Export to ONNX first
model = YOLO('yolov8n.pt')
model.export(format='onnx')

# Dynamic quantization (easiest method; note it mainly quantizes
# MatMul/Gemm weights, so CNNs usually gain more from static quantization)
quantize_dynamic(
    model_input='yolov8n.onnx',
    model_output='yolov8n_int8.onnx',
    weight_type=QuantType.QInt8
)
# For better results: Static quantization with calibration data
import cv2
import numpy as np
from onnxruntime.quantization import quantize_static, CalibrationDataReader

class ImageDataReader(CalibrationDataReader):
    def __init__(self, calibration_images):
        # calibration_images: list of image file paths from your target domain
        self.images = calibration_images
        self.current = 0

    def preprocess(self, path):
        # Must mirror the model's inference preprocessing:
        # resize, scale to [0, 1], HWC -> CHW, add batch dimension
        img = cv2.resize(cv2.imread(path), (640, 640))
        img = img.astype(np.float32) / 255.0
        img = np.transpose(img, (2, 0, 1))[np.newaxis, ...]
        return np.ascontiguousarray(img)

    def get_next(self):
        if self.current >= len(self.images):
            return None
        img = self.preprocess(self.images[self.current])
        self.current += 1
        return {"images": img}  # "images" is the YOLOv8 ONNX input name

    def rewind(self):
        self.current = 0

# Use representative calibration data (a few hundred images is typical)
calibration_reader = ImageDataReader(calibration_images)
quantize_static(
    model_input='yolov8n.onnx',
    model_output='yolov8n_int8_static.onnx',
    calibration_data_reader=calibration_reader
)
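Before deploying, it's worth sanity-checking the INT8 model against the FP32 baseline. Here is a minimal sketch using onnxruntime, assuming the export kept the default 'images' input name:
import os
import numpy as np
import onnxruntime as ort

sess_fp32 = ort.InferenceSession('yolov8n.onnx', providers=['CPUExecutionProvider'])
sess_int8 = ort.InferenceSession('yolov8n_int8_static.onnx', providers=['CPUExecutionProvider'])

# One random input is enough for a smoke test; use real images for accuracy checks
x = np.random.rand(1, 3, 640, 640).astype(np.float32)
out_fp32 = sess_fp32.run(None, {'images': x})[0]
out_int8 = sess_int8.run(None, {'images': x})[0]

print(f"Size: {os.path.getsize('yolov8n.onnx') / 1e6:.1f} MB -> "
      f"{os.path.getsize('yolov8n_int8_static.onnx') / 1e6:.1f} MB")
print(f"Max output delta: {np.abs(out_fp32 - out_int8).max():.4f}")
Keep the delta in perspective: what ultimately matters is mAP on your validation set, not raw output differences.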
TensorRT Optimization for NVIDIA Devices
NVIDIA TensorRT provides dramatic speedups on Jetson devices through layer fusion, kernel auto-tuning, and precision calibration.
import cv2
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # creates a CUDA context on import
class TRTDetector:
"""TensorRT inference wrapper for YOLO."""
def __init__(self, engine_path):
# Load TensorRT engine
self.logger = trt.Logger(trt.Logger.WARNING)
with open(engine_path, 'rb') as f:
runtime = trt.Runtime(self.logger)
self.engine = runtime.deserialize_cuda_engine(f.read())
self.context = self.engine.create_execution_context()
# Allocate buffers
self.allocate_buffers()
def allocate_buffers(self):
"""Allocate GPU memory for input/output."""
self.inputs = []
self.outputs = []
self.bindings = []
for i in range(self.engine.num_io_tensors):
tensor_name = self.engine.get_tensor_name(i)
size = trt.volume(self.engine.get_tensor_shape(tensor_name))
dtype = trt.nptype(self.engine.get_tensor_dtype(tensor_name))
# Allocate host and device buffers
host_mem = cuda.pagelocked_empty(size, dtype)
device_mem = cuda.mem_alloc(host_mem.nbytes)
self.bindings.append(int(device_mem))
if self.engine.get_tensor_mode(tensor_name) == trt.TensorIOMode.INPUT:
self.inputs.append({'host': host_mem, 'device': device_mem})
else:
self.outputs.append({'host': host_mem, 'device': device_mem})
def infer(self, image):
"""Run inference on image."""
# Preprocess
input_data = self.preprocess(image)
# Copy to GPU
np.copyto(self.inputs[0]['host'], input_data.ravel())
cuda.memcpy_htod(self.inputs[0]['device'], self.inputs[0]['host'])
# Execute
self.context.execute_v2(bindings=self.bindings)
# Copy from GPU
cuda.memcpy_dtoh(self.outputs[0]['host'], self.outputs[0]['device'])
# Postprocess
return self.postprocess(self.outputs[0]['host'])
def preprocess(self, image):
"""Resize and normalize image."""
resized = cv2.resize(image, (640, 640))
normalized = resized.astype(np.float32) / 255.0
transposed = np.transpose(normalized, (2, 0, 1))
batched = np.expand_dims(transposed, axis=0)
return np.ascontiguousarray(batched)
    def postprocess(self, output):
        """Filter detections by confidence.

        Assumes the engine was built from an ONNX export with NMS baked in,
        yielding rows of [x1, y1, x2, y2, score, class]; a raw YOLOv8 head
        outputs (1, 84, 8400) and needs a transpose plus NMS first.
        """
        output = output.reshape(-1, 6)
        boxes = output[:, :4]
        scores = output[:, 4]
        classes = output[:, 5]
        # Filter by confidence
        mask = scores > 0.5
        return {
            'boxes': boxes[mask],
            'scores': scores[mask],
            'classes': classes[mask]
        }
# Convert ONNX to TensorRT
def build_trt_engine(onnx_path, engine_path, fp16=True):
"""Build TensorRT engine from ONNX model."""
logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
network = builder.create_network(
1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
)
parser = trt.OnnxParser(network, logger)
# Parse ONNX
with open(onnx_path, 'rb') as model:
if not parser.parse(model.read()):
for error in range(parser.num_errors):
print(parser.get_error(error))
return None
# Build config
config = builder.create_builder_config()
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30) # 1GB
# Enable FP16 for Jetson
if fp16 and builder.platform_has_fast_fp16:
config.set_flag(trt.BuilderFlag.FP16)
    # Build and save the serialized engine
    serialized_engine = builder.build_serialized_network(network, config)
    if serialized_engine is None:
        print("Engine build failed")
        return None
    with open(engine_path, 'wb') as f:
        f.write(serialized_engine)
    return serialized_engine
# Usage
build_trt_engine('yolov8n.onnx', 'yolov8n.trt', fp16=True)
detector = TRTDetector('yolov8n.trt')
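A quick smoke test of the wrapper before wiring it into a pipeline ('test.jpg' is a placeholder path):
import cv2

image = cv2.imread('test.jpg')  # any representative image from your domain
detections = detector.infer(image)
print(f"{len(detections['boxes'])} objects above the 0.5 confidence threshold")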
Multi-threaded Real-time Pipeline
Separate capture, inference, and display into threads for maximum throughput:
import cv2
import threading
import queue
import time
import numpy as np
from collections import deque
class RealtimeDetector:
"""Multi-threaded real-time object detection."""
def __init__(self, model_path, camera_id=0):
self.model = TRTDetector(model_path)
self.camera_id = camera_id
# Thread-safe queues
self.frame_queue = queue.Queue(maxsize=2)
self.result_queue = queue.Queue(maxsize=2)
# FPS tracking
self.fps_buffer = deque(maxlen=30)
self.running = False
def capture_thread(self):
"""Capture frames from camera."""
cap = cv2.VideoCapture(self.camera_id)
cap.set(cv2.CAP_PROP_BUFFERSIZE, 1) # Minimize lag
while self.running:
ret, frame = cap.read()
if not ret:
continue
# Drop old frames
if self.frame_queue.full():
try:
self.frame_queue.get_nowait()
except queue.Empty:
pass
self.frame_queue.put(frame)
cap.release()
def inference_thread(self):
"""Run detection on frames."""
while self.running:
try:
frame = self.frame_queue.get(timeout=1)
except queue.Empty:
continue
# Detect objects
start = time.time()
detections = self.model.infer(frame)
inference_time = time.time() - start
# Calculate FPS
fps = 1.0 / inference_time
self.fps_buffer.append(fps)
# Store result
result = {
'frame': frame,
'detections': detections,
'fps': np.mean(self.fps_buffer)
}
if self.result_queue.full():
try:
self.result_queue.get_nowait()
except queue.Empty:
pass
self.result_queue.put(result)
    def display_thread(self):
        """Display results (cv2 GUI calls from a worker thread work on
        most Linux builds; on macOS keep imshow on the main thread)."""
while self.running:
try:
result = self.result_queue.get(timeout=1)
except queue.Empty:
continue
# Draw detections
frame = self.draw_boxes(
result['frame'],
result['detections'],
result['fps']
)
cv2.imshow('Detection', frame)
if cv2.waitKey(1) & 0xFF == ord('q'):
self.running = False
cv2.destroyAllWindows()
    def draw_boxes(self, frame, detections, fps):
        """Draw bounding boxes on frame. Assumes boxes are already in
        frame coordinates; rescale from the 640x640 model input space
        if your postprocess does not handle this."""
for box, score, cls in zip(
detections['boxes'],
detections['scores'],
detections['classes']
):
x1, y1, x2, y2 = map(int, box)
# Draw box
cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
# Draw label
label = f"Class {int(cls)}: {score:.2f}"
cv2.putText(frame, label, (x1, y1 - 10),
cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
# Draw FPS
cv2.putText(frame, f"FPS: {fps:.1f}", (10, 30),
cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
return frame
def run(self):
"""Start detection pipeline."""
self.running = True
# Start threads
threads = [
threading.Thread(target=self.capture_thread),
threading.Thread(target=self.inference_thread),
threading.Thread(target=self.display_thread)
]
for t in threads:
t.start()
for t in threads:
t.join()
# Run detector
detector = RealtimeDetector('yolov8n.trt', camera_id=0)
detector.run()
Optimization Techniques
💡 Performance Tips
- Input resolution: Use 416x416 instead of 640x640 for a ~2.4x speedup
- Confidence threshold: Increase to 0.5+ to reduce post-processing
- NMS threshold: 0.45 balances accuracy and speed
- Max detections: Limit to 100 objects per frame
- Frame skipping: Process every 2nd frame for 2x throughput (sketched below)
- ROI processing: Detect only in regions of interest (sketched below)
- Batch processing: Process multiple frames together on GPU
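To make the frame-skipping and ROI tips concrete, here is a minimal sketch combining the two; the ROI coordinates and skip interval are placeholders to tune for your scene, and the detector is the TensorRT wrapper from above:
import cv2

SKIP_INTERVAL = 2              # run detection on every 2nd frame
ROI = (100, 100, 540, 380)     # placeholder region of interest: x1, y1, x2, y2

detector = TRTDetector('yolov8n.trt')
cap = cv2.VideoCapture(0)
frame_idx = 0
last_detections = None

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    if frame_idx % SKIP_INTERVAL == 0:
        x1, y1, x2, y2 = ROI
        last_detections = detector.infer(frame[y1:y2, x1:x2])
        # Detected boxes are relative to the crop; offset by (x1, y1)
        # before drawing them on the full frame.
    # On skipped frames, reuse last_detections (or update a lightweight tracker)
    frame_idx += 1
cap.release()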
Power Management for Battery Devices
class AdaptiveDetector:
"""Adapt performance based on thermal/power state."""
def __init__(self):
self.modes = {
'high': {'resolution': 640, 'fps': 30, 'confidence': 0.5},
'medium': {'resolution': 416, 'fps': 20, 'confidence': 0.6},
'low': {'resolution': 320, 'fps': 10, 'confidence': 0.7}
}
self.current_mode = 'high'
    def check_thermal(self):
        """Read device temperature via Linux sysfs (path varies by board)."""
        try:
            with open('/sys/class/thermal/thermal_zone0/temp') as f:
                return float(f.read()) / 1000.0
        except (OSError, ValueError):
            return 50.0  # Conservative default if the zone is unreadable
def adjust_performance(self):
"""Switch modes based on temperature."""
temp = self.check_thermal()
if temp > 80:
self.current_mode = 'low'
elif temp > 70:
self.current_mode = 'medium'
else:
self.current_mode = 'high'
return self.modes[self.current_mode]
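A sketch of how the adaptive modes could drive a capture loop; the five-second polling interval and the print-out are placeholders for wiring the settings into your pipeline:
import time

adaptive = AdaptiveDetector()
while True:
    settings = adaptive.adjust_performance()
    print(f"{adaptive.current_mode}: {settings['resolution']}px, "
          f"target {settings['fps']} FPS, conf >= {settings['confidence']}")
    time.sleep(5)  # re-read the thermal zone every few seconds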
Benchmarking Your Deployment
def benchmark_model(model, num_runs=100):
"""Benchmark model performance."""
    # Warmup with a dummy image (HWC uint8, matching what infer() expects)
    dummy_input = np.random.randint(0, 255, (640, 640, 3), dtype=np.uint8)
    for _ in range(10):
        _ = model.infer(dummy_input)
# Benchmark
times = []
for _ in range(num_runs):
start = time.time()
_ = model.infer(dummy_input)
times.append(time.time() - start)
times = np.array(times) * 1000 # to ms
print(f"Mean: {np.mean(times):.2f} ms")
print(f"Std: {np.std(times):.2f} ms")
print(f"P50: {np.percentile(times, 50):.2f} ms")
print(f"P95: {np.percentile(times, 95):.2f} ms")
print(f"FPS: {1000 / np.mean(times):.1f}")
# Compare optimizations (each model wrapped in an object exposing .infer())
print("FP32 ONNX:")
benchmark_model(onnx_model)
print("\nINT8 Quantized:")
benchmark_model(quantized_model)
print("\nTensorRT FP16:")
benchmark_model(trt_model)
Conclusion
Edge deployment requires systematic optimization: choose lightweight architectures, apply quantization, leverage hardware acceleration with TensorRT, and implement efficient pipelines.
The key is balancing the accuracy-speed-power triangle for your specific application. Not every use case needs 60 FPS at 640x640—often 30 FPS at 416x416 hits the sweet spot.
Resources
- Ultralytics YOLOv8: https://docs.ultralytics.com/
- NVIDIA TensorRT: https://docs.nvidia.com/deeplearning/tensorrt/
- ONNX Runtime: https://onnxruntime.ai/
- Jetson AI Courses: https://developer.nvidia.com/embedded/learn/jetson-ai-certification