Agent skill
computer-vision
Image processing, object detection, segmentation, and vision models. Use for image classification, object detection, or visual analysis tasks.
Stars
4
Forks
1
Install this agent skill to your Project
npx add-skill https://github.com/pluginagentmarketplace/custom-plugin-ai-data-scientist/tree/main/skills/computer-vision
SKILL.md
Computer Vision
Build models to analyze and understand visual data.
Quick Start
Image Classification
python
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
# Load a ResNet-50 with ImageNet weights. The `weights=` enum replaces the
# deprecated `pretrained=True` flag (torchvision >= 0.13); IMAGENET1K_V1 is
# the checkpoint that `pretrained=True` selected.
model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
model.eval()  # switch to evaluation mode before inference

# Preprocess image: resize, crop to 224x224, convert to a tensor and
# normalize each channel with the ImageNet mean/std.
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

# Force three channels: a grayscale or RGBA file would otherwise fail in
# Normalize, which is configured for exactly three channels. The context
# manager closes the underlying file once the pixels have been converted.
with Image.open('image.jpg') as image_file:
    img = image_file.convert('RGB')
img_tensor = transform(img).unsqueeze(0)  # add the batch dimension

# Predict (no gradients are needed for inference)
with torch.no_grad():
    output = model(img_tensor)

# Turn the logits of the single batch item into probabilities and keep
# the five most likely classes (values and class indices).
probabilities = torch.nn.functional.softmax(output[0], dim=0)
top5 = torch.topk(probabilities, 5)
print(top5)
Custom CNN
python
import torch.nn as nn
class SimpleCNN(nn.Module):
    """Small three-stage convolutional classifier for 3-channel images.

    Each stage is Conv2d(3x3, padding=1) -> ReLU -> MaxPool2d(2), so every
    stage halves the spatial size. The resulting feature map is adaptively
    average-pooled to 4x4 before the fully connected head, which lets the
    network accept any input of at least 8x8 pixels instead of only 32x32.
    For 32x32 inputs the feature map is already 4x4, so the pooling leaves
    it unchanged. The pooling layer has no parameters, so the state-dict
    keys are the same as without it.

    Args:
        num_classes: Number of output logits produced by the classifier.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            # Fix the spatial size expected by the first Linear layer,
            # independent of the input resolution.
            nn.AdaptiveAvgPool2d((4, 4)),
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(128 * 4 * 4, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes),
        )

    def forward(self, x):
        """Return class logits of shape (batch, num_classes) for images ``x``."""
        x = self.features(x)
        x = self.classifier(x)
        return x
Data Augmentation
python
from torchvision import transforms
# Training-time augmentation pipeline: random crop/flip/rotation and color
# jitter on the PIL image, then tensor conversion and per-channel
# normalization with the ImageNet mean and standard deviation.
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(size=224),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(degrees=15),
    # Positional order: brightness, contrast, saturation, hue.
    transforms.ColorJitter(0.2, 0.2, 0.2, 0.1),
    transforms.ToTensor(),
    # Positional order: mean, std.
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])
Object Detection with YOLO
python
from ultralytics import YOLO
# Load model
model = YOLO('yolov8n.pt')

# Predict
results = model('image.jpg')

# Process results
for result in results:
    for box in result.boxes:
        # The box fields are tensors; convert them to plain Python numbers
        # so the output shows values rather than tensor(...) reprs and the
        # class id is an integer instead of a float tensor.
        x1, y1, x2, y2 = box.xyxy[0].tolist()
        confidence = float(box.conf[0])
        class_id = int(box.cls[0])
        print(f"Class: {class_id}, Confidence: {confidence:.2f}")
        print(f"Box: ({x1}, {y1}, {x2}, {y2})")

# Save results (annotated image for the first, and here only, input)
results[0].save('output.jpg')
Image Segmentation
python
# Semantic segmentation with DeepLab: fetch the model through torch.hub
# and put it into evaluation mode.
model = torch.hub.load(
    repo_or_dir='pytorch/vision:v0.10.0',
    model='deeplabv3_resnet50',
    pretrained=True,
)
model = model.eval()

# Preprocess: tensor conversion followed by per-channel normalization
# (positional order: mean, std).
preprocess = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])
# `img` is expected to be defined already (e.g. loaded with PIL).
input_tensor = torch.unsqueeze(preprocess(img), 0)

# Predict: take the 'out' entry for the single batch item and pick the
# highest-scoring class index along the first (class) dimension.
with torch.no_grad():
    output = model(input_tensor)['out'][0]
    output_predictions = torch.argmax(output, dim=0)
Transfer Learning
python
import torch.nn as nn
import torch.optim as optim
from torchvision import models

# Number of classes in the target dataset. Placeholder value: set this to
# match your own data.
num_classes = 10

# Load pre-trained ResNet. The `weights=` enum replaces the deprecated
# `pretrained=True` flag (torchvision >= 0.13); IMAGENET1K_V1 is the
# checkpoint that `pretrained=True` selected.
model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)

# Freeze all layers so the pre-trained backbone is not updated
for param in model.parameters():
    param.requires_grad = False

# Replace final layer. The new Linear layer is created after the freeze,
# so its parameters are trainable by default.
num_features = model.fc.in_features
model.fc = nn.Linear(num_features, num_classes)

# Train only final layer: the optimizer receives just the new head's
# parameters.
optimizer = optim.Adam(model.fc.parameters(), lr=0.001)
Image Processing with OpenCV
python
import cv2
# Read image. cv2.imread signals failure by returning None instead of
# raising, so check explicitly to get a clear error here rather than a
# cryptic one in the first call that uses the image.
img = cv2.imread('image.jpg')
if img is None:
    raise FileNotFoundError("Could not read image: 'image.jpg'")

# Convert to grayscale
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

# Edge detection (lower and upper hysteresis thresholds)
edges = cv2.Canny(gray, 100, 200)

# Blur with a 5x5 Gaussian kernel; sigma 0 lets OpenCV derive it from the
# kernel size
blurred = cv2.GaussianBlur(img, (5, 5), 0)

# Resize
resized = cv2.resize(img, (224, 224))

# Draw rectangle (modifies img in place). x1, y1, x2, y2 must be defined
# beforehand, e.g. taken from a detector's bounding box. OpenCV requires
# integer pixel coordinates, so float or tensor values are converted.
cv2.rectangle(img, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 0), 2)

# Save. cv2.imwrite returns False on failure instead of raising.
if not cv2.imwrite('output.jpg', img):
    raise OSError("Could not write image: 'output.jpg'")
Face Detection
python
# Haar Cascade: load the frontal-face classifier shipped with OpenCV.
cascade_path = cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
face_cascade = cv2.CascadeClassifier(cascade_path)

# Detection runs on a grayscale copy; `img` is expected to be a BGR image
# that has already been loaded (e.g. with cv2.imread).
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=4)

# Each detection is (x, y, width, height); outline it on the original image.
for (x, y, w, h) in faces:
    top_left = (x, y)
    bottom_right = (x + w, y + h)
    cv2.rectangle(img, top_left, bottom_right, color=(255, 0, 0), thickness=2)
Common Architectures
Image Classification:
- ResNet: Skip connections, deep networks
- EfficientNet: Compound scaling, efficient
- Vision Transformer (ViT): Attention-based
Object Detection:
- YOLO: Real-time, one-stage
- Faster R-CNN: Two-stage, accurate
- RetinaNet: Focal loss, handles class imbalance
Segmentation:
- U-Net: Encoder-decoder, medical imaging
- DeepLab: Atrous convolution, semantic segmentation
- Mask R-CNN: Instance segmentation
Tips
- Use pre-trained models for transfer learning
- Apply data augmentation to prevent overfitting
- Normalize images with the ImageNet mean and standard deviation when using ImageNet-pretrained models
- Use appropriate loss functions (CrossEntropy, Focal Loss)
- Monitor training with visualizations (e.g., loss and accuracy curves)
- Test on diverse images
Didn't find the tool you were looking for?