Previous
Detect objects
Detection tells you where objects are in an image. Classification tells you what the entire image (or a region of it) contains. If you need to answer “is this a picture of a cat or a dog?” rather than “where are the cats and dogs in this picture?”, classification is the right tool. This how-to shows you how to get classifications from your vision service and use them in application logic.
A classification is a label paired with a confidence score. Unlike detections, classifications do not include spatial information – there are no bounding box coordinates. Each classification describes the image as a whole.
| Field | Type | Description |
|---|---|---|
| class_name | String | The label assigned by the model (for example, “cat”, “dog”, “empty”) |
| confidence | Float (0.0-1.0) | How confident the model is in this label |
A single classification call returns multiple results ranked by confidence. The count parameter controls how many top results to return.
Single-label classification assumes the image belongs to exactly one category. The confidences across all classes sum to approximately 1.0. If the model says “cat: 0.85”, it implicitly says “not-dog: 0.15”. Most classification models work this way.
Multi-label classification allows the image to belong to multiple categories simultaneously. An image could be classified as both “outdoor: 0.92” and “sunny: 0.88” at the same time. The confidences are independent and do not sum to 1.0.
The API is the same for both. The difference is in how the model was trained and how you interpret the results.
The count parameter tells the vision service how many top classifications to return. If count=3, you get the three highest-confidence labels. This is useful when you want to see how confidence is distributed across the most likely labels rather than only the single best one.
| Use classification when | Use detection when |
|---|---|
| You care about the whole scene | You care about individual objects |
| You need a yes/no or category answer | You need object locations |
| There is one dominant subject | There are multiple objects to find |
| You want to sort or categorize | You want to count or track |
You can use both on the same camera. Run a classifier for scene-level understanding and a detector for object-level detail.
The simplest approach lets the vision service capture an image and classify it in one call.
import asyncio
from viam.robot.client import RobotClient
from viam.services.vision import VisionClient


async def main():
    """Connect to the machine and print the top three classifications."""
    dial_options = RobotClient.Options.with_api_key(
        api_key="YOUR-API-KEY",
        api_key_id="YOUR-API-KEY-ID",
    )
    machine = await RobotClient.at_address("YOUR-MACHINE-ADDRESS", dial_options)
    classifier = VisionClient.from_robot(machine, "my-classifier")

    # One call: the vision service captures a frame and classifies it.
    results = await classifier.get_classifications_from_camera(
        "my-camera", count=3
    )
    for result in results:
        print(f"{result.class_name}: {result.confidence:.2f}")

    await machine.close()


if __name__ == "__main__":
    asyncio.run(main())
package main

import (
	"context"
	"fmt"

	"go.viam.com/rdk/logging"
	"go.viam.com/rdk/robot/client"
	"go.viam.com/rdk/services/vision"
	"go.viam.com/utils/rpc"
)

func main() {
	ctx := context.Background()
	logger := logging.NewLogger("classify")

	// API-key credentials for dialing the machine.
	creds := rpc.Credentials{
		Type:    rpc.CredentialsTypeAPIKey,
		Payload: "YOUR-API-KEY",
	}
	robot, err := client.New(ctx, "YOUR-MACHINE-ADDRESS", logger,
		client.WithDialOptions(rpc.WithEntityCredentials("YOUR-API-KEY-ID", creds)),
	)
	if err != nil {
		logger.Fatal(err)
	}
	defer robot.Close(ctx)

	classifier, err := vision.FromProvider(robot, "my-classifier")
	if err != nil {
		logger.Fatal(err)
	}

	// One call: the vision service captures a frame and classifies it.
	results, err := classifier.ClassificationsFromCamera(ctx, "my-camera", 3, nil)
	if err != nil {
		logger.Fatal(err)
	}
	for _, result := range results {
		fmt.Printf("%s: %.2f\n", result.Label(), result.Score())
	}
}
If you have an image from a previous capture or a file, classify it directly.
from viam.components.camera import Camera
from viam.services.vision import VisionClient
camera = Camera.from_robot(robot, "my-camera")
classifier = VisionClient.from_robot(robot, "my-classifier")
# Capture images from the camera
images, _ = await camera.get_images()
# Classify the first image
classifications = await classifier.get_classifications(images[0], count=5)
for c in classifications:
print(f"{c.class_name}: {c.confidence:.2f}")
// Resolve the camera and classifier, then classify a captured frame.
cam, err := camera.FromProvider(machine, "my-camera")
if err != nil {
	logger.Fatal(err)
}
classifier, err := vision.FromProvider(machine, "my-classifier")
if err != nil {
	logger.Fatal(err)
}

// Capture a frame set and decode the first frame.
frames, _, err := cam.Images(ctx, nil, nil)
if err != nil {
	logger.Fatal(err)
}
frame, err := frames[0].Image(ctx)
if err != nil {
	logger.Fatal(err)
}

// Ask for the five highest-confidence labels.
results, err := classifier.Classifications(ctx, frame, 5, nil)
if err != nil {
	logger.Fatal(err)
}
for _, result := range results {
	fmt.Printf("%s: %.2f\n", result.Label(), result.Score())
}
Classification is most useful when it drives application logic. Use the top classification to branch your program’s behavior.
classifications = await classifier.get_classifications_from_camera(
"my-camera", count=1
)
if not classifications:
print("No classification result")
elif classifications[0].confidence < 0.5:
print(f"Uncertain: {classifications[0].class_name} "
f"({classifications[0].confidence:.2f})")
else:
label = classifications[0].class_name
confidence = classifications[0].confidence
print(f"Classified as: {label} ({confidence:.2f})")
if label == "defective":
print("Triggering rejection mechanism")
# Add your action here
elif label == "good":
print("Part passes inspection")
classifications, err := classifier.ClassificationsFromCamera(
ctx, "my-camera", 1, nil,
)
if err != nil {
logger.Fatal(err)
}
if len(classifications) == 0 {
fmt.Println("No classification result")
} else if classifications[0].Score() < 0.5 {
fmt.Printf("Uncertain: %s (%.2f)\n",
classifications[0].Label(), classifications[0].Score())
} else {
label := classifications[0].Label()
confidence := classifications[0].Score()
fmt.Printf("Classified as: %s (%.2f)\n", label, confidence)
switch label {
case "defective":
fmt.Println("Triggering rejection mechanism")
// Add your action here
case "good":
fmt.Println("Part passes inspection")
}
}
Run classifications in a loop to monitor a scene over time. Track when the classification changes.
import asyncio
import time
classifier = VisionClient.from_robot(robot, "my-classifier")
previous_label = ""
while True:
classifications = await classifier.get_classifications_from_camera(
"my-camera", count=1
)
if classifications and classifications[0].confidence >= 0.6:
current_label = classifications[0].class_name
confidence = classifications[0].confidence
if current_label != previous_label:
print(f"Scene changed: {previous_label or 'unknown'} "
f"-> {current_label} ({confidence:.2f})")
previous_label = current_label
else:
if previous_label:
print(f"Scene uncertain (was: {previous_label})")
await asyncio.sleep(0.5)
classifier, err := vision.FromProvider(machine, "my-classifier")
if err != nil {
	logger.Fatal(err)
}

previousLabel := ""
for {
	results, err := classifier.ClassificationsFromCamera(
		ctx, "my-camera", 1, nil,
	)
	if err != nil {
		// A transient failure should not kill the monitor loop.
		logger.Error(err)
		time.Sleep(time.Second)
		continue
	}
	// Only trust a reading at or above the 0.6 confidence floor.
	if len(results) > 0 && results[0].Score() >= 0.6 {
		currentLabel := results[0].Label()
		score := results[0].Score()
		if currentLabel != previousLabel {
			// Treat an empty previous label as "unknown" (first confident
			// reading, or after an uncertain spell).
			from := previousLabel
			if from == "" {
				from = "unknown"
			}
			fmt.Printf("Scene changed: %s -> %s (%.2f)\n",
				from, currentLabel, score)
			previousLabel = currentLabel
		}
	} else if previousLabel != "" {
		fmt.Printf("Scene uncertain (was: %s)\n", previousLabel)
	}
	time.Sleep(500 * time.Millisecond)
}
You can add a transform camera that overlays classification results directly on the camera feed. This is useful for debugging and monitoring without writing code.
Add a transform camera to your configuration:
{
"name": "classified-feed",
"api": "rdk:component:camera",
"model": "transform",
"attributes": {
"source": "my-camera",
"pipeline": [
{
"type": "classifications",
"attributes": {
"classifier_name": "my-classifier",
"confidence_threshold": 0.5,
"max_classifications": 3
}
}
]
}
}
After saving, view the classified-feed camera in the CONTROL tab. You will see classification labels overlaid on the camera image.
If you need the image and its classifications together in one call, use CaptureAllFromCamera. This is more efficient than separate calls and ensures the classifications correspond exactly to the returned image. See Detect Objects, step 7 for a full example.
Try increasing the count parameter to 5 or 10 and observe how confidence distributes across labels.

Was this page helpful?
Glad to hear it! If you have any other feedback please let us know:
We're sorry about that. To help us improve, please tell us what we can do better:
Thank you!