import os, sys
import cv2
import numpy as np
sys.path.append('.')
import c4dynamics as c4d
from c4dynamics import pixelpoint
from typing import Optional
MODEL_SIZE = (416, 416, 3)  # YOLOv3 network input size: width, height, channels
class yolov3:
'''
YOLO: Real-Time Object Detection
:class:`yolov3` is a YOLOv3 (You Only Look Once) object detection model.
Though it is no longer the most accurate object detection algorithm,
YOLOv3 is still a very good choice when you need real-time detection
while maintaining excellent accuracy.
YOLOv3 processes an entire image in a single forward pass,
making it efficient for dynamic scenes.
Its key strength lies in its ability to simultaneously
predict bounding box coordinates and class probabilities
for multiple objects within an image.
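For the 80-class COCO configuration, each raw detection produced by the forward pass
is a single vector; its layout, as parsed by the :meth:`detect` method, is schematically:
.. code::
# layout of one raw YOLOv3 detection vector (85 values for 80 classes):
# d[0:4]  - x_center, y_center, width, height (relative to the frame size)
# d[4]    - objectness (confidence) score
# d[5:85] - per-class scores; the argmax gives the predicted class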
Parameters
==========
weights_path : str, optional
Path to the YOLOv3 weights file. Defaults to None, in which case the weights are fetched and cached by :mod:`c4dynamics.datasets`.
See Also
========
.filters
.pixelpoint
**Classes**
Using YOLOv3 provides object detection capability
with the 80 pre-trained classes
that come with the COCO dataset.
The following 80 classes are available using COCO's pre-trained weights:
.. admonition:: COCO dataset
person, bicycle, car, motorcycle, airplane, bus, train, truck, boat,
traffic light, fire hydrant, stop sign, parking meter, bench, bird, cat,
dog, horse, sheep, cow, elephant, bear, zebra, giraffe, backpack,
umbrella, handbag, tie, suitcase, frisbee, skis, snowboard, sports ball,
kite, baseball bat, baseball glove, skateboard, surfboard, tennis racket,
bottle, wine glass, cup, fork, knife, spoon, bowl, banana, apple,
sandwich, orange, broccoli, carrot, hot dog, pizza, donut, cake, chair,
couch, potted plant, bed, dining table, toilet, tv, laptop, mouse, remote,
keyboard, cell phone, microwave, oven, toaster, sink, refrigerator, book,
clock, vase, scissors, teddy bear, hair drier, toothbrush
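Note that the bundled weights follow the original Darknet naming convention,
in which a few labels differ from the COCO list above
(for example, 'aeroplane' rather than 'airplane').
The labels actually returned by :meth:`detect` are stored in the `class_names` attribute:
.. code::
>>> import c4dynamics as c4d
>>> c4d.detectors.yolov3.class_names[4]
'aeroplane'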
.. figure:: /_architecture/yolo-object-detection.jpg
*Figure 1*:
Object Detection with YOLO using COCO pre-trained classes 'dog', 'bicycle', 'truck'.
Read more at: `darknet-yolo <https://pjreddie.com/darknet/yolo/>`_ .
**Implementation (c4dynamics)**
The :class:`yolov3` class abstracts the complexities of model initialization,
input preprocessing, and output parsing.
The :meth:`detect` method returns a
:class:`pixelpoint <c4dynamics.states.lib.pixelpoint.pixelpoint>`
for each detected object.
The `pixelpoint` is a :mod:`predefined state class <c4dynamics.states.lib>`
representing a data point in a video frame with an associated bounding box.
Its methods and properties enhance the YOLOv3 output structure,
providing a convenient data structure for handling tracking missions.
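For orientation, these are the `pixelpoint` attributes used throughout the
examples below (a schematic summary, not the full API):
.. code::
# p = pts[0]            a pixelpoint returned by detect()
# p.x, p.y              center coordinates of the bounding box
# p.box                 ((top-left x, y), (bottom-right x, y))
# p.class_id            class label, e.g. 'aeroplane'
# p.fsize               frame size, (width, height)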
**Installation**
C4dynamics downloads
the YOLOv3 weights file
once, at the first call to :class:`yolov3`, and caches it.
For further details see :mod:`datasets <c4dynamics.datasets>`.
Alternatively, the user can provide a path to a custom
weights file using the parameter `weights_path`.
**Construction**
A YOLOv3 detector instance is created by making a direct call
to the `yolov3` constructor:
.. code::
>>> from c4dynamics.detectors import yolov3
>>> yolo3 = yolov3()
Fetched successfully
Initialization of the instance does not require any mandatory parameters.
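Alternatively, a sketch of construction with a custom weights file
(the path below is hypothetical):
.. code::
from c4dynamics.detectors import yolov3
# hypothetical path to a locally stored Darknet weights file:
yolo3 = yolov3(weights_path = 'path/to/yolov3.weights')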
Example
=======
The following snippet initializes the YOLOv3 model and
runs the `detect()` method on an image containing four airplanes.
The example uses the `datasets` module from `c4dynamics` to fetch an image.
For further details, see :mod:`c4dynamics.datasets`.
Import required packages:
.. code::
>>> import cv2
>>> import c4dynamics as c4d
>>> from matplotlib import pyplot as plt
Load YOLOv3 detector:
.. code::
>>> yolo3 = c4d.detectors.yolov3()
Fetched successfully
Fetch and read the image:
.. code::
>>> imagepath = c4d.datasets.image('planes')
Fetched successfully
>>> img = cv2.imread(imagepath)
Run YOLOv3 detector on an image:
.. code::
>>> pts = yolo3.detect(img)
Now `pts` consists of
:class:`pixelpoint <c4dynamics.states.lib.pixelpoint.pixelpoint>`
instances for each object detected in the frame.
Let's use the properties and methods of the `pixelpoint` class to
view the attributes of the detected objects:
.. code::
>>> def ptup(n): return '(' + str(n[0]) + ', ' + str(n[1]) + ')'
>>> print('{:^10} | {:^10} | {:^16} | {:^16} | {:^10} | {:^14}'.format('center x', 'center y', 'box top-left', 'box bottom-right', 'class', 'frame size')) # doctest: +IGNORE_OUTPUT
>>> for p in pts:
... print('{:^10d} | {:^10d} | {:^16} | {:^16} | {:^10} | {:^14}'.format(p.x, p.y, ptup(p.box[0]), ptup(p.box[1]), p.class_id, ptup(p.fsize))) # doctest: +IGNORE_OUTPUT
... cv2.rectangle(img, p.box[0], p.box[1], [0, 0, 0], 2) # doctest: +IGNORE_OUTPUT
... point = (int((p.box[0][0] + p.box[1][0]) / 2 - 75), p.box[1][1] + 22) # doctest: +IGNORE_OUTPUT
... cv2.putText(img, p.class_id, point, cv2.FONT_HERSHEY_SIMPLEX, 1, [0, 0, 0], 2) # doctest: +IGNORE_OUTPUT
center x | center y | box top-left | box bottom-right | class | frame size
615 | 295 | (562, 259) | (668, 331) | aeroplane | (1280, 720)
779 | 233 | (720, 199) | (838, 267) | aeroplane | (1280, 720)
635 | 189 | (578, 153) | (692, 225) | aeroplane | (1280, 720)
793 | 575 | (742, 540) | (844, 610) | aeroplane | (1280, 720)
.. code::
>>> plt.figure() # doctest: +IGNORE_OUTPUT
>>> plt.axis(False) # doctest: +IGNORE_OUTPUT
>>> plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)) # doctest: +IGNORE_OUTPUT
.. figure:: /_examples/yolov3/intro.png
'''
class_names = (
'person', 'bicycle', 'car', 'motorbike', 'aeroplane',
'bus', 'train', 'truck', 'boat', 'traffic light',
'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird',
'cat', 'dog', 'horse', 'sheep', 'cow',
'elephant', 'bear', 'zebra', 'giraffe', 'backpack',
'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat',
'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle',
'wine glass', 'cup', 'fork', 'knife', 'spoon',
'bowl', 'banana', 'apple', 'sandwich', 'orange',
'broccoli', 'carrot', 'hot dog', 'pizza', 'donut',
'cake', 'chair', 'sofa', 'pottedplant', 'bed',
'diningtable', 'toilet', 'tvmonitor', 'laptop', 'mouse',
'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
'toaster', 'sink', 'refrigerator', 'book', 'clock',
'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush')
_nms_th = 0.5 # default overlap threshold for non-maximum suppression
_confidence_th = 0.5 # default confidence threshold for filtering detections
def __init__(self, weights_path: Optional[str] = None) -> None:
errormsg = ''
if weights_path is None:
weights_path = c4d.datasets.nn_model('YOLOv3')
errormsg = "Try to clear the cache by 'c4dynamics.datasets.clear_cache()'"
if not os.path.exists(weights_path):
raise FileNotFoundError(f"The file 'yolov3.weights' does not "
f"exist in: '{weights_path}'. {errormsg}")
cfg_path = os.path.join(os.path.dirname(__file__), 'yolov3.cfg')
self.net = cv2.dnn.readNetFromDarknet(cfg_path, weights_path)
self.net.setPreferableBackend(cv2.dnn.DNN_BACKEND_OPENCV)
ln = self.net.getLayerNames()
self.ln = [ln[i - 1] for i in self.net.getUnconnectedOutLayers()]
@property
def nms_th(self) -> float:
'''
Gets and sets the Non-Maximum Suppression (NMS) threshold.
Bounding boxes that overlap a higher-confidence detection by more than this threshold are suppressed.
Parameters
----------
nms_th : float
The new overlap threshold for NMS during object detection.
Default: `nms_th = 0.5`.
Returns
-------
nms_th : float
The overlap threshold used for NMS during object detection.
Bounding boxes that overlap a higher-confidence detection by more than this threshold are suppressed.
Example
-------
Import required packages:
.. code::
>>> import c4dynamics as c4d
>>> from matplotlib import pyplot as plt
>>> import cv2
Fetch 'planes.png' using the c4dynamics datasets module (see :mod:`c4dynamics.datasets`):
.. code::
>>> impath = c4d.datasets.image('planes')
Fetched successfully
Load YOLOv3 detector and set 3 NMS threshold values to compare:
.. code::
>>> yolo3 = c4d.detectors.yolov3()
Fetched successfully
>>> nms_thresholds = [0.1, 0.5, 0.9]
Run the detector on each threshold:
.. code::
>>> _, axs = plt.subplots(1, 3)
>>> for i, nms_threshold in enumerate(nms_thresholds):
... yolo3.nms_th = nms_threshold
... img = cv2.imread(impath)
... pts = yolo3.detect(img)
... for p in pts:
... cv2.rectangle(img, p.box[0], p.box[1], [0, 255, 0], 2) # doctest: +IGNORE_OUTPUT
... axs[i].imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)) # doctest: +IGNORE_OUTPUT
... axs[i].set_title(f"NMS Threshold: {nms_threshold}", fontsize = 6)
... axs[i].axis('off') # doctest: +IGNORE_OUTPUT
.. figure:: /_examples/yolov3/nms_th.png
A high value (0.9) for the Non-Maximum Suppression (NMS) threshold here
leads to an increased number of bounding boxes around a single object.
When the NMS threshold is high, it means that a significant overlap is
required for two bounding boxes to be considered redundant,
and one of them will be suppressed.
To address this issue, it's essential to choose an appropriate
NMS threshold based on the characteristics of your dataset and the
level of overlap between objects.
A lower NMS threshold (e.g., 0.4 or 0.5)
is commonly used to suppress redundant boxes effectively
while retaining accurate detections.
Experimenting with different
threshold values and observing their impact on the results is crucial
for optimizing the performance of object detection models.
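For example, adopting a commonly used value before the next call to :meth:`detect`
is a plain property assignment:
.. code::
>>> yolo3.nms_th = 0.45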
'''
return self._nms_th
@nms_th.setter
def nms_th(self, val: float) -> None:
self._nms_th = val
@property
def confidence_th(self) -> float:
'''
Gets and sets the confidence threshold used in object detection.
Detected objects with confidence scores below this threshold are filtered out.
Parameters
----------
confidence_th : float
The new confidence threshold for object detection.
Default: `confidence_th = 0.5`.
Returns
-------
confidence_th : float
The confidence threshold for object detection.
Detected objects with confidence scores below this threshold are filtered out.
Example
-------
Import required packages:
.. code::
>>> import c4dynamics as c4d
>>> from matplotlib import pyplot as plt
>>> import cv2
Fetch 'planes.png' using the c4dynamics datasets module (see :mod:`c4dynamics.datasets`):
.. code::
>>> impath = c4d.datasets.image('planes')
Fetched successfully
Load YOLOv3 detector and set 3 confidence threshold values to compare:
.. code::
>>> yolo3 = c4d.detectors.yolov3()
Fetched successfully
>>> confidence_thresholds = [0.9, 0.95, 0.99]
Run the detector on each threshold:
.. code::
>>> _, axs = plt.subplots(1, 3)
>>> for i, confidence_threshold in enumerate(confidence_thresholds):
... yolo3.confidence_th = confidence_threshold
... img = cv2.imread(impath)
... pts = yolo3.detect(img)
... for p in pts:
... cv2.rectangle(img, p.box[0], p.box[1], [0, 255, 0], 2) # doctest: +IGNORE_OUTPUT
... axs[i].imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)) # doctest: +IGNORE_OUTPUT
... axs[i].set_title(f"Confidence Threshold: {confidence_threshold}", fontsize = 6)
... axs[i].axis('off') # doctest: +IGNORE_OUTPUT
.. figure:: /_examples/yolov3/confidence_th.png
Only a single object is missed, even with the confidence threshold set to 0.99,
which suggests that the model is highly confident in its predictions.
This level of performance is typically achievable when the model
has been trained on a diverse and representative dataset,
encompassing a wide variety of object instances, backgrounds,
and conditions.
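To restore the default threshold after the comparison:
.. code::
>>> yolo3.confidence_th = 0.5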
'''
return self._confidence_th
@confidence_th.setter
def confidence_th(self, val: float) -> None:
self._confidence_th = val
def detect(self, frame: np.ndarray) -> list[pixelpoint]:
'''
Detects objects in a frame using the YOLOv3 model.
At each call, the detector performs the following steps:
1. Preprocesses the frame by creating a blob, normalizing pixel values, and swapping Red and Blue channels.
2. Sets input to the YOLOv3 model and performs a forward pass to obtain detections.
3. Extracts detected objects based on a confidence threshold, calculates bounding box coordinates, and filters results using Non-Maximum Suppression (NMS).
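In outline, these steps reduce to the following calls (a condensed, non-runnable
sketch of this method's body):
.. code::
# condensed sketch of the detect() pipeline:
# blob = cv2.dnn.blobFromImage(frame, 1 / 255.0, (416, 416), swapRB = True, crop = False)
# net.setInput(blob); detections = net.forward(output_layer_names)
# keep detections scoring above confidence_th,
# then prune overlapping boxes with cv2.dnn.NMSBoxes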
Parameters
----------
frame : numpy.ndarray
An input frame for object detection.
Returns
-------
out : list[pixelpoint]
A list of :class:`pixelpoint <c4dynamics.states.lib.pixelpoint.pixelpoint>` objects representing detected objects,
each containing bounding box coordinates and class label.
Examples
--------
**Setup**
Import required packages:
.. code::
>>> import cv2 # opencv-python
>>> import c4dynamics as c4d
>>> from matplotlib import pyplot as plt
Fetch 'planes.png' and 'aerobatics.mp4' using the c4dynamics datasets module (see :mod:`c4dynamics.datasets`):
.. code::
>>> impath = c4d.datasets.image('planes')
Fetched successfully
>>> vidpath = c4d.datasets.video('aerobatics')
Fetched successfully
Load YOLOv3 detector:
.. code::
>>> yolo3 = c4d.detectors.yolov3()
Fetched successfully
Define an auxiliary formatting function:
.. code::
>>> def ptup(n): return '(' + str(n[0]) + ', ' + str(n[1]) + ')'
**Object detection in a single frame**
.. code::
>>> img = cv2.imread(impath)
>>> pts = yolo3.detect(img)
>>> for p in pts:
... cv2.rectangle(img, p.box[0], p.box[1], [0, 255, 0], 2) # doctest: +IGNORE_OUTPUT
.. code::
>>> plt.figure() # doctest: +IGNORE_OUTPUT
>>> plt.axis(False) # doctest: +IGNORE_OUTPUT
>>> plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)) # doctest: +IGNORE_OUTPUT
.. figure:: /_examples/yolov3/single_image.png
**Object detection in a video**
.. code::
>>> video_cap = cv2.VideoCapture(vidpath)
>>> while video_cap.isOpened():
... ret, frame = video_cap.read()
... if not ret: break
... pts = yolo3.detect(frame)
... for p in pts:
... cv2.rectangle(frame, p.box[0], p.box[1], [0, 255, 0], 2) # doctest: +IGNORE_OUTPUT
... cv2.imshow('YOLOv3', frame) # doctest: +IGNORE_OUTPUT
... cv2.waitKey(10) # doctest: +IGNORE_OUTPUT
.. figure:: /_examples/yolov3/aerobatics.gif
**The output structure**
The output of the :meth:`detect` method is a list of :class:`pixelpoint <c4dynamics.states.lib.pixelpoint.pixelpoint>` objects.
The :class:`pixelpoint <c4dynamics.states.lib.pixelpoint.pixelpoint>` has unique attributes for manipulating the detected object's class and
bounding box.
.. code::
>>> print('{:^10} | {:^10} | {:^10} | {:^16} | {:^16} | {:^10} | {:^14}' # doctest: +IGNORE_OUTPUT
... .format('# object', 'center x', 'center y', 'box top-left', 'box bottom-right', 'class', 'frame size'))
>>> # main loop:
>>> for i, p in enumerate(pts):
... print('{:^10d} | {:^10.3f} | {:^10.3f} | {:^16} | {:^16} | {:^10} | {:^14}'
... .format(i, p.x, p.y, ptup(p.box[0]), ptup(p.box[1]), p.class_id, ptup(p.fsize)))
... cv2.rectangle(img, p.box[0], p.box[1], [0, 0, 0], 2) # doctest: +IGNORE_OUTPUT
... point = (int((p.box[0][0] + p.box[1][0]) / 2 - 75), p.box[1][1] + 22)
... cv2.putText(img, p.class_id, point, cv2.FONT_HERSHEY_SIMPLEX, 1, [0, 0, 0], 2) # doctest: +IGNORE_OUTPUT
# object | center x | center y | box top-left | box bottom-right | class | frame size
0 | 0.584 | 0.376 | (691, 234) | (802, 306) | aeroplane | (1280, 720)
1 | 0.457 | 0.473 | (528, 305) | (642, 376) | aeroplane | (1280, 720)
2 | 0.471 | 0.322 | (542, 196) | (661, 267) | aeroplane | (1280, 720)
3 | 0.546 | 0.873 | (645, 588) | (752, 668) | aeroplane | (1280, 720)
.. code::
>>> plt.figure() # doctest: +IGNORE_OUTPUT
>>> plt.axis(False) # doctest: +IGNORE_OUTPUT
>>> plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)) # doctest: +IGNORE_OUTPUT
.. figure:: /_examples/yolov3/outformat.png
'''
#
# Step 1: Preprocess the Frame
# - Create a blob (binary large object) from the input frame with the
# specified dimensions
# - Normalize pixel values to a range of 0 to 1
# - Specify the dimensions of the input layer of the YOLOv3 model
# - Swap Red and Blue channels (BGR to RGB)
# - Set crop to False to preserve the original aspect ratio
##
blob = cv2.dnn.blobFromImage(frame, 1 / 255.0, (MODEL_SIZE[0], MODEL_SIZE[1]), swapRB = True, crop = False)
#
# Step 2: Set Input to the YOLOv3 Model and Perform Forward Pass
# - Set the blob as the input to the YOLOv3 model
# - Get the names of the output layers of the model
# - Perform a forward pass through the model to obtain detections
#
# The returning detection structure:
# 1) x_center
# 2) y_center
# 3) width
# 4) height
# 5) confidence score
# 6:end) probabilities for each class
##
self.net.setInput(blob)
detections = self.net.forward(self.ln)
#
# Step 3: Extract Detected Objects
# - Iterate through the detected objects in the forward pass results
# - Filter objects based on confidence threshold
# - Calculate bounding box coordinates and convert to integers
# - Append bounding box coordinates and class labels to respective lists
##
raw = [] # xc, yc, w, h
boxes = [] # top left x, top left y, width, height
classIDs = []
confidences = []
fheight, fwidth = frame.shape[:2]
for detection in detections:
for d in detection:
scores = d[5:]
classID = np.argmax(scores)
confidence = scores[classID]
if confidence > self._confidence_th: # filter out weak detections
box = d[:4] * [fwidth, fheight, fwidth, fheight] # relative (xc, yc, w, h) to pixels
x = box[0] - box[2] / 2 # top left x
y = box[1] - box[3] / 2 # top left y
boxes.append([x, y, box[2], box[3]]) # top left x, top left y, width, height
confidences.append(float(confidence))
classIDs.append(classID)
raw.append(d[:4])
indices = np.array(cv2.dnn.NMSBoxes(boxes, confidences, self._confidence_th, self._nms_th))
points_out = []
if len(indices) > 0:
for i in indices.ravel():
# construct a pixelpoint from the raw (relative) detection, scaled to pixel coordinates
pp = pixelpoint(x = int(raw[i][0] * fwidth), y = int(raw[i][1] * fheight), w = int(raw[i][2] * fwidth), h = int(raw[i][3] * fheight))
pp.fsize = (fwidth, fheight)
pp.class_id = self.class_names[classIDs[i]]
points_out.append(pp)
return points_out
if __name__ == "__main__":
import doctest, contextlib
from c4dynamics import IgnoreOutputChecker, cprint
# Register the custom OutputChecker
doctest.OutputChecker = IgnoreOutputChecker
tofile = False
optionflags = doctest.FAIL_FAST
if tofile:
with open(os.path.join('tests', '_out', 'output.txt'), 'w') as f:
with contextlib.redirect_stdout(f), contextlib.redirect_stderr(f):
result = doctest.testmod(optionflags = optionflags)
else:
result = doctest.testmod(optionflags = optionflags)
if result.failed == 0:
cprint(os.path.basename(__file__) + ": all tests passed!", 'g')
else:
print(f"{result.failed}")