Add error handling for training

Nicolas Mowen 2025-11-09 13:34:39 -07:00
parent 292d024aac
commit 6c47a131e4


@@ -130,7 +130,8 @@ class ClassificationTrainingProcess(FrigateProcess):
     def run(self) -> None:
         self.pre_run_setup()
-        self.__train_classification_model()
+        success = self.__train_classification_model()
+        exit(0 if success else 1)
 
     def __generate_representative_dataset_factory(self, dataset_dir: str):
         def generate_representative_dataset():
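
Note: the hunk above makes the training subprocess report failure through its exit status. The sketch below isolates that pattern, using a plain multiprocessing.Process in place of Frigate's FrigateProcess wrapper (an assumption for illustration; train() and run() here are hypothetical stand-ins for the real methods):

import multiprocessing as mp
import sys


def train() -> bool:
    # stand-in for __train_classification_model(); the real method
    # returns False when training fails
    return True


def run() -> None:
    success = train()
    # a plain return would always produce exitcode 0; exiting explicitly
    # lets the parent distinguish success from a handled failure
    sys.exit(0 if success else 1)


if __name__ == "__main__":
    proc = mp.Process(target=run)
    proc.start()
    proc.join()
    # exitcode: 0 on success, 1 on handled failure, negative if killed by a signal
    print("training succeeded:", proc.exitcode == 0)

Exiting explicitly (sys.exit here, the exit builtin in the diff) is what lets a boolean result survive the process boundary.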
@@ -153,89 +154,117 @@ class ClassificationTrainingProcess(FrigateProcess):
     @redirect_output_to_logger(logger, logging.DEBUG)
     def __train_classification_model(self) -> bool:
         """Train a classification model."""
-        # import in the function so that tensorflow is not initialized multiple times
-        import tensorflow as tf
-        from tensorflow.keras import layers, models, optimizers
-        from tensorflow.keras.applications import MobileNetV2
-        from tensorflow.keras.preprocessing.image import ImageDataGenerator
-
-        logger.info(f"Kicking off classification training for {self.model_name}.")
-        dataset_dir = os.path.join(CLIPS_DIR, self.model_name, "dataset")
-        model_dir = os.path.join(MODEL_CACHE_DIR, self.model_name)
-        os.makedirs(model_dir, exist_ok=True)
-
-        num_classes = len(
-            [
-                d
-                for d in os.listdir(dataset_dir)
-                if os.path.isdir(os.path.join(dataset_dir, d))
-            ]
-        )
-
-        # Start with imagenet base model with 35% of channels in each layer
-        base_model = MobileNetV2(
-            input_shape=(224, 224, 3),
-            include_top=False,
-            weights="imagenet",
-            alpha=0.35,
-        )
-        base_model.trainable = False  # Freeze pre-trained layers
-
-        model = models.Sequential(
-            [
-                base_model,
-                layers.GlobalAveragePooling2D(),
-                layers.Dense(128, activation="relu"),
-                layers.Dropout(0.3),
-                layers.Dense(num_classes, activation="softmax"),
-            ]
-        )
-        model.compile(
-            optimizer=optimizers.Adam(learning_rate=LEARNING_RATE),
-            loss="categorical_crossentropy",
-            metrics=["accuracy"],
-        )
-
-        # create training set
-        datagen = ImageDataGenerator(rescale=1.0 / 255, validation_split=0.2)
-        train_gen = datagen.flow_from_directory(
-            dataset_dir,
-            target_size=(224, 224),
-            batch_size=BATCH_SIZE,
-            class_mode="categorical",
-            subset="training",
-        )
-
-        # write labelmap
-        class_indices = train_gen.class_indices
-        index_to_class = {v: k for k, v in class_indices.items()}
-        sorted_classes = [index_to_class[i] for i in range(len(index_to_class))]
-        with open(os.path.join(model_dir, "labelmap.txt"), "w") as f:
-            for class_name in sorted_classes:
-                f.write(f"{class_name}\n")
-
-        # train the model
-        model.fit(train_gen, epochs=EPOCHS, verbose=0)
-
-        # convert model to tflite
-        converter = tf.lite.TFLiteConverter.from_keras_model(model)
-        converter.optimizations = [tf.lite.Optimize.DEFAULT]
-        converter.representative_dataset = (
-            self.__generate_representative_dataset_factory(dataset_dir)
-        )
-        converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
-        converter.inference_input_type = tf.uint8
-        converter.inference_output_type = tf.uint8
-        tflite_model = converter.convert()
-
-        # write model
-        with open(os.path.join(model_dir, "model.tflite"), "wb") as f:
-            f.write(tflite_model)
-
-        # write training metadata with image count
-        dataset_image_count = get_dataset_image_count(self.model_name)
-        write_training_metadata(self.model_name, dataset_image_count)
+        try:
+            # import in the function so that tensorflow is not initialized multiple times
+            import tensorflow as tf
+            from tensorflow.keras import layers, models, optimizers
+            from tensorflow.keras.applications import MobileNetV2
+            from tensorflow.keras.preprocessing.image import ImageDataGenerator
+
+            dataset_dir = os.path.join(CLIPS_DIR, self.model_name, "dataset")
+            model_dir = os.path.join(MODEL_CACHE_DIR, self.model_name)
+            os.makedirs(model_dir, exist_ok=True)
+
+            num_classes = len(
+                [
+                    d
+                    for d in os.listdir(dataset_dir)
+                    if os.path.isdir(os.path.join(dataset_dir, d))
+                ]
+            )
+
+            if num_classes < 2:
+                logger.error(
+                    f"Training failed for {self.model_name}: Need at least 2 classes, found {num_classes}"
+                )
+                return False
+
+            # Start with imagenet base model with 35% of channels in each layer
+            base_model = MobileNetV2(
+                input_shape=(224, 224, 3),
+                include_top=False,
+                weights="imagenet",
+                alpha=0.35,
+            )
+            base_model.trainable = False  # Freeze pre-trained layers
+
+            model = models.Sequential(
+                [
+                    base_model,
+                    layers.GlobalAveragePooling2D(),
+                    layers.Dense(128, activation="relu"),
+                    layers.Dropout(0.3),
+                    layers.Dense(num_classes, activation="softmax"),
+                ]
+            )
+            model.compile(
+                optimizer=optimizers.Adam(learning_rate=LEARNING_RATE),
+                loss="categorical_crossentropy",
+                metrics=["accuracy"],
+            )
+
+            # create training set
+            datagen = ImageDataGenerator(rescale=1.0 / 255, validation_split=0.2)
+            train_gen = datagen.flow_from_directory(
+                dataset_dir,
+                target_size=(224, 224),
+                batch_size=BATCH_SIZE,
+                class_mode="categorical",
+                subset="training",
+            )
+
+            total_images = train_gen.samples
+            logger.debug(
+                f"Training {self.model_name}: {total_images} images across {num_classes} classes"
+            )
+
+            # write labelmap
+            class_indices = train_gen.class_indices
+            index_to_class = {v: k for k, v in class_indices.items()}
+            sorted_classes = [index_to_class[i] for i in range(len(index_to_class))]
+            with open(os.path.join(model_dir, "labelmap.txt"), "w") as f:
+                for class_name in sorted_classes:
+                    f.write(f"{class_name}\n")
+
+            # train the model
+            logger.debug(f"Training {self.model_name} for {EPOCHS} epochs...")
+            model.fit(train_gen, epochs=EPOCHS, verbose=0)
+
+            logger.debug(f"Converting {self.model_name} to TFLite...")
+
+            # convert model to tflite
+            converter = tf.lite.TFLiteConverter.from_keras_model(model)
+            converter.optimizations = [tf.lite.Optimize.DEFAULT]
+            converter.representative_dataset = (
+                self.__generate_representative_dataset_factory(dataset_dir)
+            )
+            converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
+            converter.inference_input_type = tf.uint8
+            converter.inference_output_type = tf.uint8
+            tflite_model = converter.convert()
+
+            # write model
+            model_path = os.path.join(model_dir, "model.tflite")
+            with open(model_path, "wb") as f:
+                f.write(tflite_model)
+
+            # verify model file was written successfully
+            if not os.path.exists(model_path) or os.path.getsize(model_path) == 0:
+                logger.error(
+                    f"Training failed for {self.model_name}: Model file was not created or is empty"
+                )
+                return False
+
+            # write training metadata with image count
+            dataset_image_count = get_dataset_image_count(self.model_name)
+            write_training_metadata(self.model_name, dataset_image_count)
+
+            logger.info(f"Finished training {self.model_name}")
+            return True
+        except Exception as e:
+            logger.error(f"Training failed for {self.model_name}: {e}", exc_info=True)
+            return False
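
Note: two of the guards added above are simple enough to factor out and test in isolation. A minimal sketch, with hypothetical helper names (count_classes, artifact_is_valid) that are not part of the Frigate codebase:

import os


def count_classes(dataset_dir: str) -> int:
    # flow_from_directory treats each subdirectory as one class, so the
    # class count is just the number of subdirectories
    return sum(
        os.path.isdir(os.path.join(dataset_dir, d)) for d in os.listdir(dataset_dir)
    )


def artifact_is_valid(model_path: str) -> bool:
    # a zero-byte file indicates the conversion or write failed partway
    return os.path.exists(model_path) and os.path.getsize(model_path) > 0

Fewer than two class subdirectories leaves categorical cross-entropy with nothing to separate, and an empty model.tflite would otherwise be reported as a successful run.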
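
Note: the INT8 conversion above relies on self.__generate_representative_dataset_factory(dataset_dir), which the hunk references but does not show. Full-integer quantization needs a callable that yields sample inputs so the converter can calibrate activation ranges; below is a hedged sketch of what such a factory typically looks like, not Frigate's actual implementation, which may sample and preprocess differently:

import os

import numpy as np
from PIL import Image


def make_representative_dataset(dataset_dir: str, max_samples: int = 100):
    # gather a small sample of training images from the dataset tree
    paths = [
        os.path.join(root, name)
        for root, _dirs, files in os.walk(dataset_dir)
        for name in files
        if name.lower().endswith((".jpg", ".jpeg", ".png"))
    ][:max_samples]

    def generator():
        for path in paths:
            img = Image.open(path).convert("RGB").resize((224, 224))
            arr = np.asarray(img, dtype=np.float32) / 255.0
            # the converter expects each yield to be a list of input
            # arrays with a leading batch dimension
            yield [arr[np.newaxis, ...]]

    return generator

Whatever the real factory does, its preprocessing must match training: here a resize to 224x224 and a 1/255 rescale, mirroring the ImageDataGenerator settings above.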
@@ -257,18 +286,36 @@ def kickoff_model_training(
     training_process.start()
     training_process.join()
 
-    # reload model and mark training as complete
-    embeddingRequestor.send_data(
-        EmbeddingsRequestEnum.reload_classification_model.value,
-        {"model_name": model_name},
-    )
-    requestor.send_data(
-        UPDATE_MODEL_STATE,
-        {
-            "model": model_name,
-            "state": ModelStatusTypesEnum.complete,
-        },
-    )
+    # check if training succeeded by examining the exit code
+    training_success = training_process.exitcode == 0
+
+    if training_success:
+        # reload model and mark training as complete
+        embeddingRequestor.send_data(
+            EmbeddingsRequestEnum.reload_classification_model.value,
+            {"model_name": model_name},
+        )
+        requestor.send_data(
+            UPDATE_MODEL_STATE,
+            {
+                "model": model_name,
+                "state": ModelStatusTypesEnum.complete,
+            },
+        )
+    else:
+        logger.error(
+            f"Training subprocess failed for {model_name} (exit code: {training_process.exitcode})"
+        )
+        # mark training as complete (not failed) so UI doesn't stay in training state
+        # but don't reload the model since it failed
+        requestor.send_data(
+            UPDATE_MODEL_STATE,
+            {
+                "model": model_name,
+                "state": ModelStatusTypesEnum.complete,
+            },
+        )
 
     requestor.stop()
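
Note: the parent-side logic reduces to a small supervision pattern. In the sketch below, reload_model and mark_complete are hypothetical callbacks standing in for the embeddingRequestor.send_data and requestor.send_data calls above:

import multiprocessing as mp
from typing import Callable


def supervise_training(
    proc: mp.Process,
    reload_model: Callable[[], None],
    mark_complete: Callable[[], None],
) -> bool:
    proc.start()
    proc.join()
    success = proc.exitcode == 0
    if success:
        # only a successfully written model is safe to hot-reload
        reload_model()
    # state goes to "complete" either way so the UI leaves the training
    # spinner; a failed run simply skips the reload
    mark_complete()
    return success

Marking the state complete even on failure is the deliberate choice called out in the diff comments: the UI would otherwise stay in the training state indefinitely, while skipping the reload keeps the previous working model loaded.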