Add error handling for training

Nicolas Mowen 2025-11-09 13:34:39 -07:00
parent 292d024aac
commit 6c47a131e4


@@ -130,7 +130,8 @@ class ClassificationTrainingProcess(FrigateProcess):
     def run(self) -> None:
         self.pre_run_setup()
-        self.__train_classification_model()
+        success = self.__train_classification_model()
+        exit(0 if success else 1)
 
     def __generate_representative_dataset_factory(self, dataset_dir: str):
         def generate_representative_dataset():
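
Note: the hunk above makes the training subprocess report failure through its exit status. The sketch below isolates that pattern, using a plain multiprocessing.Process in place of Frigate's FrigateProcess wrapper (an assumption for illustration; train() and run() here are hypothetical stand-ins for the real methods):

import multiprocessing as mp
import sys


def train() -> bool:
    # stand-in for __train_classification_model(); the real method
    # returns False when training fails
    return True


def run() -> None:
    success = train()
    # a plain return would always produce exitcode 0; exiting explicitly
    # lets the parent distinguish success from a handled failure
    sys.exit(0 if success else 1)


if __name__ == "__main__":
    proc = mp.Process(target=run)
    proc.start()
    proc.join()
    # exitcode: 0 on success, 1 on handled failure, negative if killed by a signal
    print("training succeeded:", proc.exitcode == 0)

Exiting explicitly (sys.exit here, the exit builtin in the diff) is what lets a boolean result survive the process boundary.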
@@ -153,89 +154,117 @@ class ClassificationTrainingProcess(FrigateProcess):
     @redirect_output_to_logger(logger, logging.DEBUG)
     def __train_classification_model(self) -> bool:
         """Train a classification model."""
-        # import in the function so that tensorflow is not initialized multiple times
-        import tensorflow as tf
-        from tensorflow.keras import layers, models, optimizers
-        from tensorflow.keras.applications import MobileNetV2
-        from tensorflow.keras.preprocessing.image import ImageDataGenerator
-
-        logger.info(f"Kicking off classification training for {self.model_name}.")
-        dataset_dir = os.path.join(CLIPS_DIR, self.model_name, "dataset")
-        model_dir = os.path.join(MODEL_CACHE_DIR, self.model_name)
-        os.makedirs(model_dir, exist_ok=True)
-
-        num_classes = len(
-            [
-                d
-                for d in os.listdir(dataset_dir)
-                if os.path.isdir(os.path.join(dataset_dir, d))
-            ]
-        )
-
-        # Start with imagenet base model with 35% of channels in each layer
-        base_model = MobileNetV2(
-            input_shape=(224, 224, 3),
-            include_top=False,
-            weights="imagenet",
-            alpha=0.35,
-        )
-        base_model.trainable = False  # Freeze pre-trained layers
-
-        model = models.Sequential(
-            [
-                base_model,
-                layers.GlobalAveragePooling2D(),
-                layers.Dense(128, activation="relu"),
-                layers.Dropout(0.3),
-                layers.Dense(num_classes, activation="softmax"),
-            ]
-        )
-        model.compile(
-            optimizer=optimizers.Adam(learning_rate=LEARNING_RATE),
-            loss="categorical_crossentropy",
-            metrics=["accuracy"],
-        )
-
-        # create training set
-        datagen = ImageDataGenerator(rescale=1.0 / 255, validation_split=0.2)
-        train_gen = datagen.flow_from_directory(
-            dataset_dir,
-            target_size=(224, 224),
-            batch_size=BATCH_SIZE,
-            class_mode="categorical",
-            subset="training",
-        )
-
-        # write labelmap
-        class_indices = train_gen.class_indices
-        index_to_class = {v: k for k, v in class_indices.items()}
-        sorted_classes = [index_to_class[i] for i in range(len(index_to_class))]
-        with open(os.path.join(model_dir, "labelmap.txt"), "w") as f:
-            for class_name in sorted_classes:
-                f.write(f"{class_name}\n")
-
-        # train the model
-        model.fit(train_gen, epochs=EPOCHS, verbose=0)
-
-        # convert model to tflite
-        converter = tf.lite.TFLiteConverter.from_keras_model(model)
-        converter.optimizations = [tf.lite.Optimize.DEFAULT]
-        converter.representative_dataset = (
-            self.__generate_representative_dataset_factory(dataset_dir)
-        )
-        converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
-        converter.inference_input_type = tf.uint8
-        converter.inference_output_type = tf.uint8
-        tflite_model = converter.convert()
-
-        # write model
-        with open(os.path.join(model_dir, "model.tflite"), "wb") as f:
-            f.write(tflite_model)
-
-        # write training metadata with image count
-        dataset_image_count = get_dataset_image_count(self.model_name)
-        write_training_metadata(self.model_name, dataset_image_count)
+        try:
+            # import in the function so that tensorflow is not initialized multiple times
+            import tensorflow as tf
+            from tensorflow.keras import layers, models, optimizers
+            from tensorflow.keras.applications import MobileNetV2
+            from tensorflow.keras.preprocessing.image import ImageDataGenerator
+
+            dataset_dir = os.path.join(CLIPS_DIR, self.model_name, "dataset")
+            model_dir = os.path.join(MODEL_CACHE_DIR, self.model_name)
+            os.makedirs(model_dir, exist_ok=True)
+
+            num_classes = len(
+                [
+                    d
+                    for d in os.listdir(dataset_dir)
+                    if os.path.isdir(os.path.join(dataset_dir, d))
+                ]
+            )
+
+            if num_classes < 2:
+                logger.error(
+                    f"Training failed for {self.model_name}: Need at least 2 classes, found {num_classes}"
+                )
+                return False
+
+            # Start with imagenet base model with 35% of channels in each layer
+            base_model = MobileNetV2(
+                input_shape=(224, 224, 3),
+                include_top=False,
+                weights="imagenet",
+                alpha=0.35,
+            )
+            base_model.trainable = False  # Freeze pre-trained layers
+
+            model = models.Sequential(
+                [
+                    base_model,
+                    layers.GlobalAveragePooling2D(),
+                    layers.Dense(128, activation="relu"),
+                    layers.Dropout(0.3),
+                    layers.Dense(num_classes, activation="softmax"),
+                ]
+            )
+            model.compile(
+                optimizer=optimizers.Adam(learning_rate=LEARNING_RATE),
+                loss="categorical_crossentropy",
+                metrics=["accuracy"],
+            )
+
+            # create training set
+            datagen = ImageDataGenerator(rescale=1.0 / 255, validation_split=0.2)
+            train_gen = datagen.flow_from_directory(
+                dataset_dir,
+                target_size=(224, 224),
+                batch_size=BATCH_SIZE,
+                class_mode="categorical",
+                subset="training",
+            )
+
+            total_images = train_gen.samples
+            logger.debug(
+                f"Training {self.model_name}: {total_images} images across {num_classes} classes"
+            )
+
+            # write labelmap
+            class_indices = train_gen.class_indices
+            index_to_class = {v: k for k, v in class_indices.items()}
+            sorted_classes = [index_to_class[i] for i in range(len(index_to_class))]
+            with open(os.path.join(model_dir, "labelmap.txt"), "w") as f:
+                for class_name in sorted_classes:
+                    f.write(f"{class_name}\n")
+
+            # train the model
+            logger.debug(f"Training {self.model_name} for {EPOCHS} epochs...")
+            model.fit(train_gen, epochs=EPOCHS, verbose=0)
+
+            logger.debug(f"Converting {self.model_name} to TFLite...")
+
+            # convert model to tflite
+            converter = tf.lite.TFLiteConverter.from_keras_model(model)
+            converter.optimizations = [tf.lite.Optimize.DEFAULT]
+            converter.representative_dataset = (
+                self.__generate_representative_dataset_factory(dataset_dir)
+            )
+            converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
+            converter.inference_input_type = tf.uint8
+            converter.inference_output_type = tf.uint8
+            tflite_model = converter.convert()
+
+            # write model
+            model_path = os.path.join(model_dir, "model.tflite")
+            with open(model_path, "wb") as f:
+                f.write(tflite_model)
+
+            # verify model file was written successfully
+            if not os.path.exists(model_path) or os.path.getsize(model_path) == 0:
+                logger.error(
+                    f"Training failed for {self.model_name}: Model file was not created or is empty"
+                )
+                return False
+
+            # write training metadata with image count
+            dataset_image_count = get_dataset_image_count(self.model_name)
+            write_training_metadata(self.model_name, dataset_image_count)
+
+            logger.info(f"Finished training {self.model_name}")
+            return True
+        except Exception as e:
+            logger.error(f"Training failed for {self.model_name}: {e}", exc_info=True)
+            return False
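
Note: two of the guards added above are simple enough to factor out and test in isolation. A minimal sketch, with hypothetical helper names (count_classes, artifact_is_valid) that are not part of the Frigate codebase:

import os


def count_classes(dataset_dir: str) -> int:
    # flow_from_directory treats each subdirectory as one class, so the
    # class count is just the number of subdirectories
    return sum(
        os.path.isdir(os.path.join(dataset_dir, d)) for d in os.listdir(dataset_dir)
    )


def artifact_is_valid(model_path: str) -> bool:
    # a zero-byte file indicates the conversion or write failed partway
    return os.path.exists(model_path) and os.path.getsize(model_path) > 0

Fewer than two class subdirectories leaves categorical cross-entropy with nothing to separate, and an empty model.tflite would otherwise be reported as a successful run.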
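
Note: the INT8 conversion above relies on self.__generate_representative_dataset_factory(dataset_dir), which the hunk references but does not show. Full-integer quantization needs a callable that yields sample inputs so the converter can calibrate activation ranges; below is a hedged sketch of what such a factory typically looks like, not Frigate's actual implementation, which may sample and preprocess differently:

import os

import numpy as np
from PIL import Image


def make_representative_dataset(dataset_dir: str, max_samples: int = 100):
    # gather a small sample of training images from the dataset tree
    paths = [
        os.path.join(root, name)
        for root, _dirs, files in os.walk(dataset_dir)
        for name in files
        if name.lower().endswith((".jpg", ".jpeg", ".png"))
    ][:max_samples]

    def generator():
        for path in paths:
            img = Image.open(path).convert("RGB").resize((224, 224))
            arr = np.asarray(img, dtype=np.float32) / 255.0
            # the converter expects each yield to be a list of input
            # arrays with a leading batch dimension
            yield [arr[np.newaxis, ...]]

    return generator

Whatever the real factory does, its preprocessing must match training: here a resize to 224x224 and a 1/255 rescale, mirroring the ImageDataGenerator settings above.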
@@ -257,18 +286,36 @@ def kickoff_model_training(
     training_process.start()
     training_process.join()
 
-    # reload model and mark training as complete
-    embeddingRequestor.send_data(
-        EmbeddingsRequestEnum.reload_classification_model.value,
-        {"model_name": model_name},
-    )
-    requestor.send_data(
-        UPDATE_MODEL_STATE,
-        {
-            "model": model_name,
-            "state": ModelStatusTypesEnum.complete,
-        },
-    )
+    # check if training succeeded by examining the exit code
+    training_success = training_process.exitcode == 0
+
+    if training_success:
+        # reload model and mark training as complete
+        embeddingRequestor.send_data(
+            EmbeddingsRequestEnum.reload_classification_model.value,
+            {"model_name": model_name},
+        )
+        requestor.send_data(
+            UPDATE_MODEL_STATE,
+            {
+                "model": model_name,
+                "state": ModelStatusTypesEnum.complete,
+            },
+        )
+    else:
+        logger.error(
+            f"Training subprocess failed for {model_name} (exit code: {training_process.exitcode})"
+        )
+        # mark training as complete (not failed) so UI doesn't stay in training state
+        # but don't reload the model since it failed
+        requestor.send_data(
+            UPDATE_MODEL_STATE,
+            {
+                "model": model_name,
+                "state": ModelStatusTypesEnum.complete,
+            },
+        )
 
     requestor.stop()
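
Note: the parent-side logic reduces to a small supervision pattern. In the sketch below, reload_model and mark_complete are hypothetical callbacks standing in for the embeddingRequestor.send_data and requestor.send_data calls above:

import multiprocessing as mp
from typing import Callable


def supervise_training(
    proc: mp.Process,
    reload_model: Callable[[], None],
    mark_complete: Callable[[], None],
) -> bool:
    proc.start()
    proc.join()
    success = proc.exitcode == 0
    if success:
        # only a successfully written model is safe to hot-reload
        reload_model()
    # state goes to "complete" either way so the UI leaves the training
    # spinner; a failed run simply skips the reload
    mark_complete()
    return success

Marking the state complete even on failure is the deliberate choice called out in the diff comments: the UI would otherwise stay in the training state indefinitely, while skipping the reload keeps the previous working model loaded.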