
Behav classifier

behavysis.behav_classifier.behav_classifier.BehavClassifier

The BehavClassifier abstract class performs behaviour classifier model preparation, training, saving, evaluation, and inference.
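
A minimal usage sketch, assuming a Behavysis project at "/path/to/project" with a scored behaviour named "fight" (both names are illustrative):

from behavysis.behav_classifier.behav_classifier import BehavClassifier

# Make a BehavClassifier for every behaviour scored in the project
models = BehavClassifier.create_from_project_dir("/path/to/project")

# Or load the model for a single behaviour and train it
model = BehavClassifier.load("/path/to/project", "fight")
model.pipeline_training()

# Run inference on an unprocessed features dataframe
# (a FeaturesDf-style dataframe, e.g. from the project's features_extracted directory)
x_df = ...
pred_df = model.pipeline_inference(x_df)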

Source code in behavysis/behav_classifier/behav_classifier.py
class BehavClassifier:
    """
    BehavClassifier abstract class performs behav classifier model preparation, training, saving,
    evaluation, and inference.
    """

    logger = init_logger_file()

    _proj_dir: str
    _behav_name: str
    _clf: BaseTorchModel

    def __init__(self, proj_dir: str, behav_name: str) -> None:
        # Setting attributes
        self._proj_dir = os.path.abspath(proj_dir)
        self._behav_name = behav_name
        # Assert that the behaviour is scored in the project (in the scored_behavs directory)
        # Getting the list of behaviours in project to check against
        y_df = self.wrangle_columns_y(self.combine_dfs(self.y_dir))
        assert np.isin(behav_name, y_df.columns)
        # Trying to load configs (or making new)
        try:
            configs = BehavClassifierConfigs.read_json(self.configs_fp)
            self.logger.debug("Loaded existing configs")
        except FileNotFoundError:
            configs = BehavClassifierConfigs()
            self.logger.debug("Made new model configs")
        # Setting and saving configs
        configs.proj_dir = self._proj_dir
        configs.behav_name = self._behav_name
        self.configs = configs
        # Trying to load classifier (or making new)
        try:
            self.clf = joblib_load(self.clf_fp)
            self.logger.debug("Loaded existing classifier")
        except FileNotFoundError:
            self.clf = CNN1()
            self.logger.debug("Made new classifier")

    #################################################
    #            GETTER AND SETTERS
    #################################################

    @property
    def proj_dir(self) -> str:
        return self._proj_dir

    @property
    def behav_name(self) -> str:
        return self._behav_name

    @property
    def clf(self) -> BaseTorchModel:
        return self._clf

    @clf.setter
    def clf(self, clf: BaseTorchModel | str) -> None:
        # If a str, then loading
        if isinstance(clf, str):
            clf_name = clf
            self._clf = joblib_load(os.path.join(self.clfs_dir, clf, "classifier.sav"))
            self.logger.debug(f"Loaded classifier: {clf_name}")
        # If a BaseTorchModel, then setting
        else:
            clf_name = type(clf).__name__
            self._clf = clf
            self.logger.debug(f"Initialised classifier: {clf_name}")
        # Updating in configs
        configs = self.configs
        configs.clf_struct = clf_name
        self.configs = configs

    @property
    def model_dir(self) -> str:
        return os.path.join(self.proj_dir, "behav_models", self.behav_name)

    @property
    def configs_fp(self) -> str:
        return os.path.join(self.model_dir, "configs.json")

    @property
    def configs(self) -> BehavClassifierConfigs:
        return BehavClassifierConfigs.read_json(self.configs_fp)

    @configs.setter
    def configs(self, configs: BehavClassifierConfigs) -> None:
        try:
            if self.configs == configs:
                return
        except FileNotFoundError:
            pass
        self.logger.debug("Configs have changed. Updating model configs on disk")
        configs.write_json(self.configs_fp)

    @property
    def clfs_dir(self) -> str:
        return os.path.join(self.model_dir, "classifiers")

    @property
    def clf_dir(self) -> str:
        return os.path.join(self.clfs_dir, self.configs.clf_struct)

    @property
    def clf_fp(self) -> str:
        return os.path.join(self.clf_dir, "classifier.sav")

    @property
    def preproc_fp(self) -> str:
        return os.path.join(self.clf_dir, "preproc.sav")

    @property
    def eval_dir(self) -> str:
        return os.path.join(self.clf_dir, "evaluation")

    @property
    def x_dir(self) -> str:
        """
        Returns the model's x directory.
        It gets the features_extracted directory from the parent Behavysis project directory.
        """
        return os.path.join(self.proj_dir, Folders.FEATURES_EXTRACTED.value)

    @property
    def y_dir(self) -> str:
        """
        Returns the model's y directory.
        It gets the scored_behavs directory from the parent Behavysis project directory.
        """
        return os.path.join(self.proj_dir, Folders.SCORED_BEHAVS.value)

    #################################################
    # CREATE/LOAD MODEL METHODS
    #################################################

    @classmethod
    def create_from_project_dir(cls, proj_dir: str) -> list:
        """
        Creates a BehavClassifier for each behaviour scored in the given Behavysis project directory.
        """
        # Getting the list of behaviours (after wrangling column names)
        y_df = cls.wrangle_columns_y(cls.combine_dfs(os.path.join(proj_dir, Folders.SCORED_BEHAVS.value)))
        behavs_ls = y_df.columns.to_list()
        # For each behaviour, making a new BehavClassifier instance
        models_ls = [cls(proj_dir, behav) for behav in behavs_ls]
        return models_ls

    @classmethod
    def create_from_project(cls, proj: Project) -> list[BehavClassifier]:
        """
        Creates a BehavClassifier for each behaviour scored in the given Behavysis project instance.
        Wraps the `create_from_project_dir` method.
        """
        return cls.create_from_project_dir(proj.root_dir)

    @classmethod
    def load(cls, proj_dir: str, behav_name: str) -> BehavClassifier:
        """
        Reads the model from the expected model file.
        """
        # Checking that the configs file exists and is valid
        configs_fp = os.path.join(proj_dir, "behav_models", behav_name, "configs.json")
        try:
            BehavClassifierConfigs.read_json(configs_fp)
        except (FileNotFoundError, OSError):
            raise ValueError(
                f'Model in project directory, "{proj_dir}", and behav name, "{behav_name}", not found.\n'
                "Please check file path."
            )
        return cls(proj_dir, behav_name)

    ###############################################################################################
    #            COMBINING DFS TO SINGLE DF
    ###############################################################################################

    @classmethod
    def combine_dfs(cls, src_dir):
        """
        Combines the data in the given directory into a single dataframe.
        Adds a MultiIndex level to the rows, with the values as the filenames in the directory.
        """
        data_dict = {get_name(i): DFMixin.read(os.path.join(src_dir, i)) for i in os.listdir(os.path.join(src_dir))}
        df = pd.concat(data_dict.values(), axis=0, keys=data_dict.keys())
        df = BehavClassifierCombinedDf.basic_clean(df)
        return df

    ###############################################################################################
    #            PREPROCESSING DFS
    ###############################################################################################

    @staticmethod
    def _preproc_x_fit_select_cols(x: np.ndarray) -> np.ndarray:
        """
        Selects only the derived features (not the x-y-l columns).

        Used in the preprocessing pipeline.
        """
        return x[:, 48:]

    @classmethod
    def preproc_x_fit(cls, x: np.ndarray, preproc_fp: str) -> None:
        """
        The preprocessing steps are:
        - Select only the derived features (not the x-y-l columns)
            - 2 (indivs) * 8 (bpts) * 3 (coords) = 48 (columns) before derived features
        - MinMax scaling (using previously fitted MinMaxScaler)
        """
        preproc_pipe = Pipeline(
            steps=[
                ("select_columns", FunctionTransformer(cls._preproc_x_fit_select_cols)),
                ("min_max_scaler", MinMaxScaler()),
            ]
        )
        preproc_pipe.fit(x)
        joblib_dump(preproc_pipe, preproc_fp)

    @classmethod
    def preproc_x_transform(cls, x: np.ndarray, preproc_fp: str) -> np.ndarray:
        """
        Runs the preprocessing steps fitted from `preproc_x_fit` on the given `x` data.
        """
        preproc_pipe: Pipeline = joblib_load(preproc_fp)
        x_preproc = preproc_pipe.transform(x)
        return x_preproc

    @classmethod
    def wrangle_columns_y(cls, y: pd.DataFrame) -> pd.DataFrame:
        """
        Filters the `y` dataframe to exclude the predicted (`pred`) outcome columns, and renames
        the remaining columns from `(behav, outcome)` to `{behav}__{outcome}` (or just `{behav}`
        for the `actual` outcome).
        """
        # Filtering out the pred columns (in the `outcomes` level)
        columns_filter = np.isin(
            y.columns.get_level_values(BehavScoredDf.CN.OUTCOMES.value),
            [BehavScoredDf.OutcomesCols.PRED.value],
            invert=True,
        )
        y = y.loc[:, columns_filter]
        # Setting the column names from `(behav, outcome)` to `{behav}__{outcome}`
        y.columns = [
            f"{behav_name}"
            if outcome_name == BehavScoredDf.OutcomesCols.ACTUAL.value
            else f"{behav_name}__{outcome_name}"
            for behav_name, outcome_name in y.columns
        ]
        return y

    @classmethod
    def oversample(cls, x: np.ndarray, y: np.ndarray, ratio: float) -> np.ndarray:
        """
        Oversamples the rows of `x` where `y` is the behaviour value, so that the
        behaviour:non-behaviour ratio becomes `ratio`.
        """
        assert x.shape[0] == y.shape[0]
        # Getting index
        index = np.arange(y.shape[0])
        # Getting indices where y is True
        t = index[y == BehavValues.BEHAV.value]
        # Getting indices where y is False
        f = index[y == BehavValues.NON_BEHAV.value]
        # Getting intended size (as t_len / f_len = ratio)
        new_t_size = int(np.round(f.shape[0] * ratio))
        # Oversampling the True indices
        t = np.random.choice(t, size=new_t_size, replace=True)
        # Combining the True and False indices
        new_index = np.concatenate([t, f])
        # Returning the resampled x
        return x[new_index]

    @classmethod
    def undersample(cls, x: np.ndarray, y: np.ndarray, ratio: float) -> np.ndarray:
        """
        Undersamples the rows of `x` where `y` is the non-behaviour value, so that the
        behaviour:non-behaviour ratio becomes `ratio`.
        """
        assert x.shape[0] == y.shape[0]
        # Getting index
        index = np.arange(y.shape[0])
        # Getting indices where y is True
        t = index[y == BehavValues.BEHAV.value]
        # Getting indices where y is False
        f = index[y == BehavValues.NON_BEHAV.value]
        # Getting intended size (as t_len / f_len = ratio)
        new_f_size = int(np.round(t.shape[0] / ratio))
        # Undersampling the False indices
        f = np.random.choice(f, size=new_f_size, replace=False)
        # Combining the True and False indices
        new_index = np.concatenate([t, f])
        # Returning the resampled x
        return x[new_index]

    #################################################
    #            PIPELINE FOR DATA PREP
    #################################################

    def preproc_training(
        self,
    ) -> tuple[list[np.ndarray], list[np.ndarray], list[np.ndarray], list[np.ndarray]]:
        """
        Prepares the data for the training pipeline.

        Performs the following:
        - Combines dfs from the x and y directories (individual experiment data).
        - Ensures the x and y dfs have the same index and are in the same row order.
        - Preprocesses the x dfs. Refer to `preproc_x_fit` and `preproc_x_transform` for details.
        - Selects the y behaviour column (given in the configs file) from each y df and replaces
          undetermined values with non-behaviour values.
        - Splits into training and test indexes.
            - The training indexes are over- and under-sampled to the ratios given in the configs.

        Returns
        -------
        A tuple containing four lists of numpy arrays:
        - x_ls: list of each dataframe's input data.
        - y_ls: list of each dataframe's target labels.
        - index_train_ls: list of each dataframe's indexes for the training data.
        - index_test_ls: list of each dataframe's indexes for the testing data.
        """
        # Getting the lists of x and y dfs
        x_fp_ls = [os.path.join(self.x_dir, i) for i in os.listdir(os.path.join(self.x_dir))]
        y_fp_ls = [os.path.join(self.y_dir, i) for i in os.listdir(os.path.join(self.y_dir))]
        x_df_ls = async_read_files_run(x_fp_ls, FeaturesDf.read)
        y_df_ls = async_read_files_run(y_fp_ls, BehavScoredDf.read)
        # Formatting y dfs (selecting column and replacing UNDETERMINED with NON_BEHAV values)
        y_df_ls = [
            y[(self.configs.behav_name, BehavScoredDf.OutcomesCols.ACTUAL.value)].replace(
                BehavValues.UNDETERMINED.value, BehavValues.NON_BEHAV.value
            )
            for y in y_df_ls
        ]
        # Ensuring x and y dfs have the same index and are in the same row order
        index_df_ls = [x.index.intersection(y.index) for x, y in zip(x_df_ls, y_df_ls)]
        x_df_ls = [x.loc[index] for x, index in zip(x_df_ls, index_df_ls)]
        y_df_ls = [y.loc[index] for y, index in zip(y_df_ls, index_df_ls)]
        assert np.all([x.shape[0] == y.shape[0] for x, y in zip(x_df_ls, y_df_ls)])
        # Converting to numpy arrays
        x_ls = [x.values for x in x_df_ls]
        y_ls = [y.values for y in y_df_ls]
        index_ls = [np.arange(x.shape[0]) for x in x_ls]
        # x preprocessing: fitting (across all x dfs) and transforming (for each x df)
        self.preproc_x_fit(np.concatenate(x_ls, axis=0), self.preproc_fp)
        x_ls = [self.preproc_x_transform(x, self.preproc_fp) for x in x_ls]
        # Making a 2D array of (df_index, index, y) for train-test splitting, stratifying and sampling
        index_flat = listofvects2array(index_ls, y_ls)
        # Splitting into train and test indexes
        index_train_flat, index_test_flat = train_test_split(
            index_flat,
            test_size=self.configs.test_split,
            stratify=index_flat[:, 2],
        )
        # Oversampling and undersampling ONLY on training data
        index_train_flat = self.oversample(index_train_flat, index_train_flat[:, 2], self.configs.oversample_ratio)
        index_train_flat = self.undersample(index_train_flat, index_train_flat[:, 2], self.configs.undersample_ratio)
        # Reshaping back to individual df index lists
        index_train_ls = array2listofvect(index_train_flat, 1)
        index_test_ls = array2listofvect(index_test_flat, 1)
        return x_ls, y_ls, index_train_ls, index_test_ls

    #################################################
    # PIPELINE FOR CLASSIFIER TRAINING AND INFERENCE
    #################################################

    def pipeline_training(self) -> None:
        """
        Trains the classifier and saves it (together with its evaluation) to the classifier's directory.

        The classifier used is the currently set `clf` model (e.g. one of the `ClfTemplates`).
        """
        self.logger.info(f"Training {self.configs.clf_struct}")
        # Preparing data
        x_ls, y_ls, index_train_ls, index_test_ls = self.preproc_training()
        # Training the model
        history = self.clf.fit(
            x_ls=x_ls,
            y_ls=y_ls,
            index_ls=index_train_ls,
            batch_size=self.configs.batch_size,
            epochs=self.configs.epochs,
            val_split=self.configs.val_split,
        )
        # Saving history
        self.clf_eval_save_history(history)
        # Evaluating on train and test data
        self.clf_eval_save_performance(x_ls, y_ls, index_train_ls, "train")
        self.clf_eval_save_performance(x_ls, y_ls, index_test_ls, "test")
        # Saving model
        joblib_dump(self.clf, self.clf_fp)

    def pipeline_training_all(self):
        """
        Trains a classifier for each of the available templates.
        """
        # Saving existing clf
        clf = self.clf
        for clf_cls in CLF_TEMPLATES:
            # Initialising the model
            self.clf = clf_cls()
            # Building pipeline, which runs and saves evaluation
            self.pipeline_training()
        # Restoring clf
        self.clf = clf

    def pipeline_inference(self, x_df: pd.DataFrame) -> pd.DataFrame:
        """
        Given the unprocessed features dataframe, runs the model pipeline to make predictions.

        Pipeline is:
        - Preprocesses the `x` df. Refer to
        [behavysis.behav_classifier.BehavClassifier.preproc_x_transform][] for details.
        - Makes predictions and returns the predicted behaviours.
        """
        index = x_df.index
        # Preprocessing features
        x = self.preproc_x_transform(x_df.values, self.preproc_fp)
        # Loading the model
        self.clf = joblib_load(self.clf_fp)
        # Getting probabilities
        y_prob = self.clf.predict(
            x=x,
            index=np.arange(x.shape[0]),
            batch_size=self.configs.batch_size,
        )
        # Making predictions from probabilities (and pcutoff)
        y_pred = (y_prob > self.configs.pcutoff).astype(int)
        # Making df
        pred_df = BehavPredictedDf.init_df(pd.Series(index))
        pred_df[(self.configs.behav_name, BehavPredictedDf.OutcomesCols.PROB.value)] = y_prob
        pred_df[(self.configs.behav_name, BehavPredictedDf.OutcomesCols.PRED.value)] = y_pred
        return pred_df

    #################################################
    # COMPREHENSIVE EVALUATION FUNCTIONS
    #################################################

    def clf_eval_save_history(self, history: pd.DataFrame):
        # Saving history df
        DFMixin.write(history, os.path.join(self.eval_dir, f"history.{DFMixin.IO}"))
        # Making and saving history figure
        fig, ax = plt.subplots(figsize=(10, 7))
        sns.lineplot(data=history, ax=ax)
        fig.savefig(os.path.join(self.eval_dir, "history.png"))

    def clf_eval_save_performance(
        self,
        x_ls: list[np.ndarray],
        y_ls: list[np.ndarray],
        index_ls: list[np.ndarray],
        name: str,
    ) -> tuple[pd.DataFrame, dict, Figure, Figure, Figure]:
        """
        Evaluates the classifier performance on the given x and y data.
        Saves the evaluation dataframe, classification report, and figures to the model's evaluation directory.

        Returns
        -------
        y_eval : pd.DataFrame
            Predicted behaviour classifications against the true labels.
        report_dict : dict
            Classification report of performance metrics.
        metrics_fig : mpl.Figure
            Figure showing the confusion matrix.
        pcutoffs_fig : mpl.Figure
            Figure showing the precision, recall, f1, and accuracy for different pcutoffs.
        logc_fig : mpl.Figure
            Figure showing the logistic curve for different predicted probabilities.
        """
        # Getting predictions
        y_true_ls = [y[index] for y, index in zip(y_ls, index_ls)]
        y_prob_ls = [
            self.clf.predict(x=x, index=index, batch_size=self.configs.batch_size) for x, index in zip(x_ls, index_ls)
        ]
        # Making eval vects
        y_true = np.concatenate(y_true_ls)
        y_prob = np.concatenate(y_prob_ls)
        y_pred = (y_prob > self.configs.pcutoff).astype(int)
        # Making eval_df
        eval_df = BehavPredictedDf.init_df(pd.Series(np.arange(np.concatenate(index_ls).shape[0])))
        eval_df[(self.configs.behav_name, BehavPredictedDf.OutcomesCols.PROB.value)] = y_prob
        eval_df[(self.configs.behav_name, BehavPredictedDf.OutcomesCols.PRED.value)] = y_pred
        eval_df[(self.configs.behav_name, BehavScoredDf.OutcomesCols.ACTUAL.value)] = y_true
        # Making classification report
        report_dict = self.eval_report(y_true, y_pred)
        # Making confusion matrix figure
        metrics_fig = self.eval_conf_matr(y_true, y_pred)
        # Making performance for different pcutoffs figure
        pcutoffs_fig = self.eval_metrics_pcutoffs(y_true, y_prob)
        # Logistic curve
        logc_fig = self.eval_logc(y_true, y_prob)
        # Saving data and figures
        BehavClassifierEvalDf.write(eval_df, os.path.join(self.eval_dir, f"{name}_eval.{BehavClassifierEvalDf.IO}"))
        write_json(os.path.join(self.eval_dir, f"{name}_report.json"), report_dict)
        metrics_fig.savefig(os.path.join(self.eval_dir, f"{name}_confm.png"))
        pcutoffs_fig.savefig(os.path.join(self.eval_dir, f"{name}_pcutoffs.png"))
        logc_fig.savefig(os.path.join(self.eval_dir, f"{name}_logc.png"))
        return eval_df, report_dict, metrics_fig, pcutoffs_fig, logc_fig

    #################################################
    # EVALUATION METRICS FUNCTIONS
    #################################################

    @classmethod
    def eval_report(cls, y_true: np.ndarray, y_pred: np.ndarray) -> dict:
        """
        Generates a classification report (dict of precision, recall, f1-score, and accuracy) comparing `y_true` and `y_pred`.
        """
        return classification_report(
            y_true=y_true,
            y_pred=y_pred,
            target_names=enum2tuple(GenericBehavLabels),
            output_dict=True,
        )  # type: ignore

    @classmethod
    def eval_conf_matr(cls, y_true: np.ndarray, y_pred: np.ndarray) -> Figure:
        """
        Makes a confusion matrix figure comparing `y_true` and `y_pred`.
        """
        # Making confusion matrix
        fig, ax = plt.subplots(figsize=(7, 7))
        sns.heatmap(
            confusion_matrix(y_true, y_pred),
            annot=True,
            fmt="d",
            cmap="viridis",
            cbar=False,
            xticklabels=enum2tuple(GenericBehavLabels),
            yticklabels=enum2tuple(GenericBehavLabels),
            ax=ax,
        )
        ax.set_xlabel("Predicted")
        ax.set_ylabel("True")
        return fig

    @classmethod
    def eval_metrics_pcutoffs(cls, y_true: np.ndarray, y_prob: np.ndarray) -> Figure:
        """
        Plots precision, recall, f1, and accuracy for the behaviour class across a range of probability cutoffs.
        """
        # Getting precision, recall and accuracy for different cutoffs
        pcutoffs = np.linspace(0, 1, 101)
        # Measures
        precisions = np.zeros(pcutoffs.shape[0])
        recalls = np.zeros(pcutoffs.shape[0])
        f1 = np.zeros(pcutoffs.shape[0])
        accuracies = np.zeros(pcutoffs.shape[0])
        for i, pcutoff in enumerate(pcutoffs):
            y_pred = y_prob > pcutoff
            report = classification_report(
                y_true,
                y_pred,
                target_names=enum2tuple(GenericBehavLabels),
                output_dict=True,
            )
            precisions[i] = report[GenericBehavLabels.BEHAV.value]["precision"]  # type: ignore
            recalls[i] = report[GenericBehavLabels.BEHAV.value]["recall"]  # type: ignore
            f1[i] = report[GenericBehavLabels.BEHAV.value]["f1-score"]  # type: ignore
            accuracies[i] = report["accuracy"]  # type: ignore
        # Making figure
        fig, ax = plt.subplots(figsize=(10, 7))
        sns.lineplot(x=pcutoffs, y=precisions, label="precision", ax=ax)
        sns.lineplot(x=pcutoffs, y=recalls, label="recall", ax=ax)
        sns.lineplot(x=pcutoffs, y=f1, label="f1", ax=ax)
        sns.lineplot(x=pcutoffs, y=accuracies, label="accuracy", ax=ax)
        return fig

    @classmethod
    def eval_logc(cls, y_true: np.ndarray, y_prob: np.ndarray) -> Figure:
        """
        Plots the predicted probabilities against the (jittered) true labels.
        """
        y_eval = pd.DataFrame(
            {
                "y_true": y_true,
                "y_prob": y_prob,
                "y_pred": y_prob > 0.4,
                "y_true_jitter": y_true + (0.2 * (np.random.rand(len(y_prob)) - 0.5)),
            }
        )
        fig, ax = plt.subplots(figsize=(10, 7))
        sns.scatterplot(
            data=y_eval,
            x="y_prob",
            y="y_true_jitter",
            marker=".",
            s=10,
            linewidth=0,
            alpha=0.2,
            ax=ax,
        )
        # Line showing the fraction of predicted probabilities below each cutoff
        pcutoffs = np.linspace(0, 1, 101)
        ratios = np.vectorize(lambda i: np.mean(i > y_eval["y_prob"]))(pcutoffs)
        sns.lineplot(x=pcutoffs, y=ratios, ax=ax)
        return fig

    @classmethod
    def eval_bouts(cls, y_true: np.ndarray, y_pred: np.ndarray) -> pd.DataFrame:
        """
        Summarises prediction accuracy for each bout (contiguous run of the same true label).
        """
        y_eval = pd.DataFrame({"y_true": y_true, "y_pred": y_pred})
        y_eval["ids"] = np.cumsum(y_eval["y_true"] != y_eval["y_true"].shift())
        # Getting the proportion of correct predictions for each bout
        y_eval_grouped = y_eval.groupby("ids")
        y_eval_summary = pd.DataFrame(
            y_eval_grouped.apply(lambda x: (x["y_pred"] == x["y_true"]).mean()),
            columns=["proportion"],
        )
        y_eval_summary["actual_bout"] = y_eval_grouped.apply(lambda x: x["y_true"].mean())
        y_eval_summary["bout_len"] = y_eval_grouped.apply(lambda x: x.shape[0])
        y_eval_summary = y_eval_summary.sort_values("proportion")
        # # Making figure
        # fig, ax = plt.subplots(figsize=(10, 7))
        # sns.scatterplot(
        #     data=y_eval_summary,
        #     x="proportion",
        #     y="bout_len",
        #     hue="actual_bout",
        #     alpha=0.4,
        #     marker=".",
        #     s=50,
        #     linewidth=0,
        #     ax=ax,
        # )
        return y_eval_summary

x_dir property

Returns the model's x directory. It gets the features_extracted directory from the parent Behavysis project directory.

y_dir property

Returns the model's y directory. It gets the scored_behavs directory from the parent Behavysis project directory.

_preproc_x_fit_select_cols(x) staticmethod

Selects only the derived features (not the x-y-l columns).

Used in the preprocessing pipeline.

Source code in behavysis/behav_classifier/behav_classifier.py
@staticmethod
def _preproc_x_fit_select_cols(x: np.ndarray) -> np.ndarray:
    """
    Selects only the derived features (not the x-y-l columns).

    Used in the preprocessing pipeline.
    """
    return x[:, 48:]

clf_eval_save_performance(x_ls, y_ls, index_ls, name)

Evaluates the classifier performance on the given x and y data. Saves the evaluation dataframe, classification report, and figures to the model's evaluation directory.

Returns:

Name Type Description
y_eval DataFrame

Predicted behaviour classifications against the true labels.

report_dict dict

Classification report of performance metrics.

metrics_fig Figure

Figure showing the confusion matrix.

pcutoffs_fig Figure

Figure showing the precision, recall, f1, and accuracy for different pcutoffs.

logc_fig Figure

Figure showing the logistic curve for different predicted probabilities.

Source code in behavysis/behav_classifier/behav_classifier.py
def clf_eval_save_performance(
    self,
    x_ls: list[np.ndarray],
    y_ls: list[np.ndarray],
    index_ls: list[np.ndarray],
    name: str,
) -> tuple[pd.DataFrame, dict, Figure, Figure, Figure]:
    """
    Evaluates the classifier performance on the given x and y data.
    Saves the evaluation dataframe, classification report, and figures to the model's evaluation directory.

    Returns
    -------
    y_eval : pd.DataFrame
        Predicted behaviour classifications against the true labels.
    report_dict : dict
        Classification report of performance metrics.
    metrics_fig : mpl.Figure
        Figure showing the confusion matrix.
    pcutoffs_fig : mpl.Figure
        Figure showing the precision, recall, f1, and accuracy for different pcutoffs.
    logc_fig : mpl.Figure
        Figure showing the logistic curve for different predicted probabilities.
    """
    # Getting predictions
    y_true_ls = [y[index] for y, index in zip(y_ls, index_ls)]
    y_prob_ls = [
        self.clf.predict(x=x, index=index, batch_size=self.configs.batch_size) for x, index in zip(x_ls, index_ls)
    ]
    # Making eval vects
    y_true = np.concatenate(y_true_ls)
    y_prob = np.concatenate(y_prob_ls)
    y_pred = (y_prob > self.configs.pcutoff).astype(int)
    # Making eval_df
    eval_df = BehavPredictedDf.init_df(pd.Series(np.arange(np.concatenate(index_ls).shape[0])))
    eval_df[(self.configs.behav_name, BehavPredictedDf.OutcomesCols.PROB.value)] = y_prob
    eval_df[(self.configs.behav_name, BehavPredictedDf.OutcomesCols.PRED.value)] = y_pred
    eval_df[(self.configs.behav_name, BehavScoredDf.OutcomesCols.ACTUAL.value)] = y_true
    # Making classification report
    report_dict = self.eval_report(y_true, y_pred)
    # Making confusion matrix figure
    metrics_fig = self.eval_conf_matr(y_true, y_pred)
    # Making performance for different pcutoffs figure
    pcutoffs_fig = self.eval_metrics_pcutoffs(y_true, y_prob)
    # Logistic curve
    logc_fig = self.eval_logc(y_true, y_prob)
    # Saving data and figures
    BehavClassifierEvalDf.write(eval_df, os.path.join(self.eval_dir, f"{name}_eval.{BehavClassifierEvalDf.IO}"))
    write_json(os.path.join(self.eval_dir, f"{name}_report.json"), report_dict)
    metrics_fig.savefig(os.path.join(self.eval_dir, f"{name}_confm.png"))
    pcutoffs_fig.savefig(os.path.join(self.eval_dir, f"{name}_pcutoffs.png"))
    logc_fig.savefig(os.path.join(self.eval_dir, f"{name}_logc.png"))
    return eval_df, report_dict, metrics_fig, pcutoffs_fig, logc_fig

combine_dfs(src_dir) classmethod

Combines the data in the given directory into a single dataframe. Adds a MultiIndex level to the rows, with the values as the filenames in the directory.

Source code in behavysis/behav_classifier/behav_classifier.py
@classmethod
def combine_dfs(cls, src_dir):
    """
    Combines the data in the given directory into a single dataframe.
    Adds a MultiIndex level to the rows, with the values as the filenames in the directory.
    """
    data_dict = {get_name(i): DFMixin.read(os.path.join(src_dir, i)) for i in os.listdir(os.path.join(src_dir))}
    df = pd.concat(data_dict.values(), axis=0, keys=data_dict.keys())
    df = BehavClassifierCombinedDf.basic_clean(df)
    return df
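
A small usage sketch (the project path is illustrative; the directory name follows the scored_behavs convention used elsewhere on this page):

import os

from behavysis.behav_classifier.behav_classifier import BehavClassifier

# Combine every scored-behaviour df in the directory into one df; the filenames
# become a new outer level of the row MultiIndex
y_df = BehavClassifier.combine_dfs(os.path.join("/path/to/project", "scored_behavs"))
print(y_df.index.get_level_values(0).unique())  # experiment names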

create_from_project(proj) classmethod

Creates a BehavClassifier for each behaviour scored in the given Behavysis project instance. Wraps the create_from_project_dir method.

Source code in behavysis/behav_classifier/behav_classifier.py
@classmethod
def create_from_project(cls, proj: Project) -> list[BehavClassifier]:
    """
    Creates a BehavClassifier for each behaviour scored in the given Behavysis project instance.
    Wraps the `create_from_project_dir` method.
    """
    return cls.create_from_project_dir(proj.root_dir)

create_from_project_dir(proj_dir) classmethod

Creates a BehavClassifier for each behaviour scored in the given Behavysis project directory.

Source code in behavysis/behav_classifier/behav_classifier.py
@classmethod
def create_from_project_dir(cls, proj_dir: str) -> list:
    """
    Creates a BehavClassifier for each behaviour scored in the given Behavysis project directory.
    """
    # Getting the list of behaviours (after wrangling column names)
    y_df = cls.wrangle_columns_y(cls.combine_dfs(os.path.join(proj_dir, Folders.SCORED_BEHAVS.value)))
    behavs_ls = y_df.columns.to_list()
    # For each behaviour, making a new BehavClassifier instance
    models_ls = [cls(proj_dir, behav) for behav in behavs_ls]
    return models_ls

eval_bouts(y_true, y_pred) classmethod

Summarises prediction accuracy for each bout (contiguous run of the same true label).

Source code in behavysis/behav_classifier/behav_classifier.py
@classmethod
def eval_bouts(cls, y_true: np.ndarray, y_pred: np.ndarray) -> pd.DataFrame:
    """
    Summarises prediction accuracy for each bout (contiguous run of the same true label).
    """
    y_eval = pd.DataFrame({"y_true": y_true, "y_pred": y_pred})
    y_eval["ids"] = np.cumsum(y_eval["y_true"] != y_eval["y_true"].shift())
    # Getting the proportion of correct predictions for each bout
    y_eval_grouped = y_eval.groupby("ids")
    y_eval_summary = pd.DataFrame(
        y_eval_grouped.apply(lambda x: (x["y_pred"] == x["y_true"]).mean()),
        columns=["proportion"],
    )
    y_eval_summary["actual_bout"] = y_eval_grouped.apply(lambda x: x["y_true"].mean())
    y_eval_summary["bout_len"] = y_eval_grouped.apply(lambda x: x.shape[0])
    y_eval_summary = y_eval_summary.sort_values("proportion")
    # # Making figure
    # fig, ax = plt.subplots(figsize=(10, 7))
    # sns.scatterplot(
    #     data=y_eval_summary,
    #     x="proportion",
    #     y="bout_len",
    #     hue="actual_bout",
    #     alpha=0.4,
    #     marker=".",
    #     s=50,
    #     linewidth=0,
    #     ax=ax,
    # )
    return y_eval_summary

eval_conf_matr(y_true, y_pred) classmethod

Makes a confusion matrix figure comparing y_true and y_pred.

Source code in behavysis/behav_classifier/behav_classifier.py
@classmethod
def eval_conf_matr(cls, y_true: np.ndarray, y_pred: np.ndarray) -> Figure:
    """
    Makes a confusion matrix figure comparing `y_true` and `y_pred`.
    """
    # Making confusion matrix
    fig, ax = plt.subplots(figsize=(7, 7))
    sns.heatmap(
        confusion_matrix(y_true, y_pred),
        annot=True,
        fmt="d",
        cmap="viridis",
        cbar=False,
        xticklabels=enum2tuple(GenericBehavLabels),
        yticklabels=enum2tuple(GenericBehavLabels),
        ax=ax,
    )
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    return fig

eval_logc(y_true, y_prob) classmethod

Plots the predicted probabilities against the (jittered) true labels.

Source code in behavysis/behav_classifier/behav_classifier.py
@classmethod
def eval_logc(cls, y_true: np.ndarray, y_prob: np.ndarray) -> Figure:
    """
    Plots the predicted probabilities against the (jittered) true labels.
    """
    y_eval = pd.DataFrame(
        {
            "y_true": y_true,
            "y_prob": y_prob,
            "y_pred": y_prob > 0.4,
            "y_true_jitter": y_true + (0.2 * (np.random.rand(len(y_prob)) - 0.5)),
        }
    )
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.scatterplot(
        data=y_eval,
        x="y_prob",
        y="y_true_jitter",
        marker=".",
        s=10,
        linewidth=0,
        alpha=0.2,
        ax=ax,
    )
    # Line showing the fraction of predicted probabilities below each cutoff
    pcutoffs = np.linspace(0, 1, 101)
    ratios = np.vectorize(lambda i: np.mean(i > y_eval["y_prob"]))(pcutoffs)
    sns.lineplot(x=pcutoffs, y=ratios, ax=ax)
    return fig

eval_metrics_pcutoffs(y_true, y_prob) classmethod

Plots precision, recall, f1, and accuracy for the behaviour class across a range of probability cutoffs.

Source code in behavysis/behav_classifier/behav_classifier.py
@classmethod
def eval_metrics_pcutoffs(cls, y_true: np.ndarray, y_prob: np.ndarray) -> Figure:
    """
    Plots precision, recall, f1, and accuracy for the behaviour class across a range of probability cutoffs.
    """
    # Getting precision, recall and accuracy for different cutoffs
    pcutoffs = np.linspace(0, 1, 101)
    # Measures
    precisions = np.zeros(pcutoffs.shape[0])
    recalls = np.zeros(pcutoffs.shape[0])
    f1 = np.zeros(pcutoffs.shape[0])
    accuracies = np.zeros(pcutoffs.shape[0])
    for i, pcutoff in enumerate(pcutoffs):
        y_pred = y_prob > pcutoff
        report = classification_report(
            y_true,
            y_pred,
            target_names=enum2tuple(GenericBehavLabels),
            output_dict=True,
        )
        precisions[i] = report[GenericBehavLabels.BEHAV.value]["precision"]  # type: ignore
        recalls[i] = report[GenericBehavLabels.BEHAV.value]["recall"]  # type: ignore
        f1[i] = report[GenericBehavLabels.BEHAV.value]["f1-score"]  # type: ignore
        accuracies[i] = report["accuracy"]  # type: ignore
    # Making figure
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.lineplot(x=pcutoffs, y=precisions, label="precision", ax=ax)
    sns.lineplot(x=pcutoffs, y=recalls, label="recall", ax=ax)
    sns.lineplot(x=pcutoffs, y=f1, label="f1", ax=ax)
    sns.lineplot(x=pcutoffs, y=accuracies, label="accuracy", ax=ax)
    return fig

eval_report(y_true, y_pred) classmethod

Generates a classification report (dict of precision, recall, f1-score, and accuracy) comparing y_true and y_pred.

Source code in behavysis/behav_classifier/behav_classifier.py
@classmethod
def eval_report(cls, y_true: np.ndarray, y_pred: np.ndarray) -> dict:
    """
    Generates a classification report (dict of precision, recall, f1-score, and accuracy) comparing `y_true` and `y_pred`.
    """
    return classification_report(
        y_true=y_true,
        y_pred=y_pred,
        target_names=enum2tuple(GenericBehavLabels),
        output_dict=True,
    )  # type: ignore

load(proj_dir, behav_name) classmethod

Reads the model from the expected model file.

Source code in behavysis/behav_classifier/behav_classifier.py
@classmethod
def load(cls, proj_dir: str, behav_name: str) -> BehavClassifier:
    """
    Reads the model from the expected model file.
    """
    # Checking that the configs file exists and is valid
    configs_fp = os.path.join(proj_dir, "behav_models", behav_name, "configs.json")
    try:
        BehavClassifierConfigs.read_json(configs_fp)
    except (FileNotFoundError, OSError):
        raise ValueError(
            f'Model in project directory, "{proj_dir}", and behav name, "{behav_name}", not found.\n'
            "Please check file path."
        )
    return cls(proj_dir, behav_name)
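
For example (the project path and behaviour name are illustrative):

from behavysis.behav_classifier.behav_classifier import BehavClassifier

# Raises ValueError if behav_models/<behav_name>/configs.json does not exist
model = BehavClassifier.load("/path/to/project", "fight")
print(model.configs.clf_struct, model.clf_dir)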

pipeline_inference(x_df)

Given the unprocessed features dataframe, runs the model pipeline to make predictions.

Pipeline is:
- Preprocesses the x df. Refer to preproc_x_transform for details.
- Makes predictions and returns the predicted behaviours.

Source code in behavysis/behav_classifier/behav_classifier.py
def pipeline_inference(self, x_df: pd.DataFrame) -> pd.DataFrame:
    """
    Given the unprocessed features dataframe, runs the model pipeline to make predictions.

    Pipeline is:
    - Preprocesses the `x` df. Refer to
    [behavysis.behav_classifier.BehavClassifier.preproc_x_transform][] for details.
    - Makes predictions and returns the predicted behaviours.
    """
    index = x_df.index
    # Preprocessing features
    x = self.preproc_x_transform(x_df.values, self.preproc_fp)
    # Loading the model
    self.clf = joblib_load(self.clf_fp)
    # Getting probabilities
    y_prob = self.clf.predict(
        x=x,
        index=np.arange(x.shape[0]),
        batch_size=self.configs.batch_size,
    )
    # Making predictions from probabilities (and pcutoff)
    y_pred = (y_prob > self.configs.pcutoff).astype(int)
    # Making df
    pred_df = BehavPredictedDf.init_df(pd.Series(index))
    pred_df[(self.configs.behav_name, BehavPredictedDf.OutcomesCols.PROB.value)] = y_prob
    pred_df[(self.configs.behav_name, BehavPredictedDf.OutcomesCols.PRED.value)] = y_pred
    return pred_df
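
A hedged sketch of inference (the project path, behaviour name, and the placeholder features dataframe are illustrative):

from behavysis.behav_classifier.behav_classifier import BehavClassifier

model = BehavClassifier.load("/path/to/project", "fight")
# x_df: an unprocessed features dataframe for one experiment
# (e.g. read from the project's features_extracted directory)
x_df = ...
pred_df = model.pipeline_inference(x_df)
# pred_df holds a probability column and a thresholded prediction column for the behaviour
print(pred_df.head())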

pipeline_training()

Trains the classifier and saves it (together with its evaluation) to the classifier's directory.

The classifier used is the currently set clf model (e.g. one of the ClfTemplates).

Source code in behavysis/behav_classifier/behav_classifier.py
def pipeline_training(self) -> None:
    """
    Trains the classifier and saves it (together with its evaluation) to the classifier's directory.

    The classifier used is the currently set `clf` model (e.g. one of the `ClfTemplates`).
    """
    self.logger.info(f"Training {self.configs.clf_struct}")
    # Preparing data
    x_ls, y_ls, index_train_ls, index_test_ls = self.preproc_training()
    # Training the model
    history = self.clf.fit(
        x_ls=x_ls,
        y_ls=y_ls,
        index_ls=index_train_ls,
        batch_size=self.configs.batch_size,
        epochs=self.configs.epochs,
        val_split=self.configs.val_split,
    )
    # Saving history
    self.clf_eval_save_history(history)
    # Evaluating on train and test data
    self.clf_eval_save_performance(x_ls, y_ls, index_train_ls, "train")
    self.clf_eval_save_performance(x_ls, y_ls, index_test_ls, "test")
    # Saving model
    joblib_dump(self.clf, self.clf_fp)
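
A sketch of a typical training call (the project path and behaviour name are illustrative):

from behavysis.behav_classifier.behav_classifier import BehavClassifier

model = BehavClassifier.load("/path/to/project", "fight")
# Trains the currently set classifier; the fitted model is saved to
# <model_dir>/classifiers/<clf_struct>/classifier.sav and the evaluation outputs
# are written to that classifier's evaluation subdirectory
model.pipeline_training()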

pipeline_training_all()

Trains a classifier for each of the available templates.

Source code in behavysis/behav_classifier/behav_classifier.py
def pipeline_training_all(self):
    """
    Trains a classifier for each of the available templates.
    """
    # Saving existing clf
    clf = self.clf
    for clf_cls in CLF_TEMPLATES:
        # Initialising the model
        self.clf = clf_cls()
        # Building pipeline, which runs and saves evaluation
        self.pipeline_training()
    # Restoring clf
    self.clf = clf

preproc_training()

Prepares the data for the training pipeline.

Performs the following:
- Combines dfs from the x and y directories (individual experiment data).
- Ensures the x and y dfs have the same index and are in the same row order.
- Preprocesses the x dfs. Refer to preproc_x_fit and preproc_x_transform for details.
- Selects the y behaviour column (given in the configs file) from each y df and replaces undetermined values with non-behaviour values.
- Splits into training and test indexes.
    - The training indexes are over- and under-sampled to the ratios given in the configs.

Returns:

Type Description
A tuple containing four lists of numpy arrays:
- x_ls: list of each dataframe's input data.
- y_ls: list of each dataframe's target labels.
- index_train_ls: list of each dataframe's indexes for the training data.
- index_test_ls: list of each dataframe's indexes for the testing data.
Source code in behavysis/behav_classifier/behav_classifier.py
def preproc_training(
    self,
) -> tuple[list[np.ndarray], list[np.ndarray], list[np.ndarray], list[np.ndarray]]:
    """
    Prepares the data for the training pipeline.

    Performs the following:
    - Combines dfs from the x and y directories (individual experiment data).
    - Ensures the x and y dfs have the same index and are in the same row order.
    - Preprocesses the x dfs. Refer to `preproc_x_fit` and `preproc_x_transform` for details.
    - Selects the y behaviour column (given in the configs file) from each y df and replaces
      undetermined values with non-behaviour values.
    - Splits into training and test indexes.
        - The training indexes are over- and under-sampled to the ratios given in the configs.

    Returns
    -------
    A tuple containing four lists of numpy arrays:
    - x_ls: list of each dataframe's input data.
    - y_ls: list of each dataframe's target labels.
    - index_train_ls: list of each dataframe's indexes for the training data.
    - index_test_ls: list of each dataframe's indexes for the testing data.
    """
    # Getting the lists of x and y dfs
    x_fp_ls = [os.path.join(self.x_dir, i) for i in os.listdir(os.path.join(self.x_dir))]
    y_fp_ls = [os.path.join(self.y_dir, i) for i in os.listdir(os.path.join(self.y_dir))]
    x_df_ls = async_read_files_run(x_fp_ls, FeaturesDf.read)
    y_df_ls = async_read_files_run(y_fp_ls, BehavScoredDf.read)
    # Formatting y dfs (selecting column and replacing UNDETERMINED with NON_BEHAV values)
    y_df_ls = [
        y[(self.configs.behav_name, BehavScoredDf.OutcomesCols.ACTUAL.value)].replace(
            BehavValues.UNDETERMINED.value, BehavValues.NON_BEHAV.value
        )
        for y in y_df_ls
    ]
    # Ensuring x and y dfs have the same index and are in the same row order
    index_df_ls = [x.index.intersection(y.index) for x, y in zip(x_df_ls, y_df_ls)]
    x_df_ls = [x.loc[index] for x, index in zip(x_df_ls, index_df_ls)]
    y_df_ls = [y.loc[index] for y, index in zip(y_df_ls, index_df_ls)]
    assert np.all([x.shape[0] == y.shape[0] for x, y in zip(x_df_ls, y_df_ls)])
    # Converting to numpy arrays
    x_ls = [x.values for x in x_df_ls]
    y_ls = [y.values for y in y_df_ls]
    index_ls = [np.arange(x.shape[0]) for x in x_ls]
    # x preprocessing: fitting (across all x dfs) and transforming (for each x df)
    self.preproc_x_fit(np.concatenate(x_ls, axis=0), self.preproc_fp)
    x_ls = [self.preproc_x_transform(x, self.preproc_fp) for x in x_ls]
    # Making a 2D array of (df_index, index, y) for train-test splitting, stratifying and sampling
    index_flat = listofvects2array(index_ls, y_ls)
    # Splitting into train and test indexes
    index_train_flat, index_test_flat = train_test_split(
        index_flat,
        test_size=self.configs.test_split,
        stratify=index_flat[:, 2],
    )
    # Oversampling and undersampling ONLY on training data
    index_train_flat = self.oversample(index_train_flat, index_train_flat[:, 2], self.configs.oversample_ratio)
    index_train_flat = self.undersample(index_train_flat, index_train_flat[:, 2], self.configs.undersample_ratio)
    # Reshaping back to individual df index lists
    index_train_ls = array2listofvect(index_train_flat, 1)
    index_test_ls = array2listofvect(index_test_flat, 1)
    return x_ls, y_ls, index_train_ls, index_test_ls
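
As a sketch, the returned lists can be used to inspect the class balance of the resampled training split (the project path, behaviour name, and the 0/1 label assumption are illustrative):

import numpy as np

from behavysis.behav_classifier.behav_classifier import BehavClassifier

model = BehavClassifier.load("/path/to/project", "fight")
x_ls, y_ls, index_train_ls, index_test_ls = model.preproc_training()
# Fraction of behaviour frames in the resampled training split
# (assumes behaviour frames are labelled 1 and non-behaviour frames 0)
y_train = np.concatenate([y[idx] for y, idx in zip(y_ls, index_train_ls)])
print("train positive rate:", y_train.mean())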

preproc_x_fit(x, preproc_fp) classmethod

The preprocessing steps are:
- Select only the derived features (not the x-y-l columns)
    - 2 (indivs) * 8 (bpts) * 3 (coords) = 48 (columns) before derived features
- MinMax scaling (using previously fitted MinMaxScaler)

Source code in behavysis/behav_classifier/behav_classifier.py
@classmethod
def preproc_x_fit(cls, x: np.ndarray, preproc_fp: str) -> None:
    """
    The preprocessing steps are:
    - Select only the derived features (not the x-y-l columns)
        - 2 (indivs) * 8 (bpts) * 3 (coords) = 48 (columns) before derived features
    - MinMax scaling (using previously fitted MinMaxScaler)
    """
    preproc_pipe = Pipeline(
        steps=[
            ("select_columns", FunctionTransformer(cls._preproc_x_fit_select_cols)),
            ("min_max_scaler", MinMaxScaler()),
        ]
    )
    preproc_pipe.fit(x)
    joblib_dump(preproc_pipe, preproc_fp)

preproc_x_transform(x, preproc_fp) classmethod

Runs the preprocessing steps fitted from preproc_x_fit on the given x data.

Source code in behavysis/behav_classifier/behav_classifier.py
@classmethod
def preproc_x_transform(cls, x: np.ndarray, preproc_fp: str) -> np.ndarray:
    """
    Runs the preprocessing steps fitted from `preproc_x_fit` on the given `x` data.
    """
    preproc_pipe: Pipeline = joblib_load(preproc_fp)
    x_preproc = preproc_pipe.transform(x)
    return x_preproc
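
A small sketch of the fit/transform pairing on a synthetic array (the 60-column shape and the local preproc.sav path are assumptions; real arrays come from the features_extracted dfs):

import numpy as np

from behavysis.behav_classifier.behav_classifier import BehavClassifier

# Synthetic features: 48 pose (x-y-l) columns followed by 12 derived-feature columns
x = np.random.rand(1000, 60)
BehavClassifier.preproc_x_fit(x, "preproc.sav")
x_proc = BehavClassifier.preproc_x_transform(x, "preproc.sav")
print(x_proc.shape)  # (1000, 12): only the derived features remain, scaled to [0, 1]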

wrangle_columns_y(y) classmethod

Filters the y dataframe to exclude the predicted (pred) outcome columns, and renames the remaining columns from (behav, outcome) to {behav}__{outcome} (or just {behav} for the actual outcome).

Source code in behavysis/behav_classifier/behav_classifier.py
@classmethod
def wrangle_columns_y(cls, y: pd.DataFrame) -> pd.DataFrame:
    """
    Filters the `y` dataframe to exclude the predicted (`pred`) outcome columns, and renames
    the remaining columns from `(behav, outcome)` to `{behav}__{outcome}` (or just `{behav}`
    for the `actual` outcome).
    """
    # Filtering out the pred columns (in the `outcomes` level)
    columns_filter = np.isin(
        y.columns.get_level_values(BehavScoredDf.CN.OUTCOMES.value),
        [BehavScoredDf.OutcomesCols.PRED.value],
        invert=True,
    )
    y = y.loc[:, columns_filter]
    # Setting the column names from `(behav, outcome)` to `{behav}__{outcome}`
    y.columns = [
        f"{behav_name}"
        if outcome_name == BehavScoredDf.OutcomesCols.ACTUAL.value
        else f"{behav_name}__{outcome_name}"
        for behav_name, outcome_name in y.columns
    ]
    return y
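
An illustration of the renaming convention on a toy dataframe (the level names and outcome labels are assumptions for this sketch, not the package's actual constants):

import pandas as pd

# Columns follow the (behav, outcome) structure described above
cols = pd.MultiIndex.from_tuples(
    [("fight", "actual"), ("fight", "user1"), ("fight", "pred")],
    names=["behavs", "outcomes"],
)
y = pd.DataFrame([[1, 0, 1]], columns=cols)
# Same convention as wrangle_columns_y: drop "pred", keep "actual" as the bare behaviour
# name, and suffix the other outcomes
renamed = [b if o == "actual" else f"{b}__{o}" for b, o in y.columns if o != "pred"]
print(renamed)  # ['fight', 'fight__user1']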

behavysis.pydantic_models.behav_classifier_configs.BehavClassifierConfigs

Bases: PydanticBaseModel

Source code in behavysis/pydantic_models/behav_classifier_configs.py
class BehavClassifierConfigs(PydanticBaseModel):
    proj_dir: str = "project_dir"
    behav_name: str = "behav_name"
    seed: int = 42
    oversample_ratio: float = 0.2
    undersample_ratio: float = 0.4

    clf_struct: str = "clf"  # Classifier type (defined in ClfTemplates)
    pcutoff: float = 0.2
    test_split: float = 0.2
    val_split: float = 0.2
    batch_size: int = 256
    epochs: int = 100
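
A sketch of creating and persisting a configs file (the field values and file path are illustrative; write_json and read_json are the methods used elsewhere on this page):

from behavysis.pydantic_models.behav_classifier_configs import BehavClassifierConfigs

configs = BehavClassifierConfigs(behav_name="fight", pcutoff=0.5, epochs=50)
configs.write_json("/path/to/project/behav_models/fight/configs.json")
# Later, the same configs can be read back
configs = BehavClassifierConfigs.read_json("/path/to/project/behav_models/fight/configs.json")
print(configs.clf_struct, configs.batch_size)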