
Behav classifier

behavysis_pipeline.behav_classifier.behav_classifier.BehavClassifier

BehavClassifier abstract class performs behaviour classifier model preparation, training, saving, evaluation, and inference.

Attributes:

Name Type Description
configs_fp str

Filepath of the model's configs JSON.

clf BaseTorchModel

The classifier model instance (None until built or loaded).
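
A minimal usage sketch, assuming a behavysis `Project` whose feature-extraction and behaviour-scoring steps have already run (the `Project` import path and the filesystem path are illustrative assumptions):

```python
from behavysis_pipeline.pipeline import Project  # assumed import path
from behavysis_pipeline.behav_classifier.behav_classifier import BehavClassifier

proj = Project("/path/to/project")
# One BehavClassifier is created per scored behaviour; the project's
# features and scored-behaviour dataframes are imported once.
models = BehavClassifier.create_from_project(proj)
for model in models:
    print(model.configs.behaviour_name)
```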

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
class BehavClassifier:
    """
    BehavClassifier abstract class performs behaviour classifier model preparation,
    training, saving, evaluation, and inference.

    Attributes
    ----------
    configs_fp
        Filepath of the model's configs JSON.
    clf
        The classifier model instance (None until built or loaded).
    """

    configs_fp: str
    clf: BaseTorchModel

    def __init__(self, configs_fp: str) -> None:
        """
        Make a BehavClassifier instance.

        Parameters
        ----------
        configs_fp :
            Filepath of the model's configs JSON. Read if it exists; otherwise
            a new default configs file is created at this path.
        """
        # Storing configs json fp
        self.configs_fp = configs_fp
        self.clf = None
        # Trying to read in configs json. Making a new one if it doesn't exist
        try:
            configs = BehavClassifierConfigs.read_json(self.configs_fp)
            logging.info("Reading existing model configs")
        except FileNotFoundError:
            configs = BehavClassifierConfigs()
            logging.info("Making new model configs")
        # Saving configs
        configs.write_json(self.configs_fp)

    #################################################
    # CREATE MODEL METHODS
    #################################################

    @classmethod
    def create_from_project(cls, proj: Project) -> list[BehavClassifier]:
        """
        Creates a BehavClassifier for each scored behaviour in the given Project instance.

        Parameters
        ----------
        proj :
            The Project instance.

        Returns
        -------
        :
            The list of created BehavClassifier instances, one per behaviour.
        """
        # Getting the list of behaviours
        y_df = cls.wrangle_columns_y(
            cls.combine(os.path.join(proj.root_dir, Folders.SCORED_BEHAVS.value))
        )
        # For each behaviour, making a new BehavClassifier instance
        behavs_ls = y_df.columns.to_list()
        models_dir = os.path.join(proj.root_dir, BEHAV_MODELS_SUBDIR)
        models_ls = [cls.create_new_model(models_dir, behav) for behav in behavs_ls]
        # Importing data from project to the behav models folder (only need one model for this)
        if len(models_ls) > 0:
            models_ls[0].import_data(
                os.path.join(proj.root_dir, Folders.FEATURES_EXTRACTED.value),
                os.path.join(proj.root_dir, Folders.SCORED_BEHAVS.value),
                False,
            )
        return models_ls

    @classmethod
    def create_new_model(cls, root_dir: str, behaviour_name: str) -> BehavClassifier:
        """
        Creating a new BehavClassifier model in the given directory
        """
        configs_fp = os.path.join(root_dir, f"{behaviour_name}.json")
        # Making new BehavClassifier instance
        inst = cls(configs_fp)
        # Updating configs with project data
        configs = inst.configs
        configs.behaviour_name = behaviour_name
        configs.write_json(inst.configs_fp)
        # Returning model
        return inst

    def create_from_model(self, root_dir: str, behaviour_name: str) -> BehavClassifier:
        """
        Creates a new BehavClassifier model in the given directory, copying this
        instance's configs (but with the given behaviour_name).
        """
        # Making new BehavClassifier instance (create_new_model expects the root directory)
        inst = self.create_new_model(root_dir, behaviour_name)
        # Using current instance's configs (but using given behaviour_name)
        configs = self.configs
        configs.behaviour_name = behaviour_name
        configs.write_json(inst.configs_fp)
        # Returning model
        return inst

    #################################################
    #            READING MODEL
    #################################################

    @classmethod
    def load(cls, configs_fp: str) -> BehavClassifier:
        """
        Reads the model from the expected model file.
        """
        if not os.path.isfile(configs_fp):
            raise FileNotFoundError(f"The model file does not exist: {configs_fp}")
        return cls(configs_fp)

    #################################################
    #            GETTER AND SETTERS
    #################################################

    @property
    def configs(self) -> BehavClassifierConfigs:
        """Returns the config model from the expected config file."""
        return BehavClassifierConfigs.read_json(self.configs_fp)

    @property
    def root_dir(self) -> str:
        """Returns the model's root directory"""
        return os.path.dirname(self.configs_fp)

    @property
    def clf_fp(self) -> str:
        """Returns the model's filepath"""
        path = os.path.join(self.root_dir, self.configs.behaviour_name, "model.sav")
        os.makedirs(os.path.dirname(path), exist_ok=True)
        return path

    @property
    def preproc_fp(self) -> str:
        """Returns the model's preprocessor filepath"""
        path = os.path.join(self.root_dir, self.configs.behaviour_name, "preproc.sav")
        os.makedirs(os.path.dirname(path), exist_ok=True)
        return path

    @property
    def eval_dir(self) -> str:
        """Returns the model's evaluation directory"""
        path = os.path.join(self.root_dir, self.configs.behaviour_name, "eval")
        os.makedirs(path, exist_ok=True)
        return path

    #################################################
    #            IMPORTING DATA TO MODEL
    #################################################

    def import_data(self, x_dir: str, y_dir: str, overwrite=False) -> None:
        """
        Importing data from extracted features and labelled behaviours dataframes.

        Parameters
        ----------
        x_dir :
            Directory containing the extracted-features dataframes.
        y_dir :
            Directory containing the labelled-behaviours dataframes.
        """
        # For each x and y directory
        for in_dir, data_id in ((x_dir, X_ID), (y_dir, Y_ID)):
            out_dir = os.path.join(self.root_dir, data_id)
            os.makedirs(out_dir, exist_ok=True)
            # Copying each file to model root directory
            for fp in os.listdir(in_dir):
                in_fp = os.path.join(in_dir, fp)
                out_fp = os.path.join(out_dir, fp)
                # If not overwriting and out file already exists, then skip
                if not overwrite and os.path.exists(out_fp):
                    continue
                # Copying file
                shutil.copyfile(in_fp, out_fp)

    #################################################
    #            COMBINING DFS TO SINGLE DF
    #################################################

    @staticmethod
    def combine(src_dir):
        """
        Concatenates all dataframes in `src_dir` into a single dataframe,
        keyed by each file's name (without its extension).
        """
        data_dict = {
            os.path.splitext(i)[0]: pd.read_feather(os.path.join(src_dir, i))
            for i in os.listdir(src_dir)
        }
        return pd.concat(data_dict.values(), axis=0, keys=data_dict.keys())

    def combine_dfs(self) -> tuple[pd.DataFrame, pd.DataFrame]:
        """
        Combines the data into a single `X` df, `y` df, and index.
        The indexes of `x` and `y` will be the same (with an inner join).

        Returns
        -------
        x :
            Features dataframe of all experiments in the `x` directory
        y :
            Outcomes dataframe of all experiments in the `y` directory
        """
        # Getting the x and y dfs
        x = BehavClassifier.combine(os.path.join(self.root_dir, X_ID))
        y = BehavClassifier.combine(os.path.join(self.root_dir, Y_ID))
        # Getting the intersection of the x and y row indexes
        index = x.index.intersection(y.index)
        x = x.loc[index]
        y = y.loc[index]
        # Assert that x and y are the same length
        assert x.shape[0] == y.shape[0]
        # Returning the x and y dfs
        return x, y

    #################################################
    #            PREPROCESSING DFS
    #################################################

    @staticmethod
    def preproc_x_fit(x: np.ndarray, preproc_fp: str) -> None:
        """
        Fits the preprocessing pipeline (MinMax scaling) on `x` and saves it to `preproc_fp`.
        """
        # Making pipeline
        preproc_pipe = Pipeline(steps=[("MinMaxScaler", MinMaxScaler())])
        # Fitting pipeline
        preproc_pipe.fit(x)
        # Saving pipeline
        joblib.dump(preproc_pipe, preproc_fp)

    @staticmethod
    def preproc_x(x: np.ndarray, preproc_fp: str) -> np.ndarray:
        """
        The preprocessing steps are:
        - MinMax scaling (using previously fitted MinMaxScaler)
        """
        # Loading in pipeline
        preproc_pipe = joblib.load(preproc_fp)
        # Uses trained fit for preprocessing new data
        x = preproc_pipe.transform(x)
        # Returning df
        return x

    @staticmethod
    def wrangle_columns_y(y: pd.DataFrame) -> pd.DataFrame:
        """
        Converts the y dataframe's MultiIndex `(behav, outcome)` columns to
        single-level columns.

        Parameters
        ----------
        y :
            Behaviour-outcomes dataframe with `(behav, outcome)` MultiIndex columns.

        Returns
        -------
        :
            Dataframe with single-level columns: each `actual` column becomes
            `{behav}`, and every other outcome column becomes `{behav}__{outcome}`.
        """
        # Filtering out the prob and pred columns (in the `outcomes` level)
        cols_filter = np.isin(
            y.columns.get_level_values(BehavCN.OUTCOMES.value),
            [BehavColumns.PROB.value, BehavColumns.PRED.value],
            invert=True,
        )
        y = y.loc[:, cols_filter]
        # Converting MultiIndex columns to single columns by
        # setting the column names from `(behav, outcome)` to `{behav}__{outcome}`
        y.columns = [
            f"{i[0]}" if i[1] == BehavColumns.ACTUAL.value else f"{i[0]}__{i[1]}"
            for i in y.columns
        ]
        return y

    @staticmethod
    def preproc_y(y: np.ndarray) -> np.ndarray:
        """
        The preprocessing steps are:
        - Imputing NaN values with 0
        - Setting -1 (i.e. "undecided") to 0
        """
        # Imputing NaN values with 0
        y = np.nan_to_num(y, nan=0)
        # Setting -1 to 0 (i.e. "undecided" to "no behaviour")
        y = np.maximum(y, 0)
        # Returning arr
        return y

    @staticmethod
    def undersample(index: np.ndarray, y: np.ndarray, ratio: float) -> np.ndarray:
        """
        Undersamples the negative (`y == 0`) indices so the positive-to-negative
        ratio becomes approximately `ratio`, returning the combined sorted index.
        """
        # Assert that index and y are the same length
        assert index.shape[0] == y.shape[0]
        # Getting array of True indices
        t = index[y == 1]
        # Getting array of False indices
        f = index[y == 0]
        # Undersampling the False indices
        f = np.random.choice(f, size=int(t.shape[0] / ratio), replace=False)
        # Combining the True and False indices
        uindex = np.union1d(t, f)
        # Returning the undersampled index
        return uindex

    #################################################
    #            PIPELINE FOR DATA PREP
    #################################################

    def prepare_data_training(self) -> tuple[np.ndarray, np.ndarray]:
        """
        Prepares the data (`x` and `y`) in the model for training.
        Data is taken from the model's `x` and `y` dirs.

        Performs the following:
        - Combining dfs from x and y directories (individual experiment data)
        - Ensures the x and y dfs have the same index, and are in the same row order
        - Preprocesses x df. Refer to `preprocess_x` for details.
        - Selects the y class (given in the configs file) from the y df.
        - Preprocesses y df. Refer to `preprocess_y` for details.

        Returns
        -------
        x : np.ndarray
            Features array in the format: `(samples, window, features)`
        y : np.ndarray
            Outcomes array in the format: `(samples, class)`
        """
        # Combining dfs from x and y directories (individual experiment data)
        x, y = self.combine_dfs()
        # Fitting the preprocessor pipeline
        self.preproc_x_fit(x, self.preproc_fp)
        # Preprocessing x df
        x = self.preproc_x(x, self.preproc_fp)
        # Preprocessing y df
        y = self.wrangle_columns_y(y)[self.configs.behaviour_name].values
        y = self.preproc_y(y)
        # Returning x, y, and index to use
        return x, y

    def prepare_data_training_pipeline(
        self,
    ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        """
        Prepares the data for the training pipeline.

        Performs the following:
        - Preprocesses `x` and `y` data. Refer to `prepare_data_training` for details.
        - Splits into training and test indexes.
            - The training indexes are undersampled to the ratio given in the configs.

        Returns:
            A tuple containing four numpy arrays:
            - x: The input data.
            - y: The target labels.
            - ind_train: The indexes for the training data.
            - ind_test: The indexes for the testing data.
        """
        # Preparing data
        x, y = self.prepare_data_training()
        # Getting entire index
        index = np.arange(x.shape[0])
        # Splitting into train and test indexes
        ind_train, ind_test = train_test_split(
            index,
            test_size=self.configs.test_split,
            stratify=y[index],
        )
        # Undersampling training index
        ind_train = self.undersample(
            ind_train, y[ind_train], self.configs.undersample_ratio
        )
        # Return
        return x, y, ind_train, ind_test

    def prepare_data(self, x: pd.DataFrame) -> np.ndarray:
        """
        Prepares novel (`x` only) data, given the `x` pd.DataFrame.

        Performs the following:
        - Preprocesses x df. Refer to `preprocess_x` for details.
        - Makes the X windowed array, for each index.

        Returns
        -------
        x : np.ndarray
            Features array in the format: `(samples, window, features)`
        """
        # Preprocessing x df
        x = self.preproc_x(x, self.preproc_fp)
        # Returning x
        return x

    #################################################
    # PIPELINE FOR CLASSIFIER TRAINING AND INFERENCE
    #################################################

    def pipeline_build(self, clf_init_f: Callable) -> None:
        """
        Makes a classifier and saves it to the model's root directory.

        Callable is a method from `ClfTemplates`.
        """
        # Preparing data
        x, y, ind_train, ind_test = self.prepare_data_training_pipeline()
        # Initialising the model
        self.clf = clf_init_f()
        # Training the model
        history = self.clf.fit(
            x=x,
            y=y,
            index=ind_train,
            batch_size=self.configs.batch_size,
            epochs=self.configs.epochs,
            val_split=self.configs.val_split,
        )
        # Saving history
        self.clf_eval_save_history(history)
        # Evaluating the model
        self.clf_eval(x, y, ind_test)
        # Updating the model configs
        configs = self.configs
        configs.clf_structure = clf_init_f.__name__
        configs.write_json(self.configs_fp)
        # Saving the model to disk
        self.clf_save()

    def pipeline_run(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        Given the unprocessed features dataframe, runs the model pipeline to make predictions.

        Pipeline is:
        - Preprocess `x` df. Refer to
        [behavysis_pipeline.behav_classifier.BehavClassifier.preproc_x][] for details.
        - Makes predictions and returns the predicted behaviours.
        """
        # Saving index for later
        index = x.index
        # Preprocessing features
        x = self.prepare_data(x)
        # Loading the model
        self.clf_load()
        # Making predictions
        y_eval = self.clf_predict(x, self.configs.batch_size)
        # Settings the index
        y_eval.index = index
        # Returning predictions
        return y_eval

    #################################################
    # MODEL CLASSIFIER METHODS
    #################################################

    def clf_load(self):
        """
        Loads the model stored in `<root_dir>/<behav_name>/model.sav` to the model attribute.
        """
        self.clf = joblib.load(self.clf_fp)

    def clf_save(self):
        """
        Saves the model's classifier to `<root_dir>/<behav_name>/model.sav`.
        """
        joblib.dump(self.clf, self.clf_fp)

    def clf_predict(
        self,
        x: np.ndarray,
        batch_size: int,
        index: Optional[np.ndarray] = None,
    ) -> pd.DataFrame:
        """
        Making predictions using the given model and preprocessed features.
        Assumes the x array is already preprocessed.

        Parameters
        ----------
        x : np.ndarray
            Preprocessed features.

        Returns
        -------
        pd.DataFrame
            Predicted behaviour classifications. Dataframe columns are in the format:
            ```
            behaviours :  behav    behav
            outcomes   :  "prob"   "pred"
            ```
        """
        # Getting probabilities
        index = np.arange(x.shape[0]) if index is None else index
        y_probs = self.clf.predict(
            x=x,
            index=index,
            batch_size=batch_size,
        )
        # Making predictions from probabilities (and pcutoff)
        y_preds = (y_probs > self.configs.pcutoff).astype(int)
        # Making df
        pred_df = BehavMixin.init_df(index)
        pred_df[(self.configs.behaviour_name, BehavColumns.PROB.value)] = y_probs
        pred_df[(self.configs.behaviour_name, BehavColumns.PRED.value)] = y_preds
        # Returning predicted behavs
        return pred_df

    #################################################
    # COMPREHENSIVE EVALUATION FUNCTIONS
    #################################################

    def clf_eval_save_history(self, history: pd.DataFrame, name: Optional[str] = ""):
        # Saving history df
        DFIOMixin.write_feather(
            history, os.path.join(self.eval_dir, f"{name}_history.feather")
        )
        # Making and saving history figure
        fig, ax = plt.subplots(figsize=(10, 7))
        sns.lineplot(data=history, ax=ax)
        fig.savefig(os.path.join(self.eval_dir, f"{name}_history.png"))

    def clf_eval(
        self,
        x: np.ndarray,
        y: np.ndarray,
        index: Optional[np.ndarray] = None,
        name: Optional[str] = "",
    ) -> tuple[pd.DataFrame, dict, plt.Figure, plt.Figure, plt.Figure]:
        """
        Evaluates the classifier performance on the given x and y data.
        Saves the evaluation df, classification report, and figures to the model's eval directory.

        Returns
        -------
        y_eval : pd.DataFrame
            Predicted behaviour classifications against the true labels.
        report_dict : dict
            Classification report of the predictions, as a dict.
        metrics_fig : mpl.Figure
            Figure showing the confusion matrix.
        pcutoffs_fig : mpl.Figure
            Figure showing the precision, recall, f1, and accuracy for different pcutoffs.
        logc_fig : mpl.Figure
            Figure showing the logistic curve for different predicted probabilities.
        """
        # Making eval df
        index = np.arange(x.shape[0]) if index is None else index
        y_eval = self.clf_predict(x=x, index=index, batch_size=self.configs.batch_size)
        # Including `actual` labels in `y_eval`
        y_eval[self.configs.behaviour_name, BehavColumns.ACTUAL.value] = y[index]
        # Getting individual columns
        y_prob = y_eval[self.configs.behaviour_name, BehavColumns.PROB.value]
        y_pred = y_eval[self.configs.behaviour_name, BehavColumns.PRED.value]
        y_true = y_eval[self.configs.behaviour_name, BehavColumns.ACTUAL.value]
        # Making classification report
        report_dict = self.eval_report(y_true, y_pred)
        # Making confusion matrix figure
        metrics_fig = self.eval_conf_matr(y_true, y_pred)
        # Making performance for different pcutoffs figure
        pcutoffs_fig = self.eval_metrics_pcutoffs(y_true, y_prob)
        # Logistic curve
        logc_fig = self.eval_logc(y_true, y_prob)
        # Saving data and figures
        DFIOMixin.write_feather(
            y_eval, os.path.join(self.eval_dir, f"{name}_eval.feather")
        )
        with open(os.path.join(self.eval_dir, f"{name}_report.json"), "w") as f:
            json.dump(report_dict, f)
        metrics_fig.savefig(os.path.join(self.eval_dir, f"{name}_confm.png"))
        pcutoffs_fig.savefig(os.path.join(self.eval_dir, f"{name}_pcutoffs.png"))
        logc_fig.savefig(os.path.join(self.eval_dir, f"{name}_logc.png"))
        # Print classification report
        print(json.dumps(report_dict, indent=4))
        # Return evaluations
        return y_eval, report_dict, metrics_fig, pcutoffs_fig, logc_fig

    def clf_eval_compare_all(self):
        """
        Trains and evaluates a classifier for each available template.

        Notes
        -----
        Takes a long time to run.
        """
        # Saving existing clf
        clf = self.clf
        # Preparing data
        x, y, ind_train, ind_test = self.prepare_data_training_pipeline()
        # # Adding noise (TODO: use with augmentation)
        # noise = 0.05
        # x_train += np.random.normal(0, noise, x_train.shape)
        # x_test += np.random.normal(0, noise, x_test.shape)
        # Getting eval for each classifier in ClfTemplates
        for clf_init_f in CLF_TEMPLATES:
            clf_name = clf_init_f.__name__
            # Making classifier
            self.clf = clf_init_f()
            # Training
            history = self.clf.fit(
                x=x,
                y=y,
                index=ind_train,
                batch_size=self.configs.batch_size,
                epochs=self.configs.epochs,
                val_split=self.configs.val_split,
            )
            # Saving history
            self.clf_eval_save_history(history, name=clf_name)
            # Evaluating on train and test data
            self.clf_eval(x, y, index=ind_train, name=f"{clf_name}_train")
            self.clf_eval(x, y, index=ind_test, name=f"{clf_name}_test")
        # Restoring clf
        self.clf = clf

    #################################################
    # EVALUATION METRICS FUNCTIONS
    #################################################

    @staticmethod
    def eval_report(y_true: pd.Series, y_pred: pd.Series) -> dict:
        """
        Returns the classification report (as a dict) of true vs predicted labels.
        """
        return classification_report(
            y_true,
            y_pred,
            target_names=GENERIC_BEHAV_LABELS,
            output_dict=True,
        )

    @staticmethod
    def eval_conf_matr(y_true: pd.Series, y_pred: pd.Series) -> plt.Figure:
        """
        Plots the confusion matrix of true vs predicted labels.
        """
        # Making confusion matrix
        fig, ax = plt.subplots(figsize=(7, 7))
        sns.heatmap(
            confusion_matrix(y_true, y_pred),
            annot=True,
            fmt="d",
            cmap="viridis",
            cbar=False,
            xticklabels=GENERIC_BEHAV_LABELS,
            yticklabels=GENERIC_BEHAV_LABELS,
            ax=ax,
        )
        ax.set_xlabel("Predicted")
        ax.set_ylabel("True")
        return fig

    @staticmethod
    def eval_metrics_pcutoffs(y_true: pd.Series, y_prob: pd.Series) -> plt.Figure:
        """
        Plots precision, recall, f1, and accuracy across pcutoff values.
        """
        # Getting precision, recall and accuracy for different cutoffs
        pcutoffs = np.linspace(0, 1, 101)
        # Measures
        precisions = np.zeros(pcutoffs.shape[0])
        recalls = np.zeros(pcutoffs.shape[0])
        f1 = np.zeros(pcutoffs.shape[0])
        accuracies = np.zeros(pcutoffs.shape[0])
        for i, pcutoff in enumerate(pcutoffs):
            y_pred = y_prob > pcutoff
            report = classification_report(
                y_true,
                y_pred,
                target_names=GENERIC_BEHAV_LABELS,
                output_dict=True,
            )
            precisions[i] = report[GENERIC_BEHAV_LABELS[1]]["precision"]
            recalls[i] = report[GENERIC_BEHAV_LABELS[1]]["recall"]
            f1[i] = report[GENERIC_BEHAV_LABELS[1]]["f1-score"]
            accuracies[i] = report["accuracy"]
        # Making figure
        fig, ax = plt.subplots(figsize=(10, 7))
        sns.lineplot(x=pcutoffs, y=precisions, label="precision", ax=ax)
        sns.lineplot(x=pcutoffs, y=recalls, label="recall", ax=ax)
        sns.lineplot(x=pcutoffs, y=f1, label="f1", ax=ax)
        sns.lineplot(x=pcutoffs, y=accuracies, label="accuracy", ax=ax)
        return fig

    @staticmethod
    def eval_logc(y_true: pd.Series, y_prob: pd.Series) -> plt.Figure:
        """
        Plots each predicted probability against its (jittered) true label.
        """
        y_eval = pd.DataFrame(
            {
                "y_true": y_true,
                "y_prob": y_prob,
                "y_pred": y_prob > 0.4,
                "y_true_jitter": y_true + (0.2 * (np.random.rand(len(y_prob)) - 0.5)),
            }
        )
        fig, ax = plt.subplots(figsize=(10, 7))
        sns.scatterplot(
            data=y_eval,
            x="y_prob",
            y="y_true_jitter",
            marker=".",
            s=10,
            linewidth=0,
            alpha=0.2,
            ax=ax,
        )
        # Adding a line of the proportion of predicted probabilities below each cutoff
        pcutoffs = np.linspace(0, 1, 101)
        ratios = np.vectorize(lambda i: np.mean(i > y_eval["y_prob"]))(pcutoffs)
        sns.lineplot(x=pcutoffs, y=ratios, ax=ax)
        # Returning figure
        return fig

    @staticmethod
    def eval_bouts(y_true: pd.Series, y_pred: pd.Series) -> pd.DataFrame:
        """
        Summarises, for each bout of consecutive identical true labels, the proportion
        of correct predictions, the bout's label, and its length.
        """
        y_eval = pd.DataFrame({"y_true": y_true, "y_pred": y_pred})
        y_eval["ids"] = np.cumsum(y_eval["y_true"] != y_eval["y_true"].shift())
        # Getting the proportion of correct predictions for each bout
        y_eval_grouped = y_eval.groupby("ids")
        y_eval_summary = pd.DataFrame(
            y_eval_grouped.apply(lambda x: (x["y_pred"] == x["y_true"]).mean()),
            columns=["proportion"],
        )
        y_eval_summary["actual_bout"] = y_eval_grouped.apply(
            lambda x: x["y_true"].mean()
        )
        y_eval_summary["bout_len"] = y_eval_grouped.apply(lambda x: x.shape[0])
        y_eval_summary = y_eval_summary.sort_values("proportion")
        # # Making figure
        # fig, ax = plt.subplots(figsize=(10, 7))
        # sns.scatterplot(
        #     data=y_eval_summary,
        #     x="proportion",
        #     y="bout_len",
        #     hue="actual_bout",
        #     alpha=0.4,
        #     marker=".",
        #     s=50,
        #     linewidth=0,
        #     ax=ax,
        # )
        return y_eval_summary

clf_fp: str property

Returns the model's classifier filepath

configs: BehavClassifierConfigs property

Returns the config model from the expected config file.

eval_dir: str property

Returns the model's evaluation directory

preproc_fp: str property

Returns the model's preprocessor filepath

root_dir: str property

Returns the model's root directory

__init__(configs_fp)

Make a BehavClassifier instance.

Parameters:

Name Type Description Default
configs_fp str

Filepath of the model's configs JSON.

required
Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
def __init__(self, configs_fp: str) -> None:
    """
    Make a BehavClassifier instance.

    Parameters
    ----------
    configs_fp :
        Filepath of the model's configs JSON. Read if it exists; otherwise
        a new default configs file is created at this path.
    """
    # Storing configs json fp
    self.configs_fp = configs_fp
    self.clf = None
    # Trying to read in configs json. Making a new one if it doesn't exist
    try:
        configs = BehavClassifierConfigs.read_json(self.configs_fp)
        logging.info("Reading existing model configs")
    except FileNotFoundError:
        configs = BehavClassifierConfigs()
        logging.info("Making new model configs")
    # Saving configs
    configs.write_json(self.configs_fp)

clf_eval(x, y, index=None, name='')

Evaluates the classifier performance on the given x and y data. Saves the evaluation df, classification report, and figures to the model's eval directory.

Returns:

Name Type Description
y_eval DataFrame

Predicted behaviour classifications against the true labels.

report_dict dict

Classification report of the predictions, as a dict.

metrics_fig Figure

Figure showing the confusion matrix.

pcutoffs_fig Figure

Figure showing the precision, recall, f1, and accuracy for different pcutoffs.

logc_fig Figure

Figure showing the logistic curve for different predicted probabilities.

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
def clf_eval(
    self,
    x: np.ndarray,
    y: np.ndarray,
    index: Optional[np.ndarray] = None,
    name: Optional[str] = "",
) -> tuple[pd.DataFrame, dict, plt.Figure, plt.Figure, plt.Figure]:
    """
    Evaluates the classifier performance on the given x and y data.
    Saves the evaluation df, classification report, and figures to the model's eval directory.

    Returns
    -------
    y_eval : pd.DataFrame
        Predicted behaviour classifications against the true labels.
    report_dict : dict
        Classification report of the predictions, as a dict.
    metrics_fig : mpl.Figure
        Figure showing the confusion matrix.
    pcutoffs_fig : mpl.Figure
        Figure showing the precision, recall, f1, and accuracy for different pcutoffs.
    logc_fig : mpl.Figure
        Figure showing the logistic curve for different predicted probabilities.
    """
    # Making eval df
    index = np.arange(x.shape[0]) if index is None else index
    y_eval = self.clf_predict(x=x, index=index, batch_size=self.configs.batch_size)
    # Including `actual` labels in `y_eval`
    y_eval[self.configs.behaviour_name, BehavColumns.ACTUAL.value] = y[index]
    # Getting individual columns
    y_prob = y_eval[self.configs.behaviour_name, BehavColumns.PROB.value]
    y_pred = y_eval[self.configs.behaviour_name, BehavColumns.PRED.value]
    y_true = y_eval[self.configs.behaviour_name, BehavColumns.ACTUAL.value]
    # Making classification report
    report_dict = self.eval_report(y_true, y_pred)
    # Making confusion matrix figure
    metrics_fig = self.eval_conf_matr(y_true, y_pred)
    # Making performance for different pcutoffs figure
    pcutoffs_fig = self.eval_metrics_pcutoffs(y_true, y_prob)
    # Logistic curve
    logc_fig = self.eval_logc(y_true, y_prob)
    # Saving data and figures
    DFIOMixin.write_feather(
        y_eval, os.path.join(self.eval_dir, f"{name}_eval.feather")
    )
    with open(os.path.join(self.eval_dir, f"{name}_report.json"), "w") as f:
        json.dump(report_dict, f)
    metrics_fig.savefig(os.path.join(self.eval_dir, f"{name}_confm.png"))
    pcutoffs_fig.savefig(os.path.join(self.eval_dir, f"{name}_pcutoffs.png"))
    logc_fig.savefig(os.path.join(self.eval_dir, f"{name}_logc.png"))
    # Print classification report
    print(json.dumps(report_dict, indent=4))
    # Return evaluations
    return y_eval, report_dict, metrics_fig, pcutoffs_fig, logc_fig
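
A sketch of evaluating on held-out data, assuming `model.clf` already holds a trained classifier (e.g. after `pipeline_build`):

```python
x, y, ind_train, ind_test = model.prepare_data_training_pipeline()
y_eval, report, confm_fig, pcutoffs_fig, logc_fig = model.clf_eval(
    x, y, index=ind_test, name="test"
)
print(report["accuracy"])  # sklearn classification_report dict key
```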

clf_eval_compare_all()

Trains and evaluates a classifier for each available template.

Notes

Takes a long time to run.

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
def clf_eval_compare_all(self):
    """
    Trains and evaluates a classifier for each available template.

    Notes
    -----
    Takes a long time to run.
    """
    # Saving existing clf
    clf = self.clf
    # Preparing data
    x, y, ind_train, ind_test = self.prepare_data_training_pipeline()
    # # Adding noise (TODO: use with augmentation)
    # noise = 0.05
    # x_train += np.random.normal(0, noise, x_train.shape)
    # x_test += np.random.normal(0, noise, x_test.shape)
    # Getting eval for each classifier in ClfTemplates
    for clf_init_f in CLF_TEMPLATES:
        clf_name = clf_init_f.__name__
        # Making classifier
        self.clf = clf_init_f()
        # Training
        history = self.clf.fit(
            x=x,
            y=y,
            index=ind_train,
            batch_size=self.configs.batch_size,
            epochs=self.configs.epochs,
            val_split=self.configs.val_split,
        )
        # Saving history
        self.clf_eval_save_history(history, name=clf_name)
        # Evaluating on train and test data
        self.clf_eval(x, y, index=ind_train, name=f"{clf_name}_train")
        self.clf_eval(x, y, index=ind_test, name=f"{clf_name}_test")
    # Restoring clf
    self.clf = clf

clf_load()

Loads the model stored in <root_dir>/<behav_name>/model.sav to the model attribute.

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
def clf_load(self):
    """
    Loads the model stored in `<root_dir>/<behav_name>/model.sav` to the model attribute.
    """
    self.clf = joblib.load(self.clf_fp)

clf_predict(x, batch_size, index=None)

Making predictions using the given model and preprocessed features. Assumes the x array is already preprocessed.

Parameters:

Name Type Description Default
x ndarray

Preprocessed features.

required

Returns:

Type Description
DataFrame

Predicted behaviour classifications. Dataframe columns are in the format:

behaviours :  behav    behav
outcomes   :  "prob"   "pred"

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
def clf_predict(
    self,
    x: np.ndarray,
    batch_size: int,
    index: Optional[np.ndarray] = None,
) -> pd.DataFrame:
    """
    Making predictions using the given model and preprocessed features.
    Assumes the x array is already preprocessed.

    Parameters
    ----------
    x : np.ndarray
        Preprocessed features.

    Returns
    -------
    pd.DataFrame
        Predicted behaviour classifications. Dataframe columns are in the format:
        ```
        behaviours :  behav    behav
        outcomes   :  "prob"   "pred"
        ```
    """
    # Getting probabilities
    index = np.arange(x.shape[0]) if index is None else index
    y_probs = self.clf.predict(
        x=x,
        index=index,
        batch_size=batch_size,
    )
    # Making predictions from probabilities (and pcutoff)
    y_preds = (y_probs > self.configs.pcutoff).astype(int)
    # Making df
    pred_df = BehavMixin.init_df(index)
    pred_df[(self.configs.behaviour_name, BehavColumns.PROB.value)] = y_probs
    pred_df[(self.configs.behaviour_name, BehavColumns.PRED.value)] = y_preds
    # Returning predicted behavs
    return pred_df
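
A sketch, assuming `x` has already been preprocessed with `preproc_x` and `model.clf` holds a trained classifier; the `"prob"`/`"pred"` column names follow the format shown above:

```python
pred_df = model.clf_predict(x, batch_size=model.configs.batch_size)
behav = model.configs.behaviour_name
probs = pred_df[(behav, "prob")]   # predicted probabilities
preds = pred_df[(behav, "pred")]   # 0/1 predictions at `pcutoff`
```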

clf_save()

Saves the model's classifier to <root_dir>/<behav_name>/model.sav.

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
def clf_save(self):
    """
    Saves the model's classifier to `<root_dir>/<behav_name>/model.sav`.
    """
    joblib.dump(self.clf, self.clf_fp)

combine_dfs()

Combines the data into a single X df, y df, and index. The indexes of x and y will be the same (with an inner join).

Returns:

Name Type Description
x DataFrame

Features dataframe of all experiments in the x directory

y DataFrame

Outcomes dataframe of all experiments in the y directory

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
def combine_dfs(self) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Combines the data into a single `X` df, `y` df, and index.
    The indexes of `x` and `y` will be the same (with an inner join).

    Returns
    -------
    x :
        Features dataframe of all experiments in the `x` directory
    y :
        Outcomes dataframe of all experiments in the `y` directory
    """
    # Getting the x and y dfs
    x = BehavClassifier.combine(os.path.join(self.root_dir, X_ID))
    y = BehavClassifier.combine(os.path.join(self.root_dir, Y_ID))
    # Getting the intersection of the x and y row indexes
    index = x.index.intersection(y.index)
    x = x.loc[index]
    y = y.loc[index]
    # Assert that x and y are the same length
    assert x.shape[0] == y.shape[0]
    # Returning the x and y dfs
    return x, y

create_from_model(root_dir, behaviour_name)

Creates a new BehavClassifier model in the given directory, copying this instance's configs (but with the given behaviour_name).

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
def create_from_model(self, root_dir: str, behaviour_name: str) -> BehavClassifier:
    """
    Creates a new BehavClassifier model in the given directory, copying this
    instance's configs (but with the given behaviour_name).
    """
    # Making new BehavClassifier instance (create_new_model expects the root directory)
    inst = self.create_new_model(root_dir, behaviour_name)
    # Using current instance's configs (but using given behaviour_name)
    configs = self.configs
    configs.behaviour_name = behaviour_name
    configs.write_json(inst.configs_fp)
    # Returning model
    return inst

create_from_project(proj) classmethod

Creates a BehavClassifier for each scored behaviour in the given Project instance.

Parameters:

Name Type Description Default
proj Project

The Project instance.

required

Returns:

Type Description
list[BehavClassifier]

The list of created BehavClassifier instances, one per behaviour.

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
@classmethod
def create_from_project(cls, proj: Project) -> list[BehavClassifier]:
    """
    Creates a BehavClassifier for each scored behaviour in the given Project instance.

    Parameters
    ----------
    proj :
        The Project instance.

    Returns
    -------
    :
        The list of created BehavClassifier instances, one per behaviour.
    """
    # Getting the list of behaviours
    y_df = cls.wrangle_columns_y(
        cls.combine(os.path.join(proj.root_dir, Folders.SCORED_BEHAVS.value))
    )
    # For each behaviour, making a new BehavClassifier instance
    behavs_ls = y_df.columns.to_list()
    models_dir = os.path.join(proj.root_dir, BEHAV_MODELS_SUBDIR)
    models_ls = [cls.create_new_model(models_dir, behav) for behav in behavs_ls]
    # Importing data from project to the behav models folder (only need one model for this)
    if len(models_ls) > 0:
        models_ls[0].import_data(
            os.path.join(proj.root_dir, Folders.FEATURES_EXTRACTED.value),
            os.path.join(proj.root_dir, Folders.SCORED_BEHAVS.value),
            False,
        )
    return models_ls

create_new_model(root_dir, behaviour_name) classmethod

Creating a new BehavClassifier model in the given directory

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
@classmethod
def create_new_model(cls, root_dir: str, behaviour_name: str) -> BehavClassifier:
    """
    Creating a new BehavClassifier model in the given directory
    """
    configs_fp = os.path.join(root_dir, f"{behaviour_name}.json")
    # Making new BehavClassifier instance
    inst = cls(configs_fp)
    # Updating configs with project data
    configs = inst.configs
    configs.behaviour_name = behaviour_name
    configs.write_json(inst.configs_fp)
    # Returning model
    return inst
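
For example (paths illustrative), this writes `<models_dir>/fight.json` and returns the new instance:

```python
model = BehavClassifier.create_new_model("/path/to/behav_models", "fight")
assert model.configs.behaviour_name == "fight"
```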

eval_bouts(y_true, y_pred) staticmethod

Summarises, for each bout of consecutive identical true labels, the proportion of correct predictions, the bout's label, and its length.

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
@staticmethod
def eval_bouts(y_true: pd.Series, y_pred: pd.Series) -> pd.DataFrame:
    """
    Summarises, for each bout of consecutive identical true labels, the proportion
    of correct predictions, the bout's label, and its length.
    """
    y_eval = pd.DataFrame({"y_true": y_true, "y_pred": y_pred})
    y_eval["ids"] = np.cumsum(y_eval["y_true"] != y_eval["y_true"].shift())
    # Getting the proportion of correct predictions for each bout
    y_eval_grouped = y_eval.groupby("ids")
    y_eval_summary = pd.DataFrame(
        y_eval_grouped.apply(lambda x: (x["y_pred"] == x["y_true"]).mean()),
        columns=["proportion"],
    )
    y_eval_summary["actual_bout"] = y_eval_grouped.apply(
        lambda x: x["y_true"].mean()
    )
    y_eval_summary["bout_len"] = y_eval_grouped.apply(lambda x: x.shape[0])
    y_eval_summary = y_eval_summary.sort_values("proportion")
    # # Making figure
    # fig, ax = plt.subplots(figsize=(10, 7))
    # sns.scatterplot(
    #     data=y_eval_summary,
    #     x="proportion",
    #     y="bout_len",
    #     hue="actual_bout",
    #     alpha=0.4,
    #     marker=".",
    #     s=50,
    #     linewidth=0,
    #     ax=ax,
    # )
    return y_eval_summary

eval_conf_matr(y_true, y_pred) staticmethod

Plots the confusion matrix of true vs predicted labels.

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
@staticmethod
def eval_conf_matr(y_true: pd.Series, y_pred: pd.Series) -> plt.Figure:
    """
    Plots the confusion matrix of true vs predicted labels.
    """
    # Making confusion matrix
    fig, ax = plt.subplots(figsize=(7, 7))
    sns.heatmap(
        confusion_matrix(y_true, y_pred),
        annot=True,
        fmt="d",
        cmap="viridis",
        cbar=False,
        xticklabels=GENERIC_BEHAV_LABELS,
        yticklabels=GENERIC_BEHAV_LABELS,
        ax=ax,
    )
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    return fig

eval_logc(y_true, y_prob) staticmethod

Plots each predicted probability against its (jittered) true label.

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
@staticmethod
def eval_logc(y_true: pd.Series, y_prob: pd.Series) -> plt.Figure:
    """
    Plots each predicted probability against its (jittered) true label.
    """
    y_eval = pd.DataFrame(
        {
            "y_true": y_true,
            "y_prob": y_prob,
            "y_pred": y_prob > 0.4,
            "y_true_jitter": y_true + (0.2 * (np.random.rand(len(y_prob)) - 0.5)),
        }
    )
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.scatterplot(
        data=y_eval,
        x="y_prob",
        y="y_true_jitter",
        marker=".",
        s=10,
        linewidth=0,
        alpha=0.2,
        ax=ax,
    )
    # Adding a line of the proportion of predicted probabilities below each cutoff
    pcutoffs = np.linspace(0, 1, 101)
    ratios = np.vectorize(lambda i: np.mean(i > y_eval["y_prob"]))(pcutoffs)
    sns.lineplot(x=pcutoffs, y=ratios, ax=ax)
    # Returning figure
    return fig

eval_metrics_pcutoffs(y_true, y_prob) staticmethod

Plots precision, recall, f1, and accuracy across pcutoff values.

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
@staticmethod
def eval_metrics_pcutoffs(y_true: pd.Series, y_prob: pd.Series) -> plt.Figure:
    """
    Plots precision, recall, f1, and accuracy across pcutoff values.
    """
    # Getting precision, recall and accuracy for different cutoffs
    pcutoffs = np.linspace(0, 1, 101)
    # Measures
    precisions = np.zeros(pcutoffs.shape[0])
    recalls = np.zeros(pcutoffs.shape[0])
    f1 = np.zeros(pcutoffs.shape[0])
    accuracies = np.zeros(pcutoffs.shape[0])
    for i, pcutoff in enumerate(pcutoffs):
        y_pred = y_prob > pcutoff
        report = classification_report(
            y_true,
            y_pred,
            target_names=GENERIC_BEHAV_LABELS,
            output_dict=True,
        )
        precisions[i] = report[GENERIC_BEHAV_LABELS[1]]["precision"]
        recalls[i] = report[GENERIC_BEHAV_LABELS[1]]["recall"]
        f1[i] = report[GENERIC_BEHAV_LABELS[1]]["f1-score"]
        accuracies[i] = report["accuracy"]
    # Making figure
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.lineplot(x=pcutoffs, y=precisions, label="precision", ax=ax)
    sns.lineplot(x=pcutoffs, y=recalls, label="recall", ax=ax)
    sns.lineplot(x=pcutoffs, y=f1, label="f1", ax=ax)
    sns.lineplot(x=pcutoffs, y=accuracies, label="accuracy", ax=ax)
    return fig

eval_report(y_true, y_pred) staticmethod

Returns the classification report (as a dict) of true vs predicted labels.

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
@staticmethod
def eval_report(y_true: pd.Series, y_pred: pd.Series) -> dict:
    """
    Returns the classification report (as a dict) of true vs predicted labels.
    """
    return classification_report(
        y_true,
        y_pred,
        target_names=GENERIC_BEHAV_LABELS,
        output_dict=True,
    )

import_data(x_dir, y_dir, overwrite=False)

Importing data from extracted features and labelled behaviours dataframes.

Parameters:

Name Type Description Default
x_dir str

Directory containing the extracted-features dataframes.

required
y_dir str

Directory containing the labelled-behaviours dataframes.

required
Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
def import_data(self, x_dir: str, y_dir: str, overwrite=False) -> None:
    """
    Importing data from extracted features and labelled behaviours dataframes.

    Parameters
    ----------
    x_dir :
        Directory containing the extracted-features dataframes.
    y_dir :
        Directory containing the labelled-behaviours dataframes.
    """
    # For each x and y directory
    for in_dir, data_id in ((x_dir, X_ID), (y_dir, Y_ID)):
        out_dir = os.path.join(self.root_dir, data_id)
        os.makedirs(out_dir, exist_ok=True)
        # Copying each file to model root directory
        for fp in os.listdir(in_dir):
            in_fp = os.path.join(in_dir, fp)
            out_fp = os.path.join(out_dir, fp)
            # If not overwriting and out file already exists, then skip
            if not overwrite and os.path.exists(out_fp):
                continue
            # Copying file
            shutil.copyfile(in_fp, out_fp)

load(configs_fp) classmethod

Reads the model from the expected model file.

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
@classmethod
def load(cls, configs_fp: str) -> BehavClassifier:
    """
    Reads the model from the expected model file.
    """
    if not os.path.isfile(configs_fp):
        raise FileNotFoundError(f"The model file does not exist: {configs_fp}")
    return cls(configs_fp)
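
For example (path illustrative); a `FileNotFoundError` is raised if the configs JSON is missing:

```python
model = BehavClassifier.load("/path/to/behav_models/fight.json")
```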

pipeline_build(clf_init_f)

Makes a classifier and saves it to the model's root directory.

Callable is a method from ClfTemplates.

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
def pipeline_build(self, clf_init_f: Callable) -> None:
    """
    Makes a classifier and saves it to the model's root directory.

    Callable is a method from `ClfTemplates`.
    """
    # Preparing data
    x, y, ind_train, ind_test = self.prepare_data_training_pipeline()
    # Initialising the model
    self.clf = clf_init_f()
    # Training the model
    history = self.clf.fit(
        x=x,
        y=y,
        index=ind_train,
        batch_size=self.configs.batch_size,
        epochs=self.configs.epochs,
        val_split=self.configs.val_split,
    )
    # Saving history
    self.clf_eval_save_history(history)
    # Evaluating the model
    self.clf_eval(x, y, ind_test)
    # Updating the model configs
    configs = self.configs
    configs.clf_structure = clf_init_f.__name__
    configs.write_json(self.configs_fp)
    # Saving the model to disk
    self.clf_save()

pipeline_run(x)

Given the unprocessed features dataframe, runs the model pipeline to make predictions.

Pipeline is:

- Preprocess x df. Refer to behavysis_pipeline.behav_classifier.BehavClassifier.preproc_x for details.
- Makes predictions and returns the predicted behaviours.

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
def pipeline_run(self, x: pd.DataFrame) -> pd.DataFrame:
    """
    Given the unprocessed features dataframe, runs the model pipeline to make predictions.

    Pipeline is:
    - Preprocess `x` df. Refer to
    [behavysis_pipeline.behav_classifier.BehavClassifier.preproc_x][] for details.
    - Makes predictions and returns the predicted behaviours.
    """
    # Saving index for later
    index = x.index
    # Preprocessing features
    x = self.prepare_data(x)
    # Loading the model
    self.clf_load()
    # Making predictions
    y_eval = self.clf_predict(x, self.configs.batch_size)
    # Settings the index
    y_eval.index = index
    # Returning predictions
    return y_eval
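
A sketch of inference on a novel features dataframe (the feather path is illustrative; `model.clf` must have been saved by `pipeline_build` beforehand):

```python
import pandas as pd

x = pd.read_feather("/path/to/features_extracted/experiment1.feather")
pred_df = model.pipeline_run(x)  # (behav, "prob") and (behav, "pred") columns
```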

prepare_data(x)

Prepares novel (x only) data, given the x pd.DataFrame.

Performs the following:

- Preprocesses x df. Refer to preprocess_x for details.
- Makes the X windowed array, for each index.

Returns:

Name Type Description
x ndarray

Features array in the format: (samples, window, features)

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
def prepare_data(self, x: pd.DataFrame) -> np.ndarray:
    """
    Prepares novel (`x` only) data, given the `x` pd.DataFrame.

    Performs the following:
    - Preprocesses x df. Refer to `preprocess_x` for details.
    - Makes the X windowed array, for each index.

    Returns
    -------
    x : np.ndarray
        Features array in the format: `(samples, window, features)`
    """
    # Preprocessing x df
    x = self.preproc_x(x, self.preproc_fp)
    # Returning x
    return x

prepare_data_training()

Prepares the data (x and y) in the model for training. Data is taken from the model's x and y dirs.

Performs the following:

- Combining dfs from x and y directories (individual experiment data)
- Ensures the x and y dfs have the same index, and are in the same row order
- Preprocesses x df. Refer to preprocess_x for details.
- Selects the y class (given in the configs file) from the y df.
- Preprocesses y df. Refer to preprocess_y for details.

Returns:

Name Type Description
x ndarray

Features array in the format: (samples, window, features)

y ndarray

Outcomes array in the format: (samples, class)

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
def prepare_data_training(self) -> tuple[np.ndarray, np.ndarray]:
    """
    Prepares the data (`x` and `y`) in the model for training.
    Data is taken from the model's `x` and `y` dirs.

    Performs the following:
    - Combining dfs from x and y directories (individual experiment data)
    - Ensures the x and y dfs have the same index, and are in the same row order
    - Preprocesses x df. Refer to `preprocess_x` for details.
    - Selects the y class (given in the configs file) from the y df.
    - Preprocesses y df. Refer to `preprocess_y` for details.

    Returns
    -------
    x : np.ndarray
        Features array in the format: `(samples, window, features)`
    y : np.ndarray
        Outcomes array in the format: `(samples, class)`
    """
    # Combining dfs from x and y directories (individual experiment data)
    x, y = self.combine_dfs()
    # Fitting the preprocessor pipeline
    self.preproc_x_fit(x, self.preproc_fp)
    # Preprocessing x df
    x = self.preproc_x(x, self.preproc_fp)
    # Preprocessing y df
    y = self.wrangle_columns_y(y)[self.configs.behaviour_name].values
    y = self.preproc_y(y)
    # Returning x, y, and index to use
    return x, y

prepare_data_training_pipeline()

Prepares the data for the training pipeline.

Performs the following:

- Preprocesses x and y data. Refer to prepare_data_training for details.
- Splits into training and test indexes.
    - The training indexes are undersampled to the ratio given in the configs.

Returns:

A tuple containing four numpy arrays:

- x: The input data.
- y: The target labels.
- ind_train: The indexes for the training data.
- ind_test: The indexes for the testing data.

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
def prepare_data_training_pipeline(
    self,
) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    Prepares the data for the training pipeline.

    Performs the following:
    - Preprocesses `x` and `y` data. Refer to `prepare_data_training` for details.
    - Splits into training and test indexes.
        - The training indexes are undersampled to the ratio given in the configs.

    Returns:
        A tuple containing four numpy arrays:
        - x: The input data.
        - y: The target labels.
        - ind_train: The indexes for the training data.
        - ind_test: The indexes for the testing data.
    """
    # Preparing data
    x, y = self.prepare_data_training()
    # Getting entire index
    index = np.arange(x.shape[0])
    # Splitting into train and test indexes
    ind_train, ind_test = train_test_split(
        index,
        test_size=self.configs.test_split,
        stratify=y[index],
    )
    # Undersampling training index
    ind_train = self.undersample(
        ind_train, y[ind_train], self.configs.undersample_ratio
    )
    # Return
    return x, y, ind_train, ind_test
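
A sketch of the resulting split; the full `x`/`y` arrays are returned alongside disjoint train/test index arrays (train additionally undersampled):

```python
import numpy as np

x, y, ind_train, ind_test = model.prepare_data_training_pipeline()
assert x.shape[0] == y.shape[0]
assert np.intersect1d(ind_train, ind_test).size == 0  # disjoint splits
```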

preproc_x(x, preproc_fp) staticmethod

The preprocessing steps are:

- MinMax scaling (using previously fitted MinMaxScaler)

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
@staticmethod
def preproc_x(x: np.ndarray, preproc_fp: str) -> np.ndarray:
    """
    The preprocessing steps are:
    - MinMax scaling (using previously fitted MinMaxScaler)
    """
    # Loading in pipeline
    preproc_pipe = joblib.load(preproc_fp)
    # Uses trained fit for preprocessing new data
    x = preproc_pipe.transform(x)
    # Returning df
    return x
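
A minimal sketch of the fit-then-transform flow on toy arrays (the save path is illustrative):

```python
import numpy as np

x_train = np.random.rand(100, 8)
x_new = np.random.rand(20, 8)
# Fit the MinMax pipeline on training features and persist it
BehavClassifier.preproc_x_fit(x_train, "/tmp/preproc.sav")
# Reuse the fitted pipeline to scale new data consistently
x_new_scaled = BehavClassifier.preproc_x(x_new, "/tmp/preproc.sav")
```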

preproc_x_fit(x, preproc_fp) staticmethod

Fits the preprocessing pipeline (MinMax scaling) on x and saves it to preproc_fp.

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
@staticmethod
def preproc_x_fit(x: np.ndarray, preproc_fp: str) -> None:
    """
    Fits the preprocessing pipeline (MinMax scaling) on `x` and saves it to `preproc_fp`.
    """
    # Making pipeline
    preproc_pipe = Pipeline(steps=[("MinMaxScaler", MinMaxScaler())])
    # Fitting pipeline
    preproc_pipe.fit(x)
    # Saving pipeline
    joblib.dump(preproc_pipe, preproc_fp)

preproc_y(y) staticmethod

The preprocessing steps are:

- Imputing NaN values with 0
- Setting -1 (i.e. "undecided") to 0

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
@staticmethod
def preproc_y(y: np.ndarray) -> np.ndarray:
    """
    The preprocessing steps are:
    - Imputing NaN values with 0
    - Setting -1 (i.e. "undecided") to 0
    """
    # Imputing NaN values with 0
    y = np.nan_to_num(y, nan=0)
    # Setting -1 to 0 (i.e. "undecided" to "no behaviour")
    y = np.maximum(y, 0)
    # Returning arr
    return y
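
A tiny worked example of both steps (NaN imputed to 0, then -1 clipped to 0):

```python
import numpy as np

y = np.array([1.0, np.nan, -1.0, 0.0])
print(BehavClassifier.preproc_y(y))  # [1. 0. 0. 0.]
```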

wrangle_columns_y(y) staticmethod

Converts the y dataframe's MultiIndex (behav, outcome) columns to single-level columns.

Parameters:

Name Type Description Default
y DataFrame

Behaviour-outcomes dataframe with (behav, outcome) MultiIndex columns.

required

Returns:

Type Description
DataFrame

Dataframe with single-level columns: each actual column becomes {behav}, and every other outcome column becomes {behav}__{outcome}.

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
@staticmethod
def wrangle_columns_y(y: pd.DataFrame) -> pd.DataFrame:
    """
    Converts the y dataframe's MultiIndex `(behav, outcome)` columns to
    single-level columns.

    Parameters
    ----------
    y :
        Behaviour-outcomes dataframe with `(behav, outcome)` MultiIndex columns.

    Returns
    -------
    :
        Dataframe with single-level columns: each `actual` column becomes
        `{behav}`, and every other outcome column becomes `{behav}__{outcome}`.
    """
    # Filtering out the prob and pred columns (in the `outcomes` level)
    cols_filter = np.isin(
        y.columns.get_level_values(BehavCN.OUTCOMES.value),
        [BehavColumns.PROB.value, BehavColumns.PRED.value],
        invert=True,
    )
    y = y.loc[:, cols_filter]
    # Converting MultiIndex columns to single columns by
    # setting the column names from `(behav, outcome)` to `{behav}__{outcome}`
    y.columns = [
        f"{i[0]}" if i[1] == BehavColumns.ACTUAL.value else f"{i[0]}__{i[1]}"
        for i in y.columns
    ]
    return y
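
A toy sketch of the column wrangling; the level name `"outcomes"` and the outcome values `"prob"`, `"pred"`, `"actual"` are assumptions about the `BehavCN`/`BehavColumns` enum values used above:

```python
import pandas as pd

y = pd.DataFrame(
    [[0.9, 1, 1, 0]],
    columns=pd.MultiIndex.from_tuples(
        [("fight", "prob"), ("fight", "pred"), ("fight", "actual"), ("fight", "attack")],
        names=["behaviours", "outcomes"],  # assumed level names
    ),
)
# "prob"/"pred" are dropped, "actual" collapses to "fight",
# and other outcomes become "{behav}__{outcome}"
print(BehavClassifier.wrangle_columns_y(y).columns.to_list())
# ['fight', 'fight__attack']
```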

behavysis_pipeline.behav_classifier.behav_classifier.BehavClassifierConfigs

Bases: PydanticBaseModel

Configs for a BehavClassifier model.

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
class BehavClassifierConfigs(PydanticBaseModel):
    """_summary_"""

    model_config = ConfigDict(extra="forbid")

    behaviour_name: str = "BehaviourName"
    seed: int = 42
    undersample_ratio: float = 0.1

    clf_structure: str = "clf"  # Classifier type (defined in ClfTemplates)
    pcutoff: float = 0.5
    test_split: float = 0.2
    val_split: float = 0.2
    batch_size: int = 256
    epochs: int = 50
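
A sketch of overriding defaults and persisting them (the path is illustrative; `write_json`/`read_json` come from the `PydanticBaseModel` mixin used above):

```python
configs = BehavClassifierConfigs(behaviour_name="fight", epochs=100)
configs.write_json("/path/to/behav_models/fight.json")
print(BehavClassifierConfigs.read_json("/path/to/behav_models/fight.json").epochs)  # 100
```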