
Behav classifier

behavysis_pipeline.behav_classifier.behav_classifier.BehavClassifier

BehavClassifier abstract class performs behaviour classifier model preparation, training, saving, evaluation, and inference.

Attributes:

Name Type Description
configs_fp str

Filepath of the model's configs JSON.

clf BaseTorchModel

The classifier model instance (None until built or loaded).
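
A minimal usage sketch, assuming a behavysis `Project` whose feature-extraction and behaviour-scoring steps have already run (the `Project` import path and the filesystem path are illustrative assumptions):

```python
from behavysis_pipeline.pipeline import Project  # assumed import path
from behavysis_pipeline.behav_classifier.behav_classifier import BehavClassifier

proj = Project("/path/to/project")
# One BehavClassifier is created per scored behaviour; the project's
# features and scored-behaviour dataframes are imported once.
models = BehavClassifier.create_from_project(proj)
for model in models:
    print(model.configs.behaviour_name)
```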

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
class BehavClassifier:
    """
    BehavClassifier abstract class performs behaviour classifier model preparation,
    training, saving, evaluation, and inference.

    Attributes
    ----------
    configs_fp
        Filepath of the model's configs JSON.
    clf
        The classifier model instance (None until built or loaded).
    """

    configs_fp: str
    clf: BaseTorchModel

    def __init__(self, configs_fp: str) -> None:
        """
        Make a BehavClassifier instance.

        Parameters
        ----------
        configs_fp :
            Filepath of the model's configs JSON. Read if it exists; otherwise
            a new default configs file is created at this path.
        """
        # Storing configs json fp
        self.configs_fp = configs_fp
        self.clf = None
        # Trying to read in configs json. Making a new one if it doesn't exist
        try:
            configs = BehavClassifierConfigs.read_json(self.configs_fp)
            logging.info("Reading existing model configs")
        except FileNotFoundError:
            configs = BehavClassifierConfigs()
            logging.info("Making new model configs")
        # Saving configs
        configs.write_json(self.configs_fp)

    #################################################
    # CREATE MODEL METHODS
    #################################################

    @classmethod
    def create_from_project(cls, proj: Project) -> list[BehavClassifier]:
        """
        Creates a BehavClassifier for each scored behaviour in the given Project instance.

        Parameters
        ----------
        proj :
            The Project instance.

        Returns
        -------
        :
            The list of created BehavClassifier instances, one per behaviour.
        """
        # Getting the list of behaviours
        y_df = cls.wrangle_columns_y(
            cls.combine(os.path.join(proj.root_dir, Folders.SCORED_BEHAVS.value))
        )
        # For each behaviour, making a new BehavClassifier instance
        behavs_ls = y_df.columns.to_list()
        models_dir = os.path.join(proj.root_dir, BEHAV_MODELS_SUBDIR)
        models_ls = [cls.create_new_model(models_dir, behav) for behav in behavs_ls]
        # Importing data from project to the behav models folder (only need one model for this)
        if len(models_ls) > 0:
            models_ls[0].import_data(
                os.path.join(proj.root_dir, Folders.FEATURES_EXTRACTED.value),
                os.path.join(proj.root_dir, Folders.SCORED_BEHAVS.value),
                False,
            )
        return models_ls

    @classmethod
    def create_new_model(cls, root_dir: str, behaviour_name: str) -> BehavClassifier:
        """
        Creating a new BehavClassifier model in the given directory
        """
        configs_fp = os.path.join(root_dir, f"{behaviour_name}.json")
        # Making new BehavClassifier instance
        inst = cls(configs_fp)
        # Updating configs with project data
        configs = inst.configs
        configs.behaviour_name = behaviour_name
        configs.write_json(inst.configs_fp)
        # Returning model
        return inst

    def create_from_model(self, root_dir: str, behaviour_name: str) -> BehavClassifier:
        """
        Creates a new BehavClassifier model in the given directory, copying this
        instance's configs (but with the given behaviour_name).
        """
        # Making new BehavClassifier instance (create_new_model expects the root directory)
        inst = self.create_new_model(root_dir, behaviour_name)
        # Using current instance's configs (but using given behaviour_name)
        configs = self.configs
        configs.behaviour_name = behaviour_name
        configs.write_json(inst.configs_fp)
        # Returning model
        return inst

    #################################################
    #            READING MODEL
    #################################################

    @classmethod
    def load(cls, configs_fp: str) -> BehavClassifier:
        """
        Reads the model from the expected model file.
        """
        if not os.path.isfile(configs_fp):
            raise FileNotFoundError(f"The model file does not exist: {configs_fp}")
        return cls(configs_fp)

    #################################################
    #            GETTER AND SETTERS
    #################################################

    @property
    def configs(self) -> BehavClassifierConfigs:
        """Returns the config model from the expected config file."""
        return BehavClassifierConfigs.read_json(self.configs_fp)

    @property
    def root_dir(self) -> str:
        """Returns the model's root directory"""
        return os.path.dirname(self.configs_fp)

    @property
    def clf_fp(self) -> str:
        """Returns the model's filepath"""
        path = os.path.join(self.root_dir, self.configs.behaviour_name, "model.sav")
        os.makedirs(os.path.dirname(path), exist_ok=True)
        return path

    @property
    def preproc_fp(self) -> str:
        """Returns the model's preprocessor filepath"""
        path = os.path.join(self.root_dir, self.configs.behaviour_name, "preproc.sav")
        os.makedirs(os.path.dirname(path), exist_ok=True)
        return path

    @property
    def eval_dir(self) -> str:
        """Returns the model's evaluation directory"""
        path = os.path.join(self.root_dir, self.configs.behaviour_name, "eval")
        os.makedirs(path, exist_ok=True)
        return path

    #################################################
    #            IMPORTING DATA TO MODEL
    #################################################

    def import_data(self, x_dir: str, y_dir: str, overwrite=False) -> None:
        """
        Importing data from extracted features and labelled behaviours dataframes.

        Parameters
        ----------
        x_dir :
            Directory containing the extracted-features dataframes.
        y_dir :
            Directory containing the labelled-behaviours dataframes.
        """
        # For each x and y directory
        for in_dir, data_id in ((x_dir, X_ID), (y_dir, Y_ID)):
            out_dir = os.path.join(self.root_dir, data_id)
            os.makedirs(out_dir, exist_ok=True)
            # Copying each file to model root directory
            for fp in os.listdir(in_dir):
                in_fp = os.path.join(in_dir, fp)
                out_fp = os.path.join(out_dir, fp)
                # If not overwriting and out file already exists, then skip
                if not overwrite and os.path.exists(out_fp):
                    continue
                # Copying file
                shutil.copyfile(in_fp, out_fp)

    #################################################
    #            COMBINING DFS TO SINGLE DF
    #################################################

    @staticmethod
    def combine(src_dir):
        """
        Concatenates all dataframes in `src_dir` into a single dataframe,
        keyed by each file's name (without its extension).
        """
        data_dict = {
            os.path.splitext(i)[0]: pd.read_feather(os.path.join(src_dir, i))
            for i in os.listdir(src_dir)
        }
        return pd.concat(data_dict.values(), axis=0, keys=data_dict.keys())

    def combine_dfs(self) -> tuple[pd.DataFrame, pd.DataFrame]:
        """
        Combines the data into a single `X` df, `y` df, and index.
        The indexes of `x` and `y` will be the same (with an inner join).

        Returns
        -------
        x :
            Features dataframe of all experiments in the `x` directory
        y :
            Outcomes dataframe of all experiments in the `y` directory
        """
        # Getting the x and y dfs
        x = BehavClassifier.combine(os.path.join(self.root_dir, X_ID))
        y = BehavClassifier.combine(os.path.join(self.root_dir, Y_ID))
        # Getting the intersection of the x and y row indexes
        index = x.index.intersection(y.index)
        x = x.loc[index]
        y = y.loc[index]
        # Assert that x and y are the same length
        assert x.shape[0] == y.shape[0]
        # Returning the x and y dfs
        return x, y

    #################################################
    #            PREPROCESSING DFS
    #################################################

    @staticmethod
    def preproc_x_fit(x: np.ndarray, preproc_fp: str) -> None:
        """
        Fits the preprocessing pipeline (MinMax scaling) on `x` and saves it to `preproc_fp`.
        """
        # Making pipeline
        preproc_pipe = Pipeline(steps=[("MinMaxScaler", MinMaxScaler())])
        # Fitting pipeline
        preproc_pipe.fit(x)
        # Saving pipeline
        joblib.dump(preproc_pipe, preproc_fp)

    @staticmethod
    def preproc_x(x: np.ndarray, preproc_fp: str) -> np.ndarray:
        """
        The preprocessing steps are:
        - MinMax scaling (using previously fitted MinMaxScaler)
        """
        # Loading in pipeline
        preproc_pipe = joblib.load(preproc_fp)
        # Uses trained fit for preprocessing new data
        x = preproc_pipe.transform(x)
        # Returning df
        return x

    @staticmethod
    def wrangle_columns_y(y: pd.DataFrame) -> pd.DataFrame:
        """
        Converts the y dataframe's MultiIndex `(behav, outcome)` columns to
        single-level columns.

        Parameters
        ----------
        y :
            Behaviour-outcomes dataframe with `(behav, outcome)` MultiIndex columns.

        Returns
        -------
        :
            Dataframe with single-level columns: each `actual` column becomes
            `{behav}`, and every other outcome column becomes `{behav}__{outcome}`.
        """
        # Filtering out the prob and pred columns (in the `outcomes` level)
        cols_filter = np.isin(
            y.columns.get_level_values(BehavCN.OUTCOMES.value),
            [BehavColumns.PROB.value, BehavColumns.PRED.value],
            invert=True,
        )
        y = y.loc[:, cols_filter]
        # Converting MultiIndex columns to single columns by
        # setting the column names from `(behav, outcome)` to `{behav}__{outcome}`
        y.columns = [
            f"{i[0]}" if i[1] == BehavColumns.ACTUAL.value else f"{i[0]}__{i[1]}"
            for i in y.columns
        ]
        return y

    @staticmethod
    def preproc_y(y: np.ndarray) -> np.ndarray:
        """
        The preprocessing steps are:
        - Imputing NaN values with 0
        - Setting -1 (i.e. "undecided") to 0
        """
        # Imputing NaN values with 0
        y = np.nan_to_num(y, nan=0)
        # Setting -1 to 0 (i.e. "undecided" to "no behaviour")
        y = np.maximum(y, 0)
        # Returning arr
        return y

    @staticmethod
    def undersample(index: np.ndarray, y: np.ndarray, ratio: float) -> np.ndarray:
        """
        Undersamples the negative (`y == 0`) indices so the positive-to-negative
        ratio becomes approximately `ratio`, returning the combined sorted index.
        """
        # Assert that index and y are the same length
        assert index.shape[0] == y.shape[0]
        # Getting array of True indices
        t = index[y == 1]
        # Getting array of False indices
        f = index[y == 0]
        # Undersampling the False indices
        f = np.random.choice(f, size=int(t.shape[0] / ratio), replace=False)
        # Combining the True and False indices
        uindex = np.union1d(t, f)
        # Returning the undersampled index
        return uindex

    #################################################
    #            PIPELINE FOR DATA PREP
    #################################################

    def prepare_data_training(self) -> tuple[np.ndarray, np.ndarray]:
        """
        Prepares the data (`x` and `y`) in the model for training.
        Data is taken from the model's `x` and `y` dirs.

        Performs the following:
        - Combining dfs from x and y directories (individual experiment data)
        - Ensures the x and y dfs have the same index, and are in the same row order
        - Preprocesses x df. Refer to `preprocess_x` for details.
        - Selects the y class (given in the configs file) from the y df.
        - Preprocesses y df. Refer to `preprocess_y` for details.

        Returns
        -------
        x : np.ndarray
            Features array in the format: `(samples, window, features)`
        y : np.ndarray
            Outcomes array in the format: `(samples, class)`
        """
        # Combining dfs from x and y directories (individual experiment data)
        x, y = self.combine_dfs()
        # Fitting the preprocessor pipeline
        self.preproc_x_fit(x, self.preproc_fp)
        # Preprocessing x df
        x = self.preproc_x(x, self.preproc_fp)
        # Preprocessing y df
        y = self.wrangle_columns_y(y)[self.configs.behaviour_name].values
        y = self.preproc_y(y)
        # Returning x, y, and index to use
        return x, y

    def prepare_data_training_pipeline(
        self,
    ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        """
        Prepares the data for the training pipeline.

        Performs the following:
        - Preprocesses `x` and `y` data. Refer to `prepare_data_training` for details.
        - Splits into training and test indexes.
            - The training indexes are undersampled to the ratio given in the configs.

        Returns:
            A tuple containing four numpy arrays:
            - x: The input data.
            - y: The target labels.
            - ind_train: The indexes for the training data.
            - ind_test: The indexes for the testing data.
        """
        # Preparing data
        x, y = self.prepare_data_training()
        # Getting entire index
        index = np.arange(x.shape[0])
        # Splitting into train and test indexes
        ind_train, ind_test = train_test_split(
            index,
            test_size=self.configs.test_split,
            stratify=y[index],
        )
        # Undersampling training index
        ind_train = self.undersample(
            ind_train, y[ind_train], self.configs.undersample_ratio
        )
        # Return
        return x, y, ind_train, ind_test

    def prepare_data(self, x: pd.DataFrame) -> np.ndarray:
        """
        Prepares novel (`x` only) data, given the `x` pd.DataFrame.

        Performs the following:
        - Preprocesses x df. Refer to `preprocess_x` for details.
        - Makes the X windowed array, for each index.

        Returns
        -------
        x : np.ndarray
            Features array in the format: `(samples, window, features)`
        """
        # Preprocessing x df
        x = self.preproc_x(x, self.preproc_fp)
        # Returning x
        return x

    #################################################
    # PIPELINE FOR CLASSIFIER TRAINING AND INFERENCE
    #################################################

    def pipeline_build(self, clf_init_f: Callable) -> None:
        """
        Makes a classifier and saves it to the model's root directory.

        Callable is a method from `ClfTemplates`.
        """
        # Preparing data
        x, y, ind_train, ind_test = self.prepare_data_training_pipeline()
        # Initialising the model
        self.clf = clf_init_f()
        # Training the model
        history = self.clf.fit(
            x=x,
            y=y,
            index=ind_train,
            batch_size=self.configs.batch_size,
            epochs=self.configs.epochs,
            val_split=self.configs.val_split,
        )
        # Saving history
        self.clf_eval_save_history(history)
        # Evaluating the model
        self.clf_eval(x, y, ind_test)
        # Updating the model configs
        configs = self.configs
        configs.clf_structure = clf_init_f.__name__
        configs.write_json(self.configs_fp)
        # Saving the model to disk
        self.clf_save()

    def pipeline_run(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        Given the unprocessed features dataframe, runs the model pipeline to make predictions.

        Pipeline is:
        - Preprocess `x` df. Refer to
        [behavysis_pipeline.behav_classifier.BehavClassifier.preproc_x][] for details.
        - Makes predictions and returns the predicted behaviours.
        """
        # Saving index for later
        index = x.index
        # Preprocessing features
        x = self.prepare_data(x)
        # Loading the model
        self.clf_load()
        # Making predictions
        y_eval = self.clf_predict(x, self.configs.batch_size)
        # Settings the index
        y_eval.index = index
        # Returning predictions
        return y_eval

    #################################################
    # MODEL CLASSIFIER METHODS
    #################################################

    def clf_load(self):
        """
        Loads the model stored in `<root_dir>/<behav_name>/model.sav` to the model attribute.
        """
        self.clf = joblib.load(self.clf_fp)

    def clf_save(self):
        """
        Saves the model's classifier to `<root_dir>/<behav_name>/model.sav`.
        """
        joblib.dump(self.clf, self.clf_fp)

    def clf_predict(
        self,
        x: np.ndarray,
        batch_size: int,
        index: Optional[np.ndarray] = None,
    ) -> pd.DataFrame:
        """
        Making predictions using the given model and preprocessed features.
        Assumes the x array is already preprocessed.

        Parameters
        ----------
        x : np.ndarray
            Preprocessed features.

        Returns
        -------
        pd.DataFrame
            Predicted behaviour classifications. Dataframe columns are in the format:
            ```
            behaviours :  behav    behav
            outcomes   :  "prob"   "pred"
            ```
        """
        # Getting probabilities
        index = np.arange(x.shape[0]) if index is None else index
        y_probs = self.clf.predict(
            x=x,
            index=index,
            batch_size=batch_size,
        )
        # Making predictions from probabilities (and pcutoff)
        y_preds = (y_probs > self.configs.pcutoff).astype(int)
        # Making df
        pred_df = BehavMixin.init_df(index)
        pred_df[(self.configs.behaviour_name, BehavColumns.PROB.value)] = y_probs
        pred_df[(self.configs.behaviour_name, BehavColumns.PRED.value)] = y_preds
        # Returning predicted behavs
        return pred_df

    #################################################
    # COMPREHENSIVE EVALUATION FUNCTIONS
    #################################################

    def clf_eval_save_history(self, history: pd.DataFrame, name: Optional[str] = ""):
        # Saving history df
        DFIOMixin.write_feather(
            history, os.path.join(self.eval_dir, f"{name}_history.feather")
        )
        # Making and saving history figure
        fig, ax = plt.subplots(figsize=(10, 7))
        sns.lineplot(data=history, ax=ax)
        fig.savefig(os.path.join(self.eval_dir, f"{name}_history.png"))

    def clf_eval(
        self,
        x: np.ndarray,
        y: np.ndarray,
        index: Optional[np.ndarray] = None,
        name: Optional[str] = "",
    ) -> tuple[pd.DataFrame, dict, plt.Figure, plt.Figure, plt.Figure]:
        """
        Evaluates the classifier performance on the given x and y data.
        Saves the evaluation df, classification report, and figures to the model's eval directory.

        Returns
        -------
        y_eval : pd.DataFrame
            Predicted behaviour classifications against the true labels.
        report_dict : dict
            Classification report of the predictions, as a dict.
        metrics_fig : mpl.Figure
            Figure showing the confusion matrix.
        pcutoffs_fig : mpl.Figure
            Figure showing the precision, recall, f1, and accuracy for different pcutoffs.
        logc_fig : mpl.Figure
            Figure showing the logistic curve for different predicted probabilities.
        """
        # Making eval df
        index = np.arange(x.shape[0]) if index is None else index
        y_eval = self.clf_predict(x=x, index=index, batch_size=self.configs.batch_size)
        # Including `actual` labels in `y_eval`
        y_eval[self.configs.behaviour_name, BehavColumns.ACTUAL.value] = y[index]
        # Getting individual columns
        y_prob = y_eval[self.configs.behaviour_name, BehavColumns.PROB.value]
        y_pred = y_eval[self.configs.behaviour_name, BehavColumns.PRED.value]
        y_true = y_eval[self.configs.behaviour_name, BehavColumns.ACTUAL.value]
        # Making classification report
        report_dict = self.eval_report(y_true, y_pred)
        # Making confusion matrix figure
        metrics_fig = self.eval_conf_matr(y_true, y_pred)
        # Making performance for different pcutoffs figure
        pcutoffs_fig = self.eval_metrics_pcutoffs(y_true, y_prob)
        # Logistic curve
        logc_fig = self.eval_logc(y_true, y_prob)
        # Saving data and figures
        DFIOMixin.write_feather(
            y_eval, os.path.join(self.eval_dir, f"{name}_eval.feather")
        )
        with open(os.path.join(self.eval_dir, f"{name}_report.json"), "w") as f:
            json.dump(report_dict, f)
        metrics_fig.savefig(os.path.join(self.eval_dir, f"{name}_confm.png"))
        pcutoffs_fig.savefig(os.path.join(self.eval_dir, f"{name}_pcutoffs.png"))
        logc_fig.savefig(os.path.join(self.eval_dir, f"{name}_logc.png"))
        # Print classification report
        print(json.dumps(report_dict, indent=4))
        # Return evaluations
        return y_eval, report_dict, metrics_fig, pcutoffs_fig, logc_fig

    def clf_eval_compare_all(self):
        """
        Trains and evaluates a classifier for each available template.

        Notes
        -----
        Takes a long time to run.
        """
        # Saving existing clf
        clf = self.clf
        # Preparing data
        x, y, ind_train, ind_test = self.prepare_data_training_pipeline()
        # # Adding noise (TODO: use with augmentation)
        # noise = 0.05
        # x_train += np.random.normal(0, noise, x_train.shape)
        # x_test += np.random.normal(0, noise, x_test.shape)
        # Getting eval for each classifier in ClfTemplates
        for clf_init_f in CLF_TEMPLATES:
            clf_name = clf_init_f.__name__
            # Making classifier
            self.clf = clf_init_f()
            # Training
            history = self.clf.fit(
                x=x,
                y=y,
                index=ind_train,
                batch_size=self.configs.batch_size,
                epochs=self.configs.epochs,
                val_split=self.configs.val_split,
            )
            # Saving history
            self.clf_eval_save_history(history, name=clf_name)
            # Evaluating on train and test data
            self.clf_eval(x, y, index=ind_train, name=f"{clf_name}_train")
            self.clf_eval(x, y, index=ind_test, name=f"{clf_name}_test")
        # Restoring clf
        self.clf = clf

    #################################################
    # EVALUATION METRICS FUNCTIONS
    #################################################

    @staticmethod
    def eval_report(y_true: pd.Series, y_pred: pd.Series) -> dict:
        """
        Returns the classification report (as a dict) of true vs predicted labels.
        """
        return classification_report(
            y_true,
            y_pred,
            target_names=GENERIC_BEHAV_LABELS,
            output_dict=True,
        )

    @staticmethod
    def eval_conf_matr(y_true: pd.Series, y_pred: pd.Series) -> plt.Figure:
        """
        Plots the confusion matrix of true vs predicted labels.
        """
        # Making confusion matrix
        fig, ax = plt.subplots(figsize=(7, 7))
        sns.heatmap(
            confusion_matrix(y_true, y_pred),
            annot=True,
            fmt="d",
            cmap="viridis",
            cbar=False,
            xticklabels=GENERIC_BEHAV_LABELS,
            yticklabels=GENERIC_BEHAV_LABELS,
            ax=ax,
        )
        ax.set_xlabel("Predicted")
        ax.set_ylabel("True")
        return fig

    @staticmethod
    def eval_metrics_pcutoffs(y_true: pd.Series, y_prob: pd.Series) -> plt.Figure:
        """
        Plots precision, recall, f1, and accuracy across pcutoff values.
        """
        # Getting precision, recall and accuracy for different cutoffs
        pcutoffs = np.linspace(0, 1, 101)
        # Measures
        precisions = np.zeros(pcutoffs.shape[0])
        recalls = np.zeros(pcutoffs.shape[0])
        f1 = np.zeros(pcutoffs.shape[0])
        accuracies = np.zeros(pcutoffs.shape[0])
        for i, pcutoff in enumerate(pcutoffs):
            y_pred = y_prob > pcutoff
            report = classification_report(
                y_true,
                y_pred,
                target_names=GENERIC_BEHAV_LABELS,
                output_dict=True,
            )
            precisions[i] = report[GENERIC_BEHAV_LABELS[1]]["precision"]
            recalls[i] = report[GENERIC_BEHAV_LABELS[1]]["recall"]
            f1[i] = report[GENERIC_BEHAV_LABELS[1]]["f1-score"]
            accuracies[i] = report["accuracy"]
        # Making figure
        fig, ax = plt.subplots(figsize=(10, 7))
        sns.lineplot(x=pcutoffs, y=precisions, label="precision", ax=ax)
        sns.lineplot(x=pcutoffs, y=recalls, label="recall", ax=ax)
        sns.lineplot(x=pcutoffs, y=f1, label="f1", ax=ax)
        sns.lineplot(x=pcutoffs, y=accuracies, label="accuracy", ax=ax)
        return fig

    @staticmethod
    def eval_logc(y_true: pd.Series, y_prob: pd.Series) -> plt.Figure:
        """
        Plots each predicted probability against its (jittered) true label.
        """
        y_eval = pd.DataFrame(
            {
                "y_true": y_true,
                "y_prob": y_prob,
                "y_pred": y_prob > 0.4,
                "y_true_jitter": y_true + (0.2 * (np.random.rand(len(y_prob)) - 0.5)),
            }
        )
        fig, ax = plt.subplots(figsize=(10, 7))
        sns.scatterplot(
            data=y_eval,
            x="y_prob",
            y="y_true_jitter",
            marker=".",
            s=10,
            linewidth=0,
            alpha=0.2,
            ax=ax,
        )
        # Adding a line of the proportion of predicted probabilities below each cutoff
        pcutoffs = np.linspace(0, 1, 101)
        ratios = np.vectorize(lambda i: np.mean(i > y_eval["y_prob"]))(pcutoffs)
        sns.lineplot(x=pcutoffs, y=ratios, ax=ax)
        # Returning figure
        return fig

    @staticmethod
    def eval_bouts(y_true: pd.Series, y_pred: pd.Series) -> pd.DataFrame:
        """
        Summarises, for each bout of consecutive identical true labels, the proportion
        of correct predictions, the bout's label, and its length.
        """
        y_eval = pd.DataFrame({"y_true": y_true, "y_pred": y_pred})
        y_eval["ids"] = np.cumsum(y_eval["y_true"] != y_eval["y_true"].shift())
        # Getting the proportion of correct predictions for each bout
        y_eval_grouped = y_eval.groupby("ids")
        y_eval_summary = pd.DataFrame(
            y_eval_grouped.apply(lambda x: (x["y_pred"] == x["y_true"]).mean()),
            columns=["proportion"],
        )
        y_eval_summary["actual_bout"] = y_eval_grouped.apply(
            lambda x: x["y_true"].mean()
        )
        y_eval_summary["bout_len"] = y_eval_grouped.apply(lambda x: x.shape[0])
        y_eval_summary = y_eval_summary.sort_values("proportion")
        # # Making figure
        # fig, ax = plt.subplots(figsize=(10, 7))
        # sns.scatterplot(
        #     data=y_eval_summary,
        #     x="proportion",
        #     y="bout_len",
        #     hue="actual_bout",
        #     alpha=0.4,
        #     marker=".",
        #     s=50,
        #     linewidth=0,
        #     ax=ax,
        # )
        return y_eval_summary

clf_fp: str property

Returns the model's classifier filepath

configs: BehavClassifierConfigs property

Returns the config model from the expected config file.

eval_dir: str property

Returns the model's evaluation directory

preproc_fp: str property

Returns the model's preprocessor filepath

root_dir: str property

Returns the model's root directory

__init__(configs_fp)

Make a BehavClassifier instance.

Parameters:

Name Type Description Default
configs_fp str

Filepath of the model's configs JSON.

required
Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
def __init__(self, configs_fp: str) -> None:
    """
    Make a BehavClassifier instance.

    Parameters
    ----------
    configs_fp :
        Filepath of the model's configs JSON. Read if it exists; otherwise
        a new default configs file is created at this path.
    """
    # Storing configs json fp
    self.configs_fp = configs_fp
    self.clf = None
    # Trying to read in configs json. Making a new one if it doesn't exist
    try:
        configs = BehavClassifierConfigs.read_json(self.configs_fp)
        logging.info("Reading existing model configs")
    except FileNotFoundError:
        configs = BehavClassifierConfigs()
        logging.info("Making new model configs")
    # Saving configs
    configs.write_json(self.configs_fp)

clf_eval(x, y, index=None, name='')

Evaluates the classifier performance on the given x and y data. Saves the evaluation df, classification report, and figures to the model's eval directory.

Returns:

Name Type Description
y_eval DataFrame

Predicted behaviour classifications against the true labels.

report_dict dict

Classification report of the predictions, as a dict.

metrics_fig Figure

Figure showing the confusion matrix.

pcutoffs_fig Figure

Figure showing the precision, recall, f1, and accuracy for different pcutoffs.

logc_fig Figure

Figure showing the logistic curve for different predicted probabilities.

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
def clf_eval(
    self,
    x: np.ndarray,
    y: np.ndarray,
    index: Optional[np.ndarray] = None,
    name: Optional[str] = "",
) -> tuple[pd.DataFrame, dict, plt.Figure, plt.Figure, plt.Figure]:
    """
    Evaluates the classifier performance on the given x and y data.
    Saves the evaluation df, classification report, and figures to the model's eval directory.

    Returns
    -------
    y_eval : pd.DataFrame
        Predicted behaviour classifications against the true labels.
    report_dict : dict
        Classification report of the predictions, as a dict.
    metrics_fig : mpl.Figure
        Figure showing the confusion matrix.
    pcutoffs_fig : mpl.Figure
        Figure showing the precision, recall, f1, and accuracy for different pcutoffs.
    logc_fig : mpl.Figure
        Figure showing the logistic curve for different predicted probabilities.
    """
    # Making eval df
    index = np.arange(x.shape[0]) if index is None else index
    y_eval = self.clf_predict(x=x, index=index, batch_size=self.configs.batch_size)
    # Including `actual` labels in `y_eval`
    y_eval[self.configs.behaviour_name, BehavColumns.ACTUAL.value] = y[index]
    # Getting individual columns
    y_prob = y_eval[self.configs.behaviour_name, BehavColumns.PROB.value]
    y_pred = y_eval[self.configs.behaviour_name, BehavColumns.PRED.value]
    y_true = y_eval[self.configs.behaviour_name, BehavColumns.ACTUAL.value]
    # Making classification report
    report_dict = self.eval_report(y_true, y_pred)
    # Making confusion matrix figure
    metrics_fig = self.eval_conf_matr(y_true, y_pred)
    # Making performance for different pcutoffs figure
    pcutoffs_fig = self.eval_metrics_pcutoffs(y_true, y_prob)
    # Logistic curve
    logc_fig = self.eval_logc(y_true, y_prob)
    # Saving data and figures
    DFIOMixin.write_feather(
        y_eval, os.path.join(self.eval_dir, f"{name}_eval.feather")
    )
    with open(os.path.join(self.eval_dir, f"{name}_report.json"), "w") as f:
        json.dump(report_dict, f)
    metrics_fig.savefig(os.path.join(self.eval_dir, f"{name}_confm.png"))
    pcutoffs_fig.savefig(os.path.join(self.eval_dir, f"{name}_pcutoffs.png"))
    logc_fig.savefig(os.path.join(self.eval_dir, f"{name}_logc.png"))
    # Print classification report
    print(json.dumps(report_dict, indent=4))
    # Return evaluations
    return y_eval, report_dict, metrics_fig, pcutoffs_fig, logc_fig
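
A sketch of evaluating on held-out data, assuming `model.clf` already holds a trained classifier (e.g. after `pipeline_build`):

```python
x, y, ind_train, ind_test = model.prepare_data_training_pipeline()
y_eval, report, confm_fig, pcutoffs_fig, logc_fig = model.clf_eval(
    x, y, index=ind_test, name="test"
)
print(report["accuracy"])  # sklearn classification_report dict key
```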

clf_eval_compare_all()

Trains and evaluates a classifier for each available template.

Notes

Takes a long time to run.

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
def clf_eval_compare_all(self):
    """
    Trains and evaluates a classifier for each available template.

    Notes
    -----
    Takes a long time to run.
    """
    # Saving existing clf
    clf = self.clf
    # Preparing data
    x, y, ind_train, ind_test = self.prepare_data_training_pipeline()
    # # Adding noise (TODO: use with augmentation)
    # noise = 0.05
    # x_train += np.random.normal(0, noise, x_train.shape)
    # x_test += np.random.normal(0, noise, x_test.shape)
    # Getting eval for each classifier in ClfTemplates
    for clf_init_f in CLF_TEMPLATES:
        clf_name = clf_init_f.__name__
        # Making classifier
        self.clf = clf_init_f()
        # Training
        history = self.clf.fit(
            x=x,
            y=y,
            index=ind_train,
            batch_size=self.configs.batch_size,
            epochs=self.configs.epochs,
            val_split=self.configs.val_split,
        )
        # Saving history
        self.clf_eval_save_history(history, name=clf_name)
        # Evaluating on train and test data
        self.clf_eval(x, y, index=ind_train, name=f"{clf_name}_train")
        self.clf_eval(x, y, index=ind_test, name=f"{clf_name}_test")
    # Restoring clf
    self.clf = clf

clf_load()

Loads the model stored in <root_dir>/<behav_name>/model.sav to the model attribute.

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
def clf_load(self):
    """
    Loads the model stored in `<root_dir>/<behav_name>/model.sav` to the model attribute.
    """
    self.clf = joblib.load(self.clf_fp)

clf_predict(x, batch_size, index=None)

Making predictions using the given model and preprocessed features. Assumes the x array is already preprocessed.

Parameters:

Name Type Description Default
x ndarray

Preprocessed features.

required

Returns:

Type Description
DataFrame

Predicted behaviour classifications. Dataframe columns are in the format:

behaviours :  behav    behav
outcomes   :  "prob"   "pred"

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
def clf_predict(
    self,
    x: np.ndarray,
    batch_size: int,
    index: Optional[np.ndarray] = None,
) -> pd.DataFrame:
    """
    Making predictions using the given model and preprocessed features.
    Assumes the x array is already preprocessed.

    Parameters
    ----------
    x : np.ndarray
        Preprocessed features.

    Returns
    -------
    pd.DataFrame
        Predicted behaviour classifications. Dataframe columns are in the format:
        ```
        behaviours :  behav    behav
        outcomes   :  "prob"   "pred"
        ```
    """
    # Getting probabilities
    index = np.arange(x.shape[0]) if index is None else index
    y_probs = self.clf.predict(
        x=x,
        index=index,
        batch_size=batch_size,
    )
    # Making predictions from probabilities (and pcutoff)
    y_preds = (y_probs > self.configs.pcutoff).astype(int)
    # Making df
    pred_df = BehavMixin.init_df(index)
    pred_df[(self.configs.behaviour_name, BehavColumns.PROB.value)] = y_probs
    pred_df[(self.configs.behaviour_name, BehavColumns.PRED.value)] = y_preds
    # Returning predicted behavs
    return pred_df
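
A sketch, assuming `x` has already been preprocessed with `preproc_x` and `model.clf` holds a trained classifier; the `"prob"`/`"pred"` column names follow the format shown above:

```python
pred_df = model.clf_predict(x, batch_size=model.configs.batch_size)
behav = model.configs.behaviour_name
probs = pred_df[(behav, "prob")]   # predicted probabilities
preds = pred_df[(behav, "pred")]   # 0/1 predictions at `pcutoff`
```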

clf_save()

Saves the model's classifier to <root_dir>/<behav_name>/model.sav.

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
def clf_save(self):
    """
    Saves the model's classifier to `<root_dir>/<behav_name>/model.sav`.
    """
    joblib.dump(self.clf, self.clf_fp)

combine_dfs()

Combines the data into a single X df, y df, and index. The indexes of x and y will be the same (with an inner join).

Returns:

Name Type Description
x DataFrame

Features dataframe of all experiments in the x directory

y DataFrame

Outcomes dataframe of all experiments in the y directory

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
def combine_dfs(self) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Combines the data into a single `X` df, `y` df, and index.
    The indexes of `x` and `y` will be the same (with an inner join).

    Returns
    -------
    x :
        Features dataframe of all experiments in the `x` directory
    y :
        Outcomes dataframe of all experiments in the `y` directory
    """
    # Getting the x and y dfs
    x = BehavClassifier.combine(os.path.join(self.root_dir, X_ID))
    y = BehavClassifier.combine(os.path.join(self.root_dir, Y_ID))
    # Getting the intersection of the x and y row indexes
    index = x.index.intersection(y.index)
    x = x.loc[index]
    y = y.loc[index]
    # Assert that x and y are the same length
    assert x.shape[0] == y.shape[0]
    # Returning the x and y dfs
    return x, y

create_from_model(root_dir, behaviour_name)

Creates a new BehavClassifier model in the given directory, copying this instance's configs (but with the given behaviour_name).

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
def create_from_model(self, root_dir: str, behaviour_name: str) -> BehavClassifier:
    """
    Creates a new BehavClassifier model in the given directory, copying this
    instance's configs (but with the given behaviour_name).
    """
    # Making new BehavClassifier instance (create_new_model expects the root directory)
    inst = self.create_new_model(root_dir, behaviour_name)
    # Using current instance's configs (but using given behaviour_name)
    configs = self.configs
    configs.behaviour_name = behaviour_name
    configs.write_json(inst.configs_fp)
    # Returning model
    return inst

create_from_project(proj) classmethod

Creates a BehavClassifier for each scored behaviour in the given Project instance.

Parameters:

Name Type Description Default
proj Project

The Project instance.

required

Returns:

Type Description
list[BehavClassifier]

The list of created BehavClassifier instances, one per behaviour.

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
@classmethod
def create_from_project(cls, proj: Project) -> list[BehavClassifier]:
    """
    Creates a BehavClassifier for each scored behaviour in the given Project instance.

    Parameters
    ----------
    proj :
        The Project instance.

    Returns
    -------
    :
        The list of created BehavClassifier instances, one per behaviour.
    """
    # Getting the list of behaviours
    y_df = cls.wrangle_columns_y(
        cls.combine(os.path.join(proj.root_dir, Folders.SCORED_BEHAVS.value))
    )
    # For each behaviour, making a new BehavClassifier instance
    behavs_ls = y_df.columns.to_list()
    models_dir = os.path.join(proj.root_dir, BEHAV_MODELS_SUBDIR)
    models_ls = [cls.create_new_model(models_dir, behav) for behav in behavs_ls]
    # Importing data from project to the behav models folder (only need one model for this)
    if len(models_ls) > 0:
        models_ls[0].import_data(
            os.path.join(proj.root_dir, Folders.FEATURES_EXTRACTED.value),
            os.path.join(proj.root_dir, Folders.SCORED_BEHAVS.value),
            False,
        )
    return models_ls

create_new_model(root_dir, behaviour_name) classmethod

Creating a new BehavClassifier model in the given directory

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
@classmethod
def create_new_model(cls, root_dir: str, behaviour_name: str) -> BehavClassifier:
    """
    Creating a new BehavClassifier model in the given directory
    """
    configs_fp = os.path.join(root_dir, f"{behaviour_name}.json")
    # Making new BehavClassifier instance
    inst = cls(configs_fp)
    # Updating configs with project data
    configs = inst.configs
    configs.behaviour_name = behaviour_name
    configs.write_json(inst.configs_fp)
    # Returning model
    return inst
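
For example (paths illustrative), this writes `<models_dir>/fight.json` and returns the new instance:

```python
model = BehavClassifier.create_new_model("/path/to/behav_models", "fight")
assert model.configs.behaviour_name == "fight"
```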

eval_bouts(y_true, y_pred) staticmethod

Summarises, for each bout of consecutive identical true labels, the proportion of correct predictions, the bout's label, and its length.

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
@staticmethod
def eval_bouts(y_true: pd.Series, y_pred: pd.Series) -> pd.DataFrame:
    """
    Summarises, for each bout of consecutive identical true labels, the proportion
    of correct predictions, the bout's label, and its length.
    """
    y_eval = pd.DataFrame({"y_true": y_true, "y_pred": y_pred})
    y_eval["ids"] = np.cumsum(y_eval["y_true"] != y_eval["y_true"].shift())
    # Getting the proportion of correct predictions for each bout
    y_eval_grouped = y_eval.groupby("ids")
    y_eval_summary = pd.DataFrame(
        y_eval_grouped.apply(lambda x: (x["y_pred"] == x["y_true"]).mean()),
        columns=["proportion"],
    )
    y_eval_summary["actual_bout"] = y_eval_grouped.apply(
        lambda x: x["y_true"].mean()
    )
    y_eval_summary["bout_len"] = y_eval_grouped.apply(lambda x: x.shape[0])
    y_eval_summary = y_eval_summary.sort_values("proportion")
    # # Making figure
    # fig, ax = plt.subplots(figsize=(10, 7))
    # sns.scatterplot(
    #     data=y_eval_summary,
    #     x="proportion",
    #     y="bout_len",
    #     hue="actual_bout",
    #     alpha=0.4,
    #     marker=".",
    #     s=50,
    #     linewidth=0,
    #     ax=ax,
    # )
    return y_eval_summary

eval_conf_matr(y_true, y_pred) staticmethod

Plots the confusion matrix of true vs predicted labels.

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
@staticmethod
def eval_conf_matr(y_true: pd.Series, y_pred: pd.Series) -> plt.Figure:
    """
    Plots the confusion matrix of true vs predicted labels.
    """
    # Making confusion matrix
    fig, ax = plt.subplots(figsize=(7, 7))
    sns.heatmap(
        confusion_matrix(y_true, y_pred),
        annot=True,
        fmt="d",
        cmap="viridis",
        cbar=False,
        xticklabels=GENERIC_BEHAV_LABELS,
        yticklabels=GENERIC_BEHAV_LABELS,
        ax=ax,
    )
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    return fig

eval_logc(y_true, y_prob) staticmethod

Plots each predicted probability against its (jittered) true label.

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
@staticmethod
def eval_logc(y_true: pd.Series, y_prob: pd.Series) -> plt.Figure:
    """
    Plots each predicted probability against its (jittered) true label.
    """
    y_eval = pd.DataFrame(
        {
            "y_true": y_true,
            "y_prob": y_prob,
            "y_pred": y_prob > 0.4,
            "y_true_jitter": y_true + (0.2 * (np.random.rand(len(y_prob)) - 0.5)),
        }
    )
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.scatterplot(
        data=y_eval,
        x="y_prob",
        y="y_true_jitter",
        marker=".",
        s=10,
        linewidth=0,
        alpha=0.2,
        ax=ax,
    )
    # Adding a line of the proportion of predicted probabilities below each cutoff
    pcutoffs = np.linspace(0, 1, 101)
    ratios = np.vectorize(lambda i: np.mean(i > y_eval["y_prob"]))(pcutoffs)
    sns.lineplot(x=pcutoffs, y=ratios, ax=ax)
    # Returning figure
    return fig

eval_metrics_pcutoffs(y_true, y_prob) staticmethod

Plots precision, recall, f1, and accuracy across pcutoff values.

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
@staticmethod
def eval_metrics_pcutoffs(y_true: pd.Series, y_prob: pd.Series) -> plt.Figure:
    """
    Plots precision, recall, f1, and accuracy across pcutoff values.
    """
    # Getting precision, recall and accuracy for different cutoffs
    pcutoffs = np.linspace(0, 1, 101)
    # Measures
    precisions = np.zeros(pcutoffs.shape[0])
    recalls = np.zeros(pcutoffs.shape[0])
    f1 = np.zeros(pcutoffs.shape[0])
    accuracies = np.zeros(pcutoffs.shape[0])
    for i, pcutoff in enumerate(pcutoffs):
        y_pred = y_prob > pcutoff
        report = classification_report(
            y_true,
            y_pred,
            target_names=GENERIC_BEHAV_LABELS,
            output_dict=True,
        )
        precisions[i] = report[GENERIC_BEHAV_LABELS[1]]["precision"]
        recalls[i] = report[GENERIC_BEHAV_LABELS[1]]["recall"]
        f1[i] = report[GENERIC_BEHAV_LABELS[1]]["f1-score"]
        accuracies[i] = report["accuracy"]
    # Making figure
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.lineplot(x=pcutoffs, y=precisions, label="precision", ax=ax)
    sns.lineplot(x=pcutoffs, y=recalls, label="recall", ax=ax)
    sns.lineplot(x=pcutoffs, y=f1, label="f1", ax=ax)
    sns.lineplot(x=pcutoffs, y=accuracies, label="accuracy", ax=ax)
    return fig

eval_report(y_true, y_pred) staticmethod

Returns the classification report (as a dict) of true vs predicted labels.

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
@staticmethod
def eval_report(y_true: pd.Series, y_pred: pd.Series) -> dict:
    """
    Returns the classification report (as a dict) of true vs predicted labels.
    """
    return classification_report(
        y_true,
        y_pred,
        target_names=GENERIC_BEHAV_LABELS,
        output_dict=True,
    )

import_data(x_dir, y_dir, overwrite=False)

Importing data from extracted features and labelled behaviours dataframes.

Parameters:

Name Type Description Default
x_dir str

Directory containing the extracted-features dataframes.

required
y_dir str

Directory containing the labelled-behaviours dataframes.

required
Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
def import_data(self, x_dir: str, y_dir: str, overwrite=False) -> None:
    """
    Importing data from extracted features and labelled behaviours dataframes.

    Parameters
    ----------
    x_dir :
        Directory containing the extracted-features dataframes.
    y_dir :
        Directory containing the labelled-behaviours dataframes.
    """
    # For each x and y directory
    for in_dir, data_id in ((x_dir, X_ID), (y_dir, Y_ID)):
        out_dir = os.path.join(self.root_dir, data_id)
        os.makedirs(out_dir, exist_ok=True)
        # Copying each file to model root directory
        for fp in os.listdir(in_dir):
            in_fp = os.path.join(in_dir, fp)
            out_fp = os.path.join(out_dir, fp)
            # If not overwriting and out file already exists, then skip
            if not overwrite and os.path.exists(out_fp):
                continue
            # Copying file
            shutil.copyfile(in_fp, out_fp)

load(configs_fp) classmethod

Reads the model from the expected model file.

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
@classmethod
def load(cls, configs_fp: str) -> BehavClassifier:
    """
    Reads the model from the expected model file.
    """
    if not os.path.isfile(configs_fp):
        raise FileNotFoundError(f"The model file does not exist: {configs_fp}")
    return cls(configs_fp)
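
For example (path illustrative); a `FileNotFoundError` is raised if the configs JSON is missing:

```python
model = BehavClassifier.load("/path/to/behav_models/fight.json")
```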

pipeline_build(clf_init_f)

Makes a classifier and saves it to the model's root directory.

Callable is a method from ClfTemplates.

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
def pipeline_build(self, clf_init_f: Callable) -> None:
    """
    Makes a classifier and saves it to the model's root directory.

    Callable is a method from `ClfTemplates`.
    """
    # Preparing data
    x, y, ind_train, ind_test = self.prepare_data_training_pipeline()
    # Initialising the model
    self.clf = clf_init_f()
    # Training the model
    history = self.clf.fit(
        x=x,
        y=y,
        index=ind_train,
        batch_size=self.configs.batch_size,
        epochs=self.configs.epochs,
        val_split=self.configs.val_split,
    )
    # Saving history
    self.clf_eval_save_history(history)
    # Evaluating the model
    self.clf_eval(x, y, ind_test)
    # Updating the model configs
    configs = self.configs
    configs.clf_structure = clf_init_f.__name__
    configs.write_json(self.configs_fp)
    # Saving the model to disk
    self.clf_save()

pipeline_run(x)

Given the unprocessed features dataframe, runs the model pipeline to make predictions.

Pipeline is:

- Preprocess x df. Refer to behavysis_pipeline.behav_classifier.BehavClassifier.preproc_x for details.
- Makes predictions and returns the predicted behaviours.

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
def pipeline_run(self, x: pd.DataFrame) -> pd.DataFrame:
    """
    Given the unprocessed features dataframe, runs the model pipeline to make predictions.

    Pipeline is:
    - Preprocess `x` df. Refer to
    [behavysis_pipeline.behav_classifier.BehavClassifier.preproc_x][] for details.
    - Makes predictions and returns the predicted behaviours.
    """
    # Saving index for later
    index = x.index
    # Preprocessing features
    x = self.prepare_data(x)
    # Loading the model
    self.clf_load()
    # Making predictions
    y_eval = self.clf_predict(x, self.configs.batch_size)
    # Settings the index
    y_eval.index = index
    # Returning predictions
    return y_eval
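
A sketch of inference on a novel features dataframe (the feather path is illustrative; `model.clf` must have been saved by `pipeline_build` beforehand):

```python
import pandas as pd

x = pd.read_feather("/path/to/features_extracted/experiment1.feather")
pred_df = model.pipeline_run(x)  # (behav, "prob") and (behav, "pred") columns
```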

prepare_data(x)

Prepares novel (x only) data, given the x pd.DataFrame.

Performs the following:

- Preprocesses x df. Refer to preprocess_x for details.
- Makes the X windowed array, for each index.

Returns:

Name Type Description
x ndarray

Features array in the format: (samples, window, features)

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
def prepare_data(self, x: pd.DataFrame) -> np.ndarray:
    """
    Prepares novel (`x` only) data, given the `x` pd.DataFrame.

    Performs the following:
    - Preprocesses x df. Refer to `preprocess_x` for details.
    - Makes the X windowed array, for each index.

    Returns
    -------
    x : np.ndarray
        Features array in the format: `(samples, window, features)`
    """
    # Preprocessing x df
    x = self.preproc_x(x, self.preproc_fp)
    # Returning x
    return x

prepare_data_training()

Prepares the data (x and y) in the model for training. Data is taken from the model's x and y dirs.

Performs the following:

- Combining dfs from x and y directories (individual experiment data)
- Ensures the x and y dfs have the same index, and are in the same row order
- Preprocesses x df. Refer to preprocess_x for details.
- Selects the y class (given in the configs file) from the y df.
- Preprocesses y df. Refer to preprocess_y for details.

Returns:

Name Type Description
x ndarray

Features array in the format: (samples, window, features)

y ndarray

Outcomes array in the format: (samples, class)

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
def prepare_data_training(self) -> tuple[np.ndarray, np.ndarray]:
    """
    Prepares the data (`x` and `y`) in the model for training.
    Data is taken from the model's `x` and `y` dirs.

    Performs the following:
    - Combining dfs from x and y directories (individual experiment data)
    - Ensures the x and y dfs have the same index, and are in the same row order
    - Preprocesses x df. Refer to `preprocess_x` for details.
    - Selects the y class (given in the configs file) from the y df.
    - Preprocesses y df. Refer to `preprocess_y` for details.

    Returns
    -------
    x : np.ndarray
        Features array in the format: `(samples, window, features)`
    y : np.ndarray
        Outcomes array in the format: `(samples, class)`
    """
    # Combining dfs from x and y directories (individual experiment data)
    x, y = self.combine_dfs()
    # Fitting the preprocessor pipeline
    self.preproc_x_fit(x, self.preproc_fp)
    # Preprocessing x df
    x = self.preproc_x(x, self.preproc_fp)
    # Preprocessing y df
    y = self.wrangle_columns_y(y)[self.configs.behaviour_name].values
    y = self.preproc_y(y)
    # Returning x, y, and index to use
    return x, y

prepare_data_training_pipeline()

Prepares the data for the training pipeline.

Performs the following:

- Preprocesses x and y data. Refer to prepare_data_training for details.
- Splits into training and test indexes.
    - The training indexes are undersampled to the ratio given in the configs.

Returns:

A tuple containing four numpy arrays:

- x: The input data.
- y: The target labels.
- ind_train: The indexes for the training data.
- ind_test: The indexes for the testing data.

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
def prepare_data_training_pipeline(
    self,
) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    Prepares the data for the training pipeline.

    Performs the following:
    - Preprocesses `x` and `y` data. Refer to `prepare_data_training` for details.
    - Splits into training and test indexes.
        - The training indexes are undersampled to the ratio given in the configs.

    Returns:
        A tuple containing four numpy arrays:
        - x: The input data.
        - y: The target labels.
        - ind_train: The indexes for the training data.
        - ind_test: The indexes for the testing data.
    """
    # Preparing data
    x, y = self.prepare_data_training()
    # Getting entire index
    index = np.arange(x.shape[0])
    # Splitting into train and test indexes
    ind_train, ind_test = train_test_split(
        index,
        test_size=self.configs.test_split,
        stratify=y[index],
    )
    # Undersampling training index
    ind_train = self.undersample(
        ind_train, y[ind_train], self.configs.undersample_ratio
    )
    # Return
    return x, y, ind_train, ind_test
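
A sketch of the resulting split; the full `x`/`y` arrays are returned alongside disjoint train/test index arrays (train additionally undersampled):

```python
import numpy as np

x, y, ind_train, ind_test = model.prepare_data_training_pipeline()
assert x.shape[0] == y.shape[0]
assert np.intersect1d(ind_train, ind_test).size == 0  # disjoint splits
```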

preproc_x(x, preproc_fp) staticmethod

The preprocessing steps are:

- MinMax scaling (using previously fitted MinMaxScaler)

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
@staticmethod
def preproc_x(x: np.ndarray, preproc_fp: str) -> np.ndarray:
    """
    The preprocessing steps are:
    - MinMax scaling (using previously fitted MinMaxScaler)
    """
    # Loading in pipeline
    preproc_pipe = joblib.load(preproc_fp)
    # Uses trained fit for preprocessing new data
    x = preproc_pipe.transform(x)
    # Returning df
    return x
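
A minimal sketch of the fit-then-transform flow on toy arrays (the save path is illustrative):

```python
import numpy as np

x_train = np.random.rand(100, 8)
x_new = np.random.rand(20, 8)
# Fit the MinMax pipeline on training features and persist it
BehavClassifier.preproc_x_fit(x_train, "/tmp/preproc.sav")
# Reuse the fitted pipeline to scale new data consistently
x_new_scaled = BehavClassifier.preproc_x(x_new, "/tmp/preproc.sav")
```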

preproc_x_fit(x, preproc_fp) staticmethod

Fits the preprocessing pipeline (MinMax scaling) on x and saves it to preproc_fp.

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
@staticmethod
def preproc_x_fit(x: np.ndarray, preproc_fp: str) -> None:
    """
    Fits the preprocessing pipeline (MinMax scaling) on `x` and saves it to `preproc_fp`.
    """
    # Making pipeline
    preproc_pipe = Pipeline(steps=[("MinMaxScaler", MinMaxScaler())])
    # Fitting pipeline
    preproc_pipe.fit(x)
    # Saving pipeline
    joblib.dump(preproc_pipe, preproc_fp)

preproc_y(y) staticmethod

The preprocessing steps are:

- Imputing NaN values with 0
- Setting -1 (i.e. "undecided") to 0

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
@staticmethod
def preproc_y(y: np.ndarray) -> np.ndarray:
    """
    The preprocessing steps are:
    - Imputing NaN values with 0
    - Setting -1 (i.e. "undecided") to 0
    """
    # Imputing NaN values with 0
    y = np.nan_to_num(y, nan=0)
    # Setting -1 to 0 (i.e. "undecided" to "no behaviour")
    y = np.maximum(y, 0)
    # Returning arr
    return y
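
A tiny worked example of both steps (NaN imputed to 0, then -1 clipped to 0):

```python
import numpy as np

y = np.array([1.0, np.nan, -1.0, 0.0])
print(BehavClassifier.preproc_y(y))  # [1. 0. 0. 0.]
```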

wrangle_columns_y(y) staticmethod

Converts the y dataframe's MultiIndex (behav, outcome) columns to single-level columns.

Parameters:

Name Type Description Default
y DataFrame

Behaviour-outcomes dataframe with (behav, outcome) MultiIndex columns.

required

Returns:

Type Description
DataFrame

Dataframe with single-level columns: each actual column becomes {behav}, and every other outcome column becomes {behav}__{outcome}.

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
@staticmethod
def wrangle_columns_y(y: pd.DataFrame) -> pd.DataFrame:
    """
    Converts the y dataframe's MultiIndex `(behav, outcome)` columns to
    single-level columns.

    Parameters
    ----------
    y :
        Behaviour-outcomes dataframe with `(behav, outcome)` MultiIndex columns.

    Returns
    -------
    :
        Dataframe with single-level columns: each `actual` column becomes
        `{behav}`, and every other outcome column becomes `{behav}__{outcome}`.
    """
    # Filtering out the prob and pred columns (in the `outcomes` level)
    cols_filter = np.isin(
        y.columns.get_level_values(BehavCN.OUTCOMES.value),
        [BehavColumns.PROB.value, BehavColumns.PRED.value],
        invert=True,
    )
    y = y.loc[:, cols_filter]
    # Converting MultiIndex columns to single columns by
    # setting the column names from `(behav, outcome)` to `{behav}__{outcome}`
    y.columns = [
        f"{i[0]}" if i[1] == BehavColumns.ACTUAL.value else f"{i[0]}__{i[1]}"
        for i in y.columns
    ]
    return y
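
A toy sketch of the column wrangling; the level name `"outcomes"` and the outcome values `"prob"`, `"pred"`, `"actual"` are assumptions about the `BehavCN`/`BehavColumns` enum values used above:

```python
import pandas as pd

y = pd.DataFrame(
    [[0.9, 1, 1, 0]],
    columns=pd.MultiIndex.from_tuples(
        [("fight", "prob"), ("fight", "pred"), ("fight", "actual"), ("fight", "attack")],
        names=["behaviours", "outcomes"],  # assumed level names
    ),
)
# "prob"/"pred" are dropped, "actual" collapses to "fight",
# and other outcomes become "{behav}__{outcome}"
print(BehavClassifier.wrangle_columns_y(y).columns.to_list())
# ['fight', 'fight__attack']
```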

behavysis_pipeline.behav_classifier.behav_classifier.BehavClassifierConfigs

Bases: PydanticBaseModel

Configs for a BehavClassifier model.

Source code in behavysis_pipeline/behav_classifier/behav_classifier.py
class BehavClassifierConfigs(PydanticBaseModel):
    """_summary_"""

    model_config = ConfigDict(extra="forbid")

    behaviour_name: str = "BehaviourName"
    seed: int = 42
    undersample_ratio: float = 0.1

    clf_structure: str = "clf"  # Classifier type (defined in ClfTemplates)
    pcutoff: float = 0.5
    test_split: float = 0.2
    val_split: float = 0.2
    batch_size: int = 256
    epochs: int = 50
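
A sketch of overriding defaults and persisting them (the path is illustrative; `write_json`/`read_json` come from the `PydanticBaseModel` mixin used above):

```python
configs = BehavClassifierConfigs(behaviour_name="fight", epochs=100)
configs.write_json("/path/to/behav_models/fight.json")
print(BehavClassifierConfigs.read_json("/path/to/behav_models/fight.json").epochs)  # 100
```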