Module recsys.experiments

Source code
import itertools
import logging
from typing import Dict, List, Optional, Tuple

import pandas as pd
from tqdm.notebook import tqdm

from recsys import config
from recsys.dataset import build_dataset, get_training_set_from_dataset, \
    get_test_set_from_dataset, set_future_features, get_submission_set_from_dataset
from recsys.dataset_loader import get_dataset_and_dataloader, batches_to_device
from recsys.encoders import DatasetEncoder
from recsys.model import BookingNet
from recsys.model_train import train_model_for_folds
from recsys.paths import get_model_ckpt_paths, get_path
from recsys.types import *
from recsys.utils import get_distribution_by_pos, get_submission, accuracy_at_k, print_gpu_usage
from recsys.utils import get_ground_truth_from_dataset, get_trained_models


def run_experiments(base_configuration: Dict,
                    experiments: List[Dict],
                    n_models: int,
                    dataset_batches: List[BatchType],
                    train_set: pd.DataFrame,
                    skip_checkpoint=False) -> None:
    """
    Run one experiment per override dict in `experiments`,
    merging each set of overrides on top of `base_configuration`
    before training.
    """
    for model_overrides in tqdm(experiments):
        logging.info(model_overrides)
        model_configuration = dict(base_configuration, **model_overrides)
        train_model_for_folds(dataset_batches,
                              train_set,
                              model_configuration,
                              n_models=n_models,
                              skip_checkpoint=skip_checkpoint)


def get_base_configuration():
    """
    The base configuration describes our best model. Experiments
    change elements of this configuration to try to find an even
    better one.
    """
    return {
        'features_embedding': config.FEATURES_EMBEDDING,
        'hidden_size': int(config.EMBEDDING_SIZES['city_id'][1]),
        'output_size': int(config.EMBEDDING_SIZES['city_id'][0]),
        'embedding_sizes': config.EMBEDDING_SIZES,
        'n_layers': 2,
        'dropout': 0.3,
        'rnn_dropout': 0.1,
        'tie_embedding_and_projection': True,
        'model_type': ModelType.MANY_TO_MANY,
        'recurrent_type': RecurrentType.GRU,
        'weight_type': WeightType.UNWEIGHTED,
        'feature_projection_type': FeatureProjectionType.CONCATENATION,
        'num_folds': config.N_SPLITS,
        'batch_size': config.BATCH_SIZE
    }


def get_experiments() -> List[Dict]:
    """
    An experiment is a dict of parameters that override the base
    configuration. The returned list enumerates the full cross-product
    of model, weight and recurrent types, with and without tied
    embedding and projection weights.
    """
    params = ['model_type', 'weight_type', 'recurrent_type', 'tie_embedding_and_projection']
    return [
        dict(zip(params, p))
        for p in itertools.product(
            *map(list, [ModelType, WeightType, RecurrentType, [True, False]])
        )
    ]


def get_model_performance_data(test_set: pd.DataFrame,
                               dataset_encoder: DatasetEncoder,
                               model_hashes: Optional[List[str]] = None) -> Tuple[pd.DataFrame, Dict]:
    """
    Evaluate each trained model on the test set, both as a single
    checkpoint and as an ensemble of all fold checkpoints. Returns a
    results DataFrame and a dict of accuracy@4 by sequence position.
    """
    booking_dataset_test, dataset_loader_test = get_dataset_and_dataloader(
        df=test_set,
        features=config.FEATURES_EMBEDDING
    )
    ground_truth_test = get_ground_truth_from_dataset(
        df=test_set,
        booking_dataset=booking_dataset_test,
        dataset_encoder=dataset_encoder
    )

    trained_models = get_trained_models()

    if model_hashes:
        trained_models = {h: trained_models[h] for h in model_hashes}

    df_rows = []
    accuracy_at_4_by_length = {}
    for model_hash, model_parameters in trained_models.items():
        try:
            ckpt_list = get_model_ckpt_paths(model_hash=model_hash,
                                             checkpoint_type='accuracy_at_k')
        except FileNotFoundError:
            continue

        model_variants = {
            'single': ckpt_list[:1],
            'ensemble': ckpt_list
        }

        for model_type, ckpt_paths in model_variants.items():
            # an ensemble of a single checkpoint is just the single model again
            if model_type == 'ensemble' and len(ckpt_paths) == 1:
                continue

            model = BookingNet(**model_parameters).to(config.DEVICE)
            predictions = get_submission(booking_dataset_test,
                                         dataset_loader_test,
                                         model,
                                         ckpt_paths,
                                         dataset_encoder)
            accuracy = accuracy_at_k(predictions, ground_truth_test)

            # record metrics in a copy so they do not leak into the keyword
            # arguments passed to BookingNet on the next iteration
            row = dict(model_parameters)
            row['num_models'] = len(ckpt_paths)
            row['accuracy@1'] = accuracy['accuracy@1']
            row['accuracy@4'] = accuracy['accuracy@4']
            row['accuracy@10'] = accuracy['accuracy@10']
            row['hash'] = model_hash
            df = pd.DataFrame.from_dict(row, orient='index').T
            df_rows.append(pd.concat(
                [df, pd.DataFrame.from_dict(accuracy['accuracy@4_by_pos'], orient='index').T],
                axis=1))
            accuracy_at_4_by_length[(model_hash, model_type)] = accuracy['accuracy@4_by_pos']
    return pd.concat(df_rows), accuracy_at_4_by_length


def filter_results_table(results: pd.DataFrame) -> pd.DataFrame:
    """
    Filter results table to get only attributes that change between models.
    """
    columns = ['model_type', 'recurrent_type', 'tie_embedding_and_projection',
               'weight_type', 'accuracy@1', 'accuracy@4', 'accuracy@10', 'hash']
    selected_columns = [col for col in results.columns.values
                        if results[col].apply(str).nunique() > 1
                        or col in columns]
    filtered_results = (results[selected_columns]
                        .sort_values("accuracy@4", ascending=False))

    decode = {
        'model_type': ModelType,
        'weight_type': WeightType,
        'recurrent_type': RecurrentType
    }

    for key, enum_type in decode.items():
        # map stored enum values back to readable member names, e.g. 'GRU'
        filtered_results[key] = (filtered_results[key]
                                 .apply(enum_type)
                                 .apply(lambda member: member.name))

    df_table = filtered_results[columns].sort_values(
        ["model_type", "recurrent_type", "tie_embedding_and_projection", "accuracy@4"],
        ascending=[True, True, False, False])
    return df_table


if __name__ == "__main__":
    # build and encode dataset
    dataset = build_dataset(reserved_obs=30000)
    de = DatasetEncoder(config.FEATURES_TO_ENCODE)
    de.fit_transform(dataset)
    set_future_features(dataset)

    submission_set = get_submission_set_from_dataset(dataset)

    # keep only observations before the last visit
    dataset = dataset[~dataset.next_city_id.isna()]

    # split training and test set from dataset
    train_set = get_training_set_from_dataset(dataset)
    test_set = get_test_set_from_dataset(dataset)

    logging.info(f"Training set: {train_set.shape}")
    logging.info(f"Test set: {test_set.shape}")
    logging.info(f"Dataset: {dataset.shape}")

    logging.info(get_distribution_by_pos(dataset=dataset,
                                         train_set=train_set[train_set.train == 1],
                                         test_set=test_set,
                                         submission=submission_set).head(10))

    _, dataset_loader = get_dataset_and_dataloader(
        train_set,
        features=config.FEATURES_EMBEDDING + ['next_city_id'],
        batch_size=config.BATCH_SIZE
    )
    dataset_batches_cuda = batches_to_device(dataset_loader)

    print_gpu_usage(0)

    # run experiments from base configuration
    base_configuration = get_base_configuration()
    experiments = get_experiments()
    run_experiments(base_configuration=base_configuration,
                    experiments=experiments,
                    n_models=1,
                    dataset_batches=dataset_batches_cuda,
                    train_set=train_set)

    # get and save results table
    results, _ = get_model_performance_data(test_set, de)

    filter_results_table(results).to_csv(
        get_path(filename='experiments', format='csv'),
        index=False
    )

Functions

def filter_results_table(results: pandas.core.frame.DataFrame) -> pandas.core.frame.DataFrame

Filter results table to get only attributes that change between models.
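
For intuition, here is a minimal sketch of the column-selection rule on toy data (the column set and values are made up; a real results table carries many more attributes):

import pandas as pd

# Toy results table, not real experiment output.
results = pd.DataFrame({
    'dropout':    [0.3, 0.3],    # constant across models -> dropped unless whitelisted
    'n_layers':   [2, 3],        # varies across models   -> kept
    'accuracy@4': [0.52, 0.49],  # whitelisted metric     -> always kept
})

# The same rule filter_results_table applies: keep a column if its
# stringified values take more than one distinct value.
varying = [c for c in results.columns if results[c].apply(str).nunique() > 1]
print(varying)  # ['n_layers', 'accuracy@4']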

def get_base_configuration()

The base configuration describes our best model. Experiments change elements of this configuration to try to find an even better one.
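
A note on the derived sizes: hidden_size and output_size are read from config.EMBEDDING_SIZES['city_id'], which suggests each entry maps a feature to a (vocabulary size, embedding dimension) pair. A hedged sketch with made-up numbers:

# Assumed layout of config.EMBEDDING_SIZES (illustrative values only):
# feature name -> (vocabulary size, embedding dimension)
EMBEDDING_SIZES = {
    'city_id': (39000, 256),
    'booker_country': (5, 3),
}

# get_base_configuration() derives the network widths from the city embedding:
output_size = int(EMBEDDING_SIZES['city_id'][0])  # vocabulary size -> softmax size
hidden_size = int(EMBEDDING_SIZES['city_id'][1])  # embedding dim -> RNN hidden size

Matching the hidden size to the city embedding dimension is presumably what enables tie_embedding_and_projection: the output projection can share weights with the city embedding matrix only if their dimensions agree.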

def get_experiments() -> List[Dict]

An experiment is a dict of parameters that override the base configuration. The returned list enumerates the full cross-product of model, weight and recurrent types, with and without tied embedding and projection weights.
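
A self-contained sketch of the cross-product with toy two-member enums (the project's real enums in recsys.types may have different members; only MANY_TO_MANY, UNWEIGHTED and GRU are confirmed by the source):

import itertools
from enum import Enum

# Stand-ins for the recsys.types enums; member lists are assumptions.
class ModelType(Enum):
    MANY_TO_MANY = 'many_to_many'
    MANY_TO_ONE = 'many_to_one'

class WeightType(Enum):
    UNWEIGHTED = 'unweighted'
    WEIGHTED = 'weighted'

class RecurrentType(Enum):
    GRU = 'gru'
    LSTM = 'lstm'

params = ['model_type', 'weight_type', 'recurrent_type', 'tie_embedding_and_projection']
grid = [
    dict(zip(params, p))
    for p in itertools.product(*map(list, [ModelType, WeightType, RecurrentType, [True, False]]))
]
print(len(grid))  # 2 * 2 * 2 * 2 = 16 experiments for these toy enums
print(grid[0])    # {'model_type': <ModelType.MANY_TO_MANY: ...>, ..., 'tie_embedding_and_projection': True}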

def get_model_performance_data(test_set: pandas.core.frame.DataFrame, dataset_encoder: DatasetEncoder, model_hashes: Optional[List[str]] = None) -> Tuple[pandas.core.frame.DataFrame, Dict]

Evaluate each trained model on the test set, both as a single checkpoint and as an ensemble of all fold checkpoints. Returns a results DataFrame and a dict of accuracy@4 by sequence position.
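
A usage sketch, reusing the preparation steps from the module's __main__ block; '<model_hash>' is a placeholder, not a real hash:

dataset = build_dataset(reserved_obs=30000)
de = DatasetEncoder(config.FEATURES_TO_ENCODE)
de.fit_transform(dataset)
set_future_features(dataset)
dataset = dataset[~dataset.next_city_id.isna()]
test_set = get_test_set_from_dataset(dataset)

# Evaluate every trained model found on disk:
results, acc4_by_length = get_model_performance_data(test_set, de)

# Or restrict evaluation to specific model hashes:
results_subset, _ = get_model_performance_data(test_set, de,
                                               model_hashes=['<model_hash>'])

The second return value is keyed by (model_hash, 'single' | 'ensemble') and maps to accuracy@4 broken down by position in the trip.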

def run_experiments(base_configuration: Dict, experiments: List[Dict], n_models: int, dataset_batches: List[Dict[str, torch.Tensor]], train_set: pandas.core.frame.DataFrame, skip_checkpoint=False) -> None

Run one experiment per override dict in experiments, merging each set of overrides on top of base_configuration before training.
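
The merge semantics are plain dict(base, **overrides): keys in the override dict win, everything else is inherited. A small sketch using only keys that exist in the base configuration:

base = get_base_configuration()
overrides = {'dropout': 0.5, 'n_layers': 3}

merged = dict(base, **overrides)
assert merged['dropout'] == 0.5                    # overridden
assert merged['n_layers'] == 3                     # overridden
assert merged['batch_size'] == base['batch_size']  # inherited from the base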
