From 80ad3e58e3fb1a74ab5530d8bc42b8cac8058aa5 Mon Sep 17 00:00:00 2001
From: CoprDistGit
Date: Fri, 5 May 2023 13:13:11 +0000
Subject: automatic import of python-e2eml
---
 .gitignore        |    1 +
 python-e2eml.spec | 2082 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 sources           |    1 +
 3 files changed, 2084 insertions(+)
 create mode 100644 python-e2eml.spec
 create mode 100644 sources

diff --git a/.gitignore b/.gitignore
index e69de29..9e9f043 100644
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1 @@
+/e2eml-4.14.20.tar.gz

diff --git a/python-e2eml.spec b/python-e2eml.spec
new file mode 100644
index 0000000..a9eb05c
--- /dev/null
+++ b/python-e2eml.spec
@@ -0,0 +1,2082 @@
+%global _empty_manifest_terminate_build 0
+Name:		python-e2eml
+Version:	4.14.20
+Release:	1
+Summary:	An end-to-end solution for automl
+License:	GPL-3.0-only
+URL:		https://github.com/ThomasMeissnerDS/e2e_ml
+Source0:	https://mirrors.nju.edu.cn/pypi/web/packages/0c/d2/3d4b828278589463bda0e825353de6fa485228b0cdbee1a446ac3ca524bb/e2eml-4.14.20.tar.gz
+BuildArch:	noarch
+
+Requires:	python3-psutil
+Requires:	python3-boostaroota
+Requires:	python3-catboost
+Requires:	python3-category_encoders
+Requires:	python3-datasets
+Requires:	python3-dill
+Requires:	python3-imbalanced-learn
+Requires:	python3-lightgbm
+Requires:	python3-matplotlib
+Requires:	python3-ngboost
+Requires:	python3-nltk
+Requires:	python3-numpy
+Requires:	python3-optuna
+Requires:	python3-pandas
+Requires:	python3-plotly
+Requires:	python3-pytorch_tabnet
+Requires:	python3-seaborn
+Requires:	python3-scikit-learn
+Requires:	python3-scipy
+Requires:	python3-shap
+Requires:	python3-spacy
+Requires:	python3-textblob
+Requires:	python3-torch
+Requires:	python3-transformers
+Requires:	python3-vowpalwabbit
+Requires:	python3-xgboost
+Requires:	python3-cupy
+Requires:	python3-cython
+Requires:	python3-ipython
+Requires:	python3-notebook
+
+%description
+# e2e ML
+
+> An end-to-end solution for automl.
+
+Pass in your data, add some information about it and get a full pipeline in
+return. Data preprocessing, feature creation, modelling and evaluation with just
+a few lines of code.
+
+![Header image](header.png)
+
+## Contents
+
+* [Installation](#installation)
+* [Usage example](#usage-example)
+* [Linting and Pre-Commit](#linting-and-pre-commit)
+* [Disclaimer](#disclaimer)
+* [Development](#development)
+  * [Adding or Removing Dependencies](#adding-or-removing-dependencies)
+  * [Building and Publishing](#building-and-publishing)
+  * [Documentation](#documentation)
+  * [Pull Requests](#pull-requests)
+* [Release History](#release-history)
+* [References](#references)
+* [Meta](#meta)
+
+## Installation
+
+From PyPI:
+
+```sh
+pip install e2eml
+```
+
+We highly recommend creating a new virtual environment first and then installing
+e2e-ml into it. In that environment, also download the pretrained spacy model
+(an example command is shown further below). Otherwise e2eml will do this
+automatically at runtime.
+
+e2eml can also be installed into a RAPIDS environment. For this we recommend
+creating a fresh environment following the [RAPIDS](https://rapids.ai/start.html)
+instructions. After creating and activating the environment, a special
+installation is needed to avoid installation issues.
+
+Just run:
+
+```sh
+pip install e2eml[rapids]
+```
+
+This will additionally install cupy and cython to prevent issues. You also need
+to follow the Pytorch [installation instructions](https://pytorch.org/get-started/locally/).
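+
+Putting these steps together, a rough sketch of the RAPIDS-flavoured install could
+look like the following. The exact PyTorch command depends on your CUDA version and
+should be taken from the PyTorch site; the spacy model name is only an example:
+
+```sh
+# inside a fresh, activated RAPIDS conda environment
+pip install e2eml[rapids]
+# install PyTorch with the command generated at https://pytorch.org/get-started/locally/
+# then optionally pre-download a pretrained spacy model, e.g.:
+python -m spacy download en_core_web_sm
+```
+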
+When installing RAPIDS, Pytorch & Spacy for GPU, it is recommended to check the
+supported CUDA versions of all three. If Pytorch related parts fail at runtime,
+it is recommended to set up a fresh environment and install Pytorch using pip
+rather than conda.
+
+```sh
+# spacy also supports GPU acceleration
+pip install -U spacy[cuda112]  # cuda112 depends on your actual cuda version, see: https://spacy.io/usage
+```
+
+Otherwise Pytorch will fail when trying to run on GPU.
+
+If e2eml should be installed together with Jupyter core and ipython, please
+install with:
+
+```sh
+pip install e2eml[full]
+```
+
+instead.
+
+## Usage example
+
+e2e has been designed to create state-of-the-art machine learning pipelines with
+a few lines of code. A basic usage example:
+
+```python
+import e2eml
+from e2eml.classification import classification_blueprints
+import pandas as pd
+# import data
+df = pd.read_csv("Your.csv")
+
+# split into a test/train & holdout set (holdout for prediction illustration here, but not required at all)
+train_df = df.head(1000).copy()
+holdout_df = df.tail(200).copy()
+# save the holdout dataset's target for later and delete it from the holdout dataset
+target = "target_column"
+holdout_target = holdout_df[target].copy()
+del holdout_df[target]
+
+# instantiate the needed blueprints class
+from e2eml.classification import classification_blueprints  # regression blueprints are available via "from e2eml.regression import regression_blueprints"
+test_class = classification_blueprints.ClassificationBluePrint(datasource=train_df,
+                        target_variable=target,
+                        train_split_type='cross',
+                        rapids_acceleration=True,  # if installed into a conda environment with NVIDIA RAPIDS, this can be used to accelerate preprocessing with the GPU
+                        preferred_training_mode='auto',  # 'auto' will automatically identify whether LGBM & Xgboost can use GPU acceleration*
+                        tune_mode='accurate'  # hyperparameter sets will be validated with 10-fold CV. Set this to 'simple' for 1-fold CV
+                        #categorical_columns=cat_columns  # you can define categorical columns, otherwise e2e does this automatically
+                        #date_columns=date_columns  # you can also define date columns (expected format is YYYY-MM-DD)
+                        )
+
+"""
+*
+'auto' is recommended for the preferred_training_mode parameter, but with 'CPU' and 'GPU' it can also be controlled manually.
+If you install Xgboost & LGBM into the same environment as GPU accelerated versions, you can set preferred_training_mode='gpu'.
+This will massively improve training times and speed up SHAP feature importance for LGBM and Xgboost related tasks.
+For Xgboost this should work out of the box, if installed into a RAPIDS environment.
+"""
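+# Optional and only illustrative: individual tuning settings can be overwritten
+# key by key before running a blueprint (these attributes are documented further
+# below; the values here are assumptions, not recommended defaults).
+test_class.hyperparameter_tuning_rounds["xgboost"] = 50
+test_class.hyperparameter_tuning_max_runtime_secs["xgboost"] = 60 * 60
+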
+""" +# run actual blueprint +test_class.ml_bp01_multiclass_full_processing_xgb_prob() + +""" +When choosing blueprints several options are available: + +Multiclass blueprints can handle binary and multiclass tasks: +- ml_bp00_train_test_binary_full_processing_log_reg_prob() +- ml_bp01_multiclass_full_processing_xgb_prob() +- ml_bp02_multiclass_full_processing_lgbm_prob() +- ml_bp03_multiclass_full_processing_sklearn_stacking_ensemble() +- ml_bp04_multiclass_full_processing_ngboost() +- ml_bp05_multiclass_full_processing_vowpal_wabbit +- ml_bp06_multiclass_full_processing_bert_transformer() # for NLP specifically +- ml_bp07_multiclass_full_processing_tabnet() +- ml_bp08_multiclass_full_processing_ridge() +- ml_bp09_multiclass_full_processing_catboost() +- ml_bp10_multiclass_full_processing_sgd() +- ml_bp11_multiclass_full_processing_quadratic_discriminant_analysis() +- ml_bp12_multiclass_full_processing_svm() +- ml_bp13_multiclass_full_processing_multinomial_nb() +- ml_bp14_multiclass_full_processing_lgbm_focal() +- ml_bp16_multiclass_full_processing_neural_network() # offers fully connected ANN & 1D CNN +- ml_special_binary_full_processing_boosting_blender() +- ml_special_multiclass_auto_model_exploration() +- ml_special_multiclass_full_processing_multimodel_max_voting() + +There are regression blueprints as well (in regression module): +- ml_bp10_train_test_regression_full_processing_linear_reg() +- ml_bp11_regression_full_processing_xgboost() +- ml_bp12_regressions_full_processing_lgbm() +- ml_bp13_regression_full_processing_sklearn_stacking_ensemble() +- ml_bp14_regressions_full_processing_ngboost() +- ml_bp15_regression_full_processing_vowpal_wabbit_reg() +- ml_bp16_regressions_full_processing_bert_transformer() +- ml_bp17_regression_full_processing_tabnet_reg() +- ml_bp18_regression_full_processing_ridge_reg() +- ml_bp19_regression_full_processing_elasticnet_reg() +- ml_bp20_regression_full_processing_catboost() +- ml_bp20_regression_full_processing_sgd() +- ml_bp21_regression_full_processing_ransac() +- ml_bp22_regression_full_processing_svm() +- ml_bp23_regressions_full_processing_neural_network() # offers fully connected ANN & 1D CNN +- ml_special_regression_full_processing_multimodel_avg_blender() +- ml_special_regression_auto_model_exploration() + +In the time series module we recently embedded blueprints as well: +- ml_bp100_univariate_timeseries_full_processing_auto_arima() +- ml_bp101_multivariate_timeseries_full_processing_lstm() +- ml_bp102_multivariate_timeseries_full_processing_tabnet() +- ml_bp103_multivariate_timeseries_full_processing_rnn() +- ml_bp104_univariate_timeseries_full_processing_holt_winters() + +Time series blueprints use less preprocessing on default and cannot use all options like +classification and regression models. Non-time series algorithms like TabNet are different +to their regression counterpart as cross validation is replaced by time series splits and +data scaling covers the target variable as well. 
+
+In ensembles, the participating algorithms can be chosen via the class attribute:
+test_class.special_blueprint_algorithms = {"ridge": True,
+                                           "elasticnet": False,
+                                           "xgboost": True,
+                                           "ngboost": True,
+                                           "lgbm": True,
+                                           "tabnet": False,
+                                           "vowpal_wabbit": True,
+                                           "sklearn_ensemble": True,
+                                           "catboost": False
+                                           }
+
+Preprocessing steps can be selected as well:
+test_class.blueprint_step_selection_non_nlp = {
+    "automatic_type_detection_casting": True,
+    "remove_duplicate_column_names": True,
+    "reset_dataframe_index": True,
+    "fill_infinite_values": True,
+    "early_numeric_only_feature_selection": True,
+    "delete_high_null_cols": True,
+    "data_binning": True,
+    "regex_clean_text_data": False,
+    "handle_target_skewness": False,
+    "datetime_converter": True,
+    "pos_tagging_pca": False, # slow with many categories
+    "append_text_sentiment_score": False,
+    "tfidf_vectorizer_to_pca": False, # slow with many categories
+    "tfidf_vectorizer": False,
+    "rare_feature_processing": True,
+    "cardinality_remover": True,
+    "categorical_column_embeddings": False,
+    "holistic_null_filling": True, # slow
+    "numeric_binarizer_pca": True,
+    "onehot_pca": True,
+    "category_encoding": True,
+    "fill_nulls_static": True,
+    "autoencoder_outlier_detection": True,
+    "outlier_care": True,
+    "delete_outliers": False,
+    "remove_collinearity": True,
+    "skewness_removal": True,
+    "automated_feature_transformation": False,
+    "random_trees_embedding": False,
+    "clustering_as_a_feature_dbscan": True,
+    "clustering_as_a_feature_kmeans_loop": True,
+    "clustering_as_a_feature_gaussian_mixture_loop": True,
+    "pca_clustering_results": True,
+    "svm_outlier_detection_loop": False,
+    "autotuned_clustering": False,
+    "reduce_memory_footprint": False,
+    "scale_data": True,
+    "smote": False,
+    "automated_feature_selection": True,
+    "bruteforce_random_feature_selection": False, # slow
+    "autoencoder_based_oversampling": False,
+    "synthetic_data_augmentation": False,
+    "final_pca_dimensionality_reduction": False,
+    "final_kernel_pca_dimensionality_reduction": False,
+    "delete_low_variance_features": False,
+    "shap_based_feature_selection": False,
+    "delete_unpredictable_training_rows": False,
+    "trained_tokenizer_embedding": False,
+    "sort_columns_alphabetically": True,
+    "use_tabular_gan": False,
+    }
+
+The bruteforce_random_feature_selection step is experimental, but has shown promising results. The number of
+trials can be controlled via test_class.hyperparameter_tuning_rounds["bruteforce_random"] = 400.
+This step is useful if the model overfitted (which should happen rarely), because too many features with too
+little feature importance have been considered.
+
+Generally the class instance is a control center and gives room for plenty of customization.
+Never update the class attributes by reassigning the whole dictionary as shown below; the snippets only
+document the available keys and typical values.
+
+test_class.tabnet_settings = {"batch_size": rec_batch_size,
+                              "virtual_batch_size": virtual_batch_size,
+                              # pred batch size?
+                              "num_workers": 0,
+                              "max_epochs": 1000}
+
+test_class.hyperparameter_tuning_rounds = {
+    "xgboost": 100,
+    "lgbm": 500,
+    "lgbm_focal": 50,
+    "tabnet": 25,
+    "ngboost": 25,
+    "sklearn_ensemble": 10,
+    "ridge": 500,
+    "elasticnet": 100,
+    "catboost": 25,
+    "sgd": 2000,
+    "svm": 50,
+    "svm_regression": 50,
+    "ransac": 50,
+    "multinomial_nb": 100,
+    "bruteforce_random": 400,
+    "synthetic_data_augmentation": 100,
+    "autoencoder_based_oversampling": 200,
+    "final_kernel_pca_dimensionality_reduction": 50,
+    "final_pca_dimensionality_reduction": 50,
+    "auto_arima": 50,
+    "holt_winters": 50,
+    }
+
+test_class.hyperparameter_tuning_max_runtime_secs = {
+    "xgboost": 2 * 60 * 60,
+    "lgbm": 2 * 60 * 60,
+    "lgbm_focal": 2 * 60 * 60,
+    "tabnet": 2 * 60 * 60,
+    "ngboost": 2 * 60 * 60,
+    "sklearn_ensemble": 2 * 60 * 60,
+    "ridge": 2 * 60 * 60,
+    "elasticnet": 2 * 60 * 60,
+    "catboost": 2 * 60 * 60,
+    "sgd": 2 * 60 * 60,
+    "svm": 2 * 60 * 60,
+    "svm_regression": 2 * 60 * 60,
+    "ransac": 2 * 60 * 60,
+    "multinomial_nb": 2 * 60 * 60,
+    "bruteforce_random": 2 * 60 * 60,
+    "synthetic_data_augmentation": 1 * 60 * 60,
+    "autoencoder_based_oversampling": 2 * 60 * 60,
+    "final_kernel_pca_dimensionality_reduction": 4 * 60 * 60,
+    "final_pca_dimensionality_reduction": 2 * 60 * 60,
+    "auto_arima": 2 * 60 * 60,
+    "holt_winters": 2 * 60 * 60,
+    }
+
+When these parameters have to be updated, please overwrite the keys individually so as not to break the blueprints.
+I.e. test_class.hyperparameter_tuning_max_runtime_secs["xgboost"] = 12*60*60 would work fine.
+
+Working with big data can push any hardware to its limits. e2eml has been tested with:
+- Ryzen 5950x (16 cores CPU)
+- Geforce RTX 3090 (24GB VRAM)
+- 64GB RAM
+With these specs, e2eml has been able to stably process roughly 100k rows with 200 columns for non-blended
+blueprints. Blended blueprints consume more resources, as e2eml currently keeps the trained models in memory.
+
+For datasets bigger than 100k rows it is possible to limit the amount of data used in various preprocessing steps:
+- test_class.feature_selection_sample_size = 100000 # for feature selection
+- test_class.hyperparameter_tuning_sample_size = 100000 # for model hyperparameter optimization
+- test_class.brute_force_selection_sample_size = 15000 # for an experimental feature selection
+
+For binary classification a sample size of 100k datapoints is sufficient in most cases.
+The hyperparameter tuning sample size can be much smaller,
+depending on class imbalance.
+
+For multiclass tasks we recommend starting with small samples, as algorithms like Xgboost and LGBM quickly
+grow in memory consumption with a growing number of classes. LGBM focal or the neural network blueprints
+are good starting points here.
+
+Whenever classes are imbalanced (binary & multiclass) we recommend using the preprocessing step
+"autoencoder_based_oversampling".
+"""
+# After running the blueprint the pipeline is done. It can be saved with:
+save_to_production(test_class, file_name='automl_instance')
+
+# The blueprint can be loaded with
+loaded_test_class = load_for_production(file_name='automl_instance')
+
+# predict on new data (in this case our holdout) with the loaded blueprint
+loaded_test_class.ml_bp01_multiclass_full_processing_xgb_prob(holdout_df)
+
+# predictions can be accessed via a class attribute
+print(loaded_test_class.predicted_classes['xgboost'])
+```
+
+## Linting and Pre-Commit
+
+This project uses pre-commit to enforce style.
+
+To install the pre-commit hooks, first install pre-commit into the project's
+virtual environment:
+
+```sh
+pip install pre-commit
+```
+
+Then install the project hooks:
+
+```sh
+pre-commit install
+```
+
+Now, whenever you make a commit, the linting and autoformatting will
+automatically run.
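+
+If you want to run the hooks against the whole codebase without making a commit
+(for example right after installing them), pre-commit can also be invoked manually;
+a typical call looks like this:
+
+```sh
+# run all configured hooks on all files, not just on staged changes
+pre-commit run --all-files
+```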
+
+## Disclaimer
+
+e2e is not designed to quickly iterate over several algorithms and suggest the
+best one to you. It is made to deliver state-of-the-art performance as ready-to-go
+blueprints. e2e-ml blueprints contain:
+
+* preprocessing (outlier, rare feature, datetime, categorical and NLP handling)
+* feature creation (binning, clustering, categorical and NLP features)
+* automated feature selection
+* model training (with crossfold validation)
+* automated hyperparameter tuning
+* model evaluation
+
+This comes at the cost of runtime. Depending on your data, we recommend strong
+hardware.
+
+## Development
+
+This project uses [poetry](https://python-poetry.org/).
+
+To install the project for development, run:
+
+```sh
+poetry install
+```
+
+This will install all dependencies and development dependencies into a virtual
+environment.
+
+### Adding or Removing Dependencies
+
+To add or remove a dependency, use `poetry add <package name>` or
+`poetry remove <package name>` respectively. Use the `--dev` flag for development
+dependencies.
+
+### Building and Publishing
+
+To build and publish the project, run:
+
+```sh
+poetry publish --build
+```
+
+### Documentation
+
+This project comes with documentation. To build the docs, run:
+
+```sh
+cd docs
+make docs
+```
+
+You may then browse the HTML docs at `docs/build/docs/index.html`.
+
+### Pull Requests
+
+We welcome Pull Requests! Please make a PR against the `develop` branch.
+
+## Release History
+
+* 4.14.0
+    * Updated Python version support to also cover 3.9
+    * Updated import for Pandas' SettingWithCopyWarning warnings
+* 4.12.00
+    * Added fully connected NN for regression with quantile loss
+    * Fixed wrong assignment in RNN model
+    * Adjusted default preprocessing steps for regression tasks
+    * Shuffling is disabled automatically for all time_series ml_task instances
+    * LSTM & RNN default settings will automatically adjust to a more complex architecture
+      if more than 50 features have been detected
+* 4.00.50
+    * Added Auto-ARIMA & Holt-Winters for univariate time series predictions
+    * Added LSTM & RNN for uni- & multivariate time series prediction
+    * Autotuned NNs, LSTM and NLP transformers got an extra setting to set how
+      many models shall be created
+    * All tabular NNs (except NLPs) store predicted probabilities now
+      (binary classifiers will blend them when
+      creation of multiple models has been specified)
+    * Optimized preprocessing order
+* 3.02.00
+    * Refined GAN architectures
+    * Categorical encoding can be chosen via the cat_encoder_model attribute now
+    * Fixed a bug when choosing onehot encoding
+    * Optimized autoencoder based oversampling for regression
+    * Added Autoencoder based oversampling
+    * Optimized clustering performance
+* 2.50
+    * Added tabular GAN (experimental)
+    * Minor bug fixes
+* 2.13
+    * Added neural networks (ANN & soft ordered 1d-CNN) for tabular data
+    * Added attribute global_random_state to set the state for all instances
+    * Added attribute shuffle_during_training to be able to disable shuffling
+      during model training (does not apply to all models)
+* 2.12
+    * Added RAPIDS support for SVM regression
+    * Updated Xgboost loss function for regression
+    * Fixed a bug in cardinality removal
+* 2.11
+    * Added datasets library to dependencies
+    * Calculation of feature importance can be controlled via the class instance now.
+      This is helpful when using TF-IDF matrices, where the 10-fold permutation test
+      runs out of memory
+    * Fixed loading of BERT weights from a manual path
+    * DEESC parameters can be controlled via class attributes now
+    * Fixed a bug with LGBM on regression tasks
+    * Adjusted RAPIDS based clustering for use with RAPIDS version 21.12
+    * Added RAPIDS as accelerator for feature transformation exploration
+    * Performance optimization for clustering & numerical binarizer
+    * Added random states to clustering & PCA implementations
+    * Improved scaling
+    * Stabilized TabNet for regression
+* 2.10.04
+    * Adjusted dependency for SHAP
+    * Fixed a bug where early numeric feature selection failed due to
+      the absence of numerical features
+* 2.10.03
+    * Adjusted dependencies for Pandas, Spacy, Optuna, Setuptools, Transformers
+* 2.10.01
+    * Added references & citations to Readme
+    * Added is_imbalanced flag to Timewalk
+    * Removed babel from dependencies & updated some of them
+* 2.9.96
+    * Timewalk got adjustments
+    * Fixed a bug where row deletion has been incompatible with Tabnet
+* 2.9.95
+    * SHAP based feature selection increased to 20 folds (from 10)
+    * fewer unnecessary print outs
+* 2.9.93
+    * Added SHAP based feature selection
+    * Removed Xgboost from Timewalk as default due to computational and runtime costs
+    * Suppress all warnings of LGBM focal during multiclass tasks
+* 2.9.92
+    * e2eml uses poetry
+    * introduction of Github actions to check linting
+    * bug fix of LGBM focal failing due to missing hyperparameter tuning specifications
+    * preparation for Readthedocs implementation
+* 2.9.9
+    * Added Multinomial Bayes Classifier
+    * Added SVM for regression
+    * Refined Sklearn ensembles
+* 2.9.8
+    * Added Quadratic Discriminant Analysis
+    * Added Support Vector Machines
+    * Added Ransac regressor
+* 2.9.7
+    * updated Plotly dependency to 5.4.0
+    * Improved Xgboost for imbalanced data
+* 2.9.6
+    * Added TimeTravel and timewalk: TimeTravel will save the class instance after
+      each preprocessing step, timewalk will automatically try different
+      preprocessing steps with different algorithms to find the best combination
+    * Updated dependencies to use newest versions of scikit-learn and
+      category-encoders
+* 2.9.0
+    * bug fixes with synthetic data augmentation for regression
+    * bug fix of target encoding during regression
+    * enhanced hyperparameter space for autoencoder based oversampling
+    * added final PCA dimensionality reduction as optional preprocessing step
+* 2.8.1
+    * autoencoder based oversampling will go through hyperparameter tuning first
+      (for each class individually)
+    * optimized TabNet performance
+* 2.7.5
+    * added oversampling based on a variational autoencoder (experimental)
+* 2.7.4
+    * fixed target encoding for multiclass classification
+    * improved performance on multiclass tasks
+    * improved Xgboost & TabNet performance on binary classification
+    * added auto-tuned clustering as a feature
+* 2.6.3
+    * small bugfixes
+* 2.6.1
+    * Hyperparameter tuning happens on a sample of the train data from now on
+      (the sample size can be controlled)
+    * An experimental feature has been added, which tries to find unpredictable
+      training data rows and delete them from the training (this accelerates
+      training, but costs a bit of model performance)
+    * Blueprints can be accelerated with Nvidia RAPIDS (works on clustering only
+      for now)
+* 2.5.9
+    * optimized loss function for TabNet
+* 2.5.1
+    * Optimized loss function for synthetic data augmentation
+    * Adjusted library dependencies
+    * Improved target encoding
+* 2.3.1
+    * Changed feature selection backend from Xgboost to LGBM
+    * POS tagging is off by default from this version
+* 2.2.9
+    * bug fixes
+    * added an experimental feature to optimize training data with synthetic data
+    * added optional early feature selection (numeric only)
+* 2.2.2
+    * transformers can be loaded into Google Colab from Gdrive
+* 2.1.2
+    * Improved TFIDF vectorizer performance & non transformer NLP applications
+    * Improved POS tagging stability
+* 2.1.1
+    * Completely overworked preprocessing setup (changed API). Preprocessing
+      blueprints can be customized through a class attribute now
+    * Completely overworked special multimodel blueprints. The participating
+      algorithms can be customized through a class attribute now
+    * Improved NULL handling & regression performance
+    * Added Catboost & Elasticnet
+    * Updated Readme
+    * First unittests
+    * Added Stochastic Gradient classifier & regressor
+* 1.8.2
+    * Added Ridge classifier and regression as new blueprints
+* 1.8.1
+    * Added another layer of feature selection
+* 1.8.0
+    * Transformer padding length will be max text length + 20% instead of a static
+      300
+    * Transformers use AutoModelForSequenceClassification instead of hardcoded
+      transformers now
+    * Hyperparameter tuning rounds and timeout can be controlled globally via
+      class attributes now
+* 1.7.8
+    * Instead of a global probability threshold, e2eml stores a threshold for each
+      tested model
+    * Deprecated binary boosting blender due to lack of performance
+    * Added filling of inf values
+* 1.7.3
+    * Improved preprocessing
+    * Improved regression performance
+    * Deprecated regression boosting blender and replaced it by a multi
+      model/architecture blender
+    * Transformers can optionally discard the worst models, but will keep all 5 by
+      default
+    * e2eml should be installable on Amazon Sagemaker now
+* 1.7.0
+    * Added TabNet classifier and regressor with automated hyperparameter
+      optimization
+* 1.6.5
+    * improvements of NLP transformers
+* 1.5.8
+    * Fixed a bug around preprocessing_type='nlp'
+    * replaced pickle with dill for saving and loading objects
+* 1.5.3
+    * Added transformer blueprints for NLP classification and regression
+    * renamed the Vowpal Wabbit blueprint to fit into the blueprint naming convention
+    * Created "extras" options for library installation: 'rapids' installs extras,
+      so e2eml can be installed into a rapids environment, while 'jupyter'
+      adds jupyter core and ipython. 'full' installs all of them.
+* 1.3.9
+    * Fixed issue with automated GPU-acceleration detection and flagging
+    * Fixed avg regression blueprint where the eval function tried to call
+      classification evaluation
+    * Moved the POS tagging + PCA step into the non-NLP pipeline as it showed good results
+      in general
+    * improved the NLP part (more and better feature engineering and preprocessing) of
+      blueprints for better performance
+    * Added Vowpal Wabbit for classification and regression and replaced the stacking
+      ensemble in automated model exploration by Vowpal Wabbit as well
+    * Set random_state for train_test splits for consistency
+    * Pinned sklearn dependency to 0.22.0 due to a six import error
+* 1.0.1
+    * Optimized package requirements
+    * Pinned LGBM requirement to version 3.1.0 due to the bug "LightGBMError: bin
+      size 257 cannot run on GPU #3339"
+* 0.9.9
+    * Enabled tune_mode parameter during class instantiation.
+    * Updated docstrings across all functions and changed model defaults.
+ * Multiple bug fixes (LGBM regression accurate mode, label encoding and + permutation tests). + * Enhanced user information & better ROC_AUC display + * Added automated GPU detection for LGBM and Xgboost. + * Added functions to save and load blueprints + * architectural changes (preprocessing organized in blueprints as well) +* 0.9.4 + * First release with classification and regression blueprints. (not available + anymore) + +## References + +* Focal loss + * [Focal loss for LGBM](https://maxhalford.github.io/blog/lightgbm-focal-loss/#first-order-derivative) + * [Focal loss for LGBM multiclass](https://towardsdatascience.com/multi-class-classification-using-focal-loss-and-lightgbm-a6a6dec28872) +* Autoencoder + * [Variational Autoencoder for imbalanced data](https://github.com/lschmiddey/Autoencoder/blob/master/VAE_for_imbalanced_data.ipynb) +* Target Encoding + * [Target encoding for multiclass](https://towardsdatascience.com/target-encoding-for-multi-class-classification-c9a7bcb1a53) +* Pytorch-TabNet + * [Arik, S. O., & Pfister, T. (2019). TabNet: Attentive Interpretable Tabular Learning. arXiv preprint arXiv:1908.07442.](https://arxiv.org/pdf/1908.07442.pdf) + * [Implementing TabNet in Pytorch](https://towardsdatascience.com/implementing-tabnet-in-pytorch-fc977c383279) +* Ngboost + * [NGBoost: Natural Gradient Boosting for Probabilistic Prediction, arXiv:1910.03225](https://arxiv.org/abs/1910.03225) +* Vowpal Wabbit + * [Vowpal Wabbit Research overview](https://vowpalwabbit.org/research.html) + +## Meta + +Creator: Thomas Meißner – [LinkedIn](https://www.linkedin.com/in/thomas-mei%C3%9Fner-m-a-3808b346) + +Consultant: Gabriel Stephen Alexander – [Github](https://github.com/bitsofsteve) + +Special thanks to: Alex McKenzie - [LinkedIn](https://de.linkedin.com/in/alex-mckenzie) + +[e2eml Github repository](https://github.com/ThomasMeissnerDS/e2e_ml) + + +%package -n python3-e2eml +Summary: An end-to-end solution for automl +Provides: python-e2eml +BuildRequires: python3-devel +BuildRequires: python3-setuptools +BuildRequires: python3-pip +%description -n python3-e2eml +# e2e ML + +> An end to end solution for automl. + +Pass in your data, add some information about it and get a full pipelines in +return. Data preprocessing, feature creation, modelling and evaluation with just +a few lines of code. + +![Header image](header.png) + +## Contents + + + +* [Installation](#installation) +* [Usage example](#usage-example) +* [Linting and Pre-Commit](#linting-and-pre-commit) +* [Disclaimer](#disclaimer) +* [Development](#development) + * [Adding or Removing Dependencies](#adding-or-removing-dependencies) + * [Building and Publishing](#building-and-publishing) + * [Documentation](#documentation) + * [Pull Requests](#pull-requests) +* [Release History](#release-history) +* [References](#references) +* [Meta](#meta) + + + +## Installation + +From PyPI: + +```sh +pip install e2eml +``` + +We highly recommend to create a new virtual environment first. Then install +e2e-ml into it. In the environment also download the pretrained spacy model +with. Otherwise e2eml will do this automatically during runtime. + +e2eml can also be installed into a RAPIDS environment. For this we recommend to +create a fresh environment following [RAPIDS](https://rapids.ai/start.html) +instructions. After environment installation and activation, a special +installation is needed to not run into installation issues. 
+ +Just run: + +```sh +pip install e2eml[rapids] +``` + +This will additionally install cupy and cython to prevent issues. Additionally +it is needed to follow Pytorch [installation instructions](https://pytorch.org/get-started/locally/). +When installing RAPIDs, Pytorch & Spacy for GPU, it is recommended to look +for supported Cuda versions in +all three. If Pytorch related parts fail on runtime, it is recommended to +reinstall a new environment and install Pytorch using pip rather than conda. + +```sh +# also spacy supports GPU acceleration +pip install -U spacy[cuda112] #cuda112 depends on your actual cuda version, see: https://spacy.io/usage +``` + +Otherwise Pytorch will fail trying to run on GPU. + +If e2eml shall be installed together with Jupyter core and ipython, please +install with: + +```sh +pip install e2eml[full] +``` + +instead. + +## Usage example + +e2e has been designed to create state-of-the-art machine learning pipelines with +a few lines of code. Basic example of usage: + +```python +import e2eml +from e2eml.classification import classification_blueprints +import pandas as pd +# import data +df = pd.read_csv("Your.csv") + +# split into a test/train & holdout set (holdout for prediction illustration here, but not required at all) +train_df = df.head(1000).copy() +holdout_df = df.tail(200).copy() # make sure +# saving the holdout dataset's target for later and delete it from holdout dataset +target = "target_column" +holdout_target = holdout_df[target].copy() +del holdout_df[target] + +# instantiate the needed blueprints class +from classification import classification_blueprints # regression bps are available with from regression import regression_blueprints +test_class = classification_blueprints.ClassificationBluePrint(datasource=train_df, + target_variable=target, + train_split_type='cross', + rapids_acceleration=True, # if installed into a conda environment with NVIDIA Rapids, this can be used to accelerate preprocessing with GPU + preferred_training_mode='auto', # Auto will automatically identify, if LGBM & Xgboost can use GPU acceleration* + tune_mode='accurate' # hyperparameter sets will be validated with 10-fold CV Set this to 'simple' for 1-fold CV + #categorical_columns=cat_columns # you can define categorical columns, otherwise e2e does this automatically + #date_columns=date_columns # you can also define date columns (expected is YYYY-MM-DD format) + ) + +""" +* +'Auto' is recommended for preferred_training_mode parameter, but with 'CPU' and 'GPU' it can also be controlled manually. +If you install Xgboost & LGBM into the same environment as GPU accelerated versions, you can set preferred_training_mode='gpu'. +This will massively improve training times and speed up SHAP feature importance for LGBM and Xgboost related tasks. +For Xgboost this should work out of the box, if installed into a RAPIDS environment. 
+""" +# run actual blueprint +test_class.ml_bp01_multiclass_full_processing_xgb_prob() + +""" +When choosing blueprints several options are available: + +Multiclass blueprints can handle binary and multiclass tasks: +- ml_bp00_train_test_binary_full_processing_log_reg_prob() +- ml_bp01_multiclass_full_processing_xgb_prob() +- ml_bp02_multiclass_full_processing_lgbm_prob() +- ml_bp03_multiclass_full_processing_sklearn_stacking_ensemble() +- ml_bp04_multiclass_full_processing_ngboost() +- ml_bp05_multiclass_full_processing_vowpal_wabbit +- ml_bp06_multiclass_full_processing_bert_transformer() # for NLP specifically +- ml_bp07_multiclass_full_processing_tabnet() +- ml_bp08_multiclass_full_processing_ridge() +- ml_bp09_multiclass_full_processing_catboost() +- ml_bp10_multiclass_full_processing_sgd() +- ml_bp11_multiclass_full_processing_quadratic_discriminant_analysis() +- ml_bp12_multiclass_full_processing_svm() +- ml_bp13_multiclass_full_processing_multinomial_nb() +- ml_bp14_multiclass_full_processing_lgbm_focal() +- ml_bp16_multiclass_full_processing_neural_network() # offers fully connected ANN & 1D CNN +- ml_special_binary_full_processing_boosting_blender() +- ml_special_multiclass_auto_model_exploration() +- ml_special_multiclass_full_processing_multimodel_max_voting() + +There are regression blueprints as well (in regression module): +- ml_bp10_train_test_regression_full_processing_linear_reg() +- ml_bp11_regression_full_processing_xgboost() +- ml_bp12_regressions_full_processing_lgbm() +- ml_bp13_regression_full_processing_sklearn_stacking_ensemble() +- ml_bp14_regressions_full_processing_ngboost() +- ml_bp15_regression_full_processing_vowpal_wabbit_reg() +- ml_bp16_regressions_full_processing_bert_transformer() +- ml_bp17_regression_full_processing_tabnet_reg() +- ml_bp18_regression_full_processing_ridge_reg() +- ml_bp19_regression_full_processing_elasticnet_reg() +- ml_bp20_regression_full_processing_catboost() +- ml_bp20_regression_full_processing_sgd() +- ml_bp21_regression_full_processing_ransac() +- ml_bp22_regression_full_processing_svm() +- ml_bp23_regressions_full_processing_neural_network() # offers fully connected ANN & 1D CNN +- ml_special_regression_full_processing_multimodel_avg_blender() +- ml_special_regression_auto_model_exploration() + +In the time series module we recently embedded blueprints as well: +- ml_bp100_univariate_timeseries_full_processing_auto_arima() +- ml_bp101_multivariate_timeseries_full_processing_lstm() +- ml_bp102_multivariate_timeseries_full_processing_tabnet() +- ml_bp103_multivariate_timeseries_full_processing_rnn() +- ml_bp104_univariate_timeseries_full_processing_holt_winters() + +Time series blueprints use less preprocessing on default and cannot use all options like +classification and regression models. Non-time series algorithms like TabNet are different +to their regression counterpart as cross validation is replaced by time series splits and +data scaling covers the target variable as well. 
+ +In ensembles algorithms can be chosen via the class attribute: +test_class.special_blueprint_algorithms = {"ridge": True, + "elasticnet": False, + "xgboost": True, + "ngboost": True, + "lgbm": True, + "tabnet": False, + "vowpal_wabbit": True, + "sklearn_ensemble": True, + "catboost": False + } + +Also preprocessing steps can be selected: +test_class.blueprint_step_selection_non_nlp = { + "automatic_type_detection_casting": True, + "remove_duplicate_column_names": True, + "reset_dataframe_index": True, + "fill_infinite_values": True, + "early_numeric_only_feature_selection": True, + "delete_high_null_cols": True, + "data_binning": True, + "regex_clean_text_data": False, + "handle_target_skewness": False, + "datetime_converter": True, + "pos_tagging_pca": False, # slow with many categories + "append_text_sentiment_score": False, + "tfidf_vectorizer_to_pca": False, # slow with many categories + "tfidf_vectorizer": False, + "rare_feature_processing": True, + "cardinality_remover": True, + "categorical_column_embeddings": False, + "holistic_null_filling": True, # slow + "numeric_binarizer_pca": True, + "onehot_pca": True, + "category_encoding": True, + "fill_nulls_static": True, + "autoencoder_outlier_detection": True, + "outlier_care": True, + "delete_outliers": False, + "remove_collinearity": True, + "skewness_removal": True, + "automated_feature_transformation": False, + "random_trees_embedding": False, + "clustering_as_a_feature_dbscan": True, + "clustering_as_a_feature_kmeans_loop": True, + "clustering_as_a_feature_gaussian_mixture_loop": True, + "pca_clustering_results": True, + "svm_outlier_detection_loop": False, + "autotuned_clustering": False, + "reduce_memory_footprint": False, + "scale_data": True, + "smote": False, + "automated_feature_selection": True, + "bruteforce_random_feature_selection": False, # slow + "autoencoder_based_oversampling": False, + "synthetic_data_augmentation": False, + "final_pca_dimensionality_reduction": False, + "final_kernel_pca_dimensionality_reduction": False, + "delete_low_variance_features": False, + "shap_based_feature_selection": False, + "delete_unpredictable_training_rows": False, + "trained_tokenizer_embedding": False, + "sort_columns_alphabetically": True, + "use_tabular_gan": False, + } + +The bruteforce_random_feature_selection step is experimental. It showed promising results. The number of trials can be controlled. +This step is useful, if the model overfitted (which should happen rarely), because too many features with too little +feature importance have been considered. +like test_class.hyperparameter_tuning_rounds["bruteforce_random"] = 400 . + +Generally the class instance is a control center and gives room for plenty of customization. +Never update the class attributes like shown below. + +test_class.tabnet_settings = "batch_size": rec_batch_size, + "virtual_batch_size": virtual_batch_size, + # pred batch size? 
+ "num_workers": 0, + "max_epochs": 1000} + +test_class.hyperparameter_tuning_rounds = { + "xgboost": 100, + "lgbm": 500, + "lgbm_focal": 50, + "tabnet": 25, + "ngboost": 25, + "sklearn_ensemble": 10, + "ridge": 500, + "elasticnet": 100, + "catboost": 25, + "sgd": 2000, + "svm": 50, + "svm_regression": 50, + "ransac": 50, + "multinomial_nb": 100, + "bruteforce_random": 400, + "synthetic_data_augmentation": 100, + "autoencoder_based_oversampling": 200, + "final_kernel_pca_dimensionality_reduction": 50, + "final_pca_dimensionality_reduction": 50, + "auto_arima": 50, + "holt_winters": 50, + } + +test_class.hyperparameter_tuning_max_runtime_secs = { + "xgboost": 2 * 60 * 60, + "lgbm": 2 * 60 * 60, + "lgbm_focal": 2 * 60 * 60, + "tabnet": 2 * 60 * 60, + "ngboost": 2 * 60 * 60, + "sklearn_ensemble": 2 * 60 * 60, + "ridge": 2 * 60 * 60, + "elasticnet": 2 * 60 * 60, + "catboost": 2 * 60 * 60, + "sgd": 2 * 60 * 60, + "svm": 2 * 60 * 60, + "svm_regression": 2 * 60 * 60, + "ransac": 2 * 60 * 60, + "multinomial_nb": 2 * 60 * 60, + "bruteforce_random": 2 * 60 * 60, + "synthetic_data_augmentation": 1 * 60 * 60, + "autoencoder_based_oversampling": 2 * 60 * 60, + "final_kernel_pca_dimensionality_reduction": 4 * 60 * 60, + "final_pca_dimensionality_reduction": 2 * 60 * 60, + "auto_arima": 2 * 60 * 60, + "holt_winters": 2 * 60 * 60, + } + +When these parameters have to updated, please overwrite the keys individually to not break the blueprints eventually. +I.e.: test_class.hyperparameter_tuning_max_runtime_secs["xgboost"] = 12*60*60 would work fine. + +Working with big data can bring all hardware to it's needs. e2eml has been tested with: +- Ryzen 5950x (16 cores CPU) +- Geforce RTX 3090 (24GB VRAM) +- 64GB RAM +e2eml has been able to process 100k rows with 200 columns approximately using these specs stable for non-blended +blueprints. Blended blueprints consume more resources as e2eml keep the trained models in memory as of now. + +For data bigger than 100k rows it is possible to limit the amount of data for various preprocessing steps: +- test_class.feature_selection_sample_size = 100000 # for feature selection +- test_class.hyperparameter_tuning_sample_size = 100000 # for model hyperparameter optimization +- test_class.brute_force_selection_sample_size = 15000 # for an experimental feature selection + +For binary classification a sample size of 100k datapoints is sufficient in most cases. +Hyperparameter tuning sample size can be much less, +depending on class imbalance. + +For multiclass we recommend to start with small samples as algorithms like Xgboost and LGBM will +easily grow in memory consumption +with growing number of classes. LGBM focal or neural network will be good starts here. + +Whenever classes are imbalanced (binary & multiclass) we recommend to use the preprocessing step +"autoencoder_based_oversampling". +""" +# After running the blueprint the pipeline is done. I can be saved with: +save_to_production(test_class, file_name='automl_instance') + +# The blueprint can be loaded with +loaded_test_class = load_for_production(file_name='automl_instance') + +# predict on new data (in this case our holdout) with loaded blueprint +loaded_test_class.ml_bp01_multiclass_full_processing_xgb_prob(holdout_df) + +# predictions can be accessed via a class attribute +print(churn_class.predicted_classes['xgboost']) +``` + +## Linting and Pre-Commit + +This project uses pre-commit to enforce style. 
+ +To install the pre-commit hooks, first install pre-commit into the project's +virtual environment: + +```sh +pip install pre-commit +``` + +Then install the project hooks: + +```sh +pre-commit install +``` + +Now, whenever you make a commit, the linting and autoformatting will +automatically run. + +## Disclaimer + +e2e is not designed to quickly iterate over several algorithms and suggest you +the best. It is made to deliver state-of-the-art performance as ready-to-go +blueprints. e2e-ml blueprints contain: + +* preprocessing (outlier, rare feature, datetime, categorical and NLP handling) +* feature creation (binning, clustering, categorical and NLP features) +* automated feature selection +* model training (with crossfold validation) +* automated hyperparameter tuning +* model evaluation + +This comes at the cost of runtime. Depending on your data we recommend strong +hardware. + +## Development + +This project uses [poetry](https://python-poetry.org/). + +To install the project for development, run: + +```sh +poetry install +``` + +This will install all dependencies and development dependencies into a virtual +environment. + +### Adding or Removing Dependencies + +To add or remove a dependency, use `poetry add ` or +`poetry remove ` respectively. Use the `--dev` flag for development +dependencies. + +### Building and Publishing + +To build and publish the project, run + +```sh +poetry publish --build +``` + +### Documentation + +This project comes with documentation. To build the docs, run: + +```sh +cd docs +make docs +``` + +You may then browse the HTML docs at `docs/build/docs/index.html`. + +### Pull Requests + +We welcome Pull Requests! Please make a PR against the `develop` branch. + +## Release History + +* 4.14.0 + * Update Python version to support also 3.9 + * Updated import for Pandas' SettingWithCopyWarning warnings +* 4.12.00 + * Added fully connected NN for regression with quantile loss + * Fixed wrong assignment in RNN model + * Adjusted default preprocessing steps for regression tasks + * Shuffling is disabled automatically for all time_series ml_task instances + * LSTM & RNN default settings will automatically adjust to a more complex architecture, + * if more than 50 features have been detected +* 4.00.50 + * Added Autoarima & Holt winters for univariate time series predictions + * Added LSTM & RNN for uni- & multivariate time series prediction + * Autotuned NNs, LSTM and NLP transformers got an extra setting to set how + many models shall be created + * All tabular NNs (except NLPs) store predicted probabilities now + (binary classifiers will blend them when + creation of multiple modls has ben specified) + * Optimized preprocessing order +* 3.02.00 + * Refined GAN architectures + * Categorical encoding can be chosen via the cat_encoder_model attribute now + * Fixed a bug when choosing onehot encoding + * Optimized autoencoder based oversampling for regression + * Added Autoencoder based oversampling + * Optimized clustering performance +* 2.50 + * Added tabular GAN (experimental) + * Minor bug fixes +* 2.13 + * Added neural networks (ANN & soft ordered 1d-CNN) for tabular data + * Added attribute global_random_state to set state for all instances + * Added attribute shuffle_during_training to be able to disable shuffling + during model training (does not apply to all models) +* 2.12 + * Added RAPIDS support for SVM regression + * Updated Xgboost loss function for regression + * Fixed a bug in cardinality removal +* 2.11 + * Added datasets library to dependencies + 
* Calculation of feature importance can be controlled via class instance now. + This is helpful when using TF-IDF matrices where 10-fold permutation test + run out of memory + * Fixed loading of BERT weights from manual path + * DEESC parameters can be controlled via class attributes now + * Fixed a bug with LGBM on regression tasks + * Adjusted RAPIDS based clustering for use with RAPIDS version 21.12 + * Added RAPIDS as accelerator for feature transformation exploration + * Performance optimization for clustering & numerical binarizer + * Added random states to clustering & PCA implementations + * Improved scaling + * Stabilized TabNet for regression +* 2.10.04 + * Adjusted dependency for SHAP + * Fixed a bug where early numeric feature selection failed due to + the absence of numerical features +* 2.10.03 + * Adjusted dependencies for Pandas, Spacy, Optuna, Setuptools, Transformers +* 2.10.01 + * Added references & citations to Readme + * Added is_imbalanced flag to Timewalk + * Removed babel from dependencies & updated some of them +* 2.9.96 + * Timewalk got adjustments + * Fixed a bug where row deletion has been incompatible with Tabnet +* 2.9.95 + * SHAP based feature selection increased to 20 folds (from 10) + * less unnecessary print outs +* 2.9.93 + * Added SHAP based feature selection + * Removed Xgboost from Timewalk as default due to computational and runtime costs + * Suppress all warnings of LGBM focal during multiclass tasks +* 2.9.92 + * e2eml uses poetry + * introduction of Github actions to check linting + * bug fix of LGBM focal failing due to missing hyperparameter tuning specifications + * preparation for Readthedocs implementation +* 2.9.9 + * Added Multinomial Bayes Classifier + * Added SVM for regression + * Refined Sklearn ensembles +* 2.9.8 + * Added Quadrant Discriminent Analysis + * Added Support Vector machines + * Added Ransac regressor +* 2.9.7 + * updated Plotly dependency to 5.4.0 + * Improved Xgboost for imbalanced data +* 2.9.6 + * Added TimeTravel and timewalk: TimeTravel will save the class instance after + each preprocessing step, timewalk will automatically try different + preprocessing steps with different algorithms to find the best combination + * Updated dependencies to use newest versions of scikit-learn and + category-encoders +* 2.9.0 + * bug fixes with synthetic data augmentation for regression + * bug fix of target encoding during regression + * enhanced hyperparameter space for autoencoder based oversampling + * added final PCA dimensionality reduction as optional preprocessing step +* 2.8.1 + * autoencoder based oversampling will go through hyperprameter tuning first + (for each class individually) + * optimized TabNet performance +* 2.7.5 + * added oversampling based on variational autoencoder (experimental) +* 2.7.4 + * fixed target encoding for multiclass classification + * improved performance on multiclass tasks + * improved Xgboost & TabNet performance on binary classification + * added auto-tuned clustering as a feature +* 2.6.3 + * small bugfixes +* 2.6.1 + * Hyperparameter tuning does happen on a sample of the train data from now on + (sample size can be controlled) + * An experimental feature has been added, which tries to find unpredictable + training data rows to delete them from the training (this accelerates + training, but costs a bit model performance) + * Blueprints can be accelerated with Nvidia RAPIDS (works on clustering only f + or now) +* 2.5.9 + * optimized loss function for TabNet +* 2.5.1 + * Optimized loss function 
for synthetic data augmentation + * Adjusted library dependencies + * Improved target encoding +* 2.3.1 + * Changed feature selection backend from Xgboost to LGBM + * POS tagging is off on default from this version +* 2.2.9 + * bug fixes + * added an experimental feature to optimize training data with synthetic data + * added optional early feature selection (numeric only) +* 2.2.2 + * transformers can be loaded into Google Colab from Gdrive +* 2.1.2 + * Improved TFIDF vectorizer performance & non transformer NLP applications + * Improved POS tagging stability +* 2.1.1 + * Completely overworked preprocessing setup (changed API). Preprocessing + blueprints can be customized through a class attribute now + * Completely overworked special multimodel blueprints. The paricipating + algorithms can be customized through a class attribute now + * Improved NULL handling & regression performance + * Added Catboost & Elasticnet + * Updated Readme + * First unittests + * Added Stochastic Gradient classifier & regressor +* 1.8.2 + * Added Ridge classifier and regression as new blueprints +* 1.8.1 + * Added another layer of feature selection +* 1.8.0 + * Transformer padding length will be max text length + 20% instead of static + 300 + * Transformers use AutoModelForSequenceClassification instead of hardcoded + transformers now + * Hyperparameter tuning rounds and timeout can be controlled globally via + class attribute now +* 1.7.8 + * Instead of a global probability threshold, e2eml stores threshold for each + tested model + * Deprecated binary boosting blender due to lack of performance + * Added filling of inf values +* 1.7.3 + * Improved preprocessing + * Improved regression performance + * Deprecated regression boosting blender and replaced my multi + model/architecture blender + * Transformers can optionally discard worst models, but will keep all 5 by + default + * e2eml should be installable on Amazon Sagemaker now +* 1.7.0 + * Added TabNet classifier and regressor with automated hyperparameter + optimization +* 1.6.5 + * improvements of NLP transformers +* 1.5.8 + * Fixes bug around preprocessing_type='nlp' + * replaced pickle with dill for saving and loading objects +* 1.5.3 + * Added transformer blueprints for NLP classification and regression + * renamed Vowpal Wabbit blueprint to fit into blueprint naming convention + * Created "extras" options for library installation: 'rapids' installs extras, + so e2eml can be installed into into a rapids environment while 'jupyter' + adds jupyter core and ipython. 'full' installs all of them. +* 1.3.9 + * Fixed issue with automated GPU-acceleration detection and flagging + * Fixed avg regression blueprint where eval function tried to call + classification evaluation + * Moved POS tagging + PCA step into non-NLP pipeline as it showed good results + in general + * improved NLP part (more and better feature engineering and preprocessing) of + blueprints for better performance + * Added Vowpal Wabbit for classification and regression and replaced stacking + ensemble in automated model exploration by Vowpal Wabbit as well + * Set random_state for train_test splits for consistency + * Fixed sklearn dependency to 0.22.0 due to six import error +* 1.0.1 + * Optimized package requirements + * Pinned LGBM requirement to version 3.1.0 due to the bug "LightGBMError: bin + size 257 cannot run on GPU #3339" +* 0.9.9 + * Enabled tune_mode parameter during class instantiation. + * Updated docstings across all functions and changed model defaults. 
+ * Multiple bug fixes (LGBM regression accurate mode, label encoding and + permutation tests). + * Enhanced user information & better ROC_AUC display + * Added automated GPU detection for LGBM and Xgboost. + * Added functions to save and load blueprints + * architectural changes (preprocessing organized in blueprints as well) +* 0.9.4 + * First release with classification and regression blueprints. (not available + anymore) + +## References + +* Focal loss + * [Focal loss for LGBM](https://maxhalford.github.io/blog/lightgbm-focal-loss/#first-order-derivative) + * [Focal loss for LGBM multiclass](https://towardsdatascience.com/multi-class-classification-using-focal-loss-and-lightgbm-a6a6dec28872) +* Autoencoder + * [Variational Autoencoder for imbalanced data](https://github.com/lschmiddey/Autoencoder/blob/master/VAE_for_imbalanced_data.ipynb) +* Target Encoding + * [Target encoding for multiclass](https://towardsdatascience.com/target-encoding-for-multi-class-classification-c9a7bcb1a53) +* Pytorch-TabNet + * [Arik, S. O., & Pfister, T. (2019). TabNet: Attentive Interpretable Tabular Learning. arXiv preprint arXiv:1908.07442.](https://arxiv.org/pdf/1908.07442.pdf) + * [Implementing TabNet in Pytorch](https://towardsdatascience.com/implementing-tabnet-in-pytorch-fc977c383279) +* Ngboost + * [NGBoost: Natural Gradient Boosting for Probabilistic Prediction, arXiv:1910.03225](https://arxiv.org/abs/1910.03225) +* Vowpal Wabbit + * [Vowpal Wabbit Research overview](https://vowpalwabbit.org/research.html) + +## Meta + +Creator: Thomas Meißner – [LinkedIn](https://www.linkedin.com/in/thomas-mei%C3%9Fner-m-a-3808b346) + +Consultant: Gabriel Stephen Alexander – [Github](https://github.com/bitsofsteve) + +Special thanks to: Alex McKenzie - [LinkedIn](https://de.linkedin.com/in/alex-mckenzie) + +[e2eml Github repository](https://github.com/ThomasMeissnerDS/e2e_ml) + + +%package help +Summary: Development documents and examples for e2eml +Provides: python3-e2eml-doc +%description help +# e2e ML + +> An end to end solution for automl. + +Pass in your data, add some information about it and get a full pipelines in +return. Data preprocessing, feature creation, modelling and evaluation with just +a few lines of code. + +![Header image](header.png) + +## Contents + + + +* [Installation](#installation) +* [Usage example](#usage-example) +* [Linting and Pre-Commit](#linting-and-pre-commit) +* [Disclaimer](#disclaimer) +* [Development](#development) + * [Adding or Removing Dependencies](#adding-or-removing-dependencies) + * [Building and Publishing](#building-and-publishing) + * [Documentation](#documentation) + * [Pull Requests](#pull-requests) +* [Release History](#release-history) +* [References](#references) +* [Meta](#meta) + + + +## Installation + +From PyPI: + +```sh +pip install e2eml +``` + +We highly recommend to create a new virtual environment first. Then install +e2e-ml into it. In the environment also download the pretrained spacy model +with. Otherwise e2eml will do this automatically during runtime. + +e2eml can also be installed into a RAPIDS environment. For this we recommend to +create a fresh environment following [RAPIDS](https://rapids.ai/start.html) +instructions. After environment installation and activation, a special +installation is needed to not run into installation issues. + +Just run: + +```sh +pip install e2eml[rapids] +``` + +This will additionally install cupy and cython to prevent issues. 
Additionally +it is needed to follow Pytorch [installation instructions](https://pytorch.org/get-started/locally/). +When installing RAPIDs, Pytorch & Spacy for GPU, it is recommended to look +for supported Cuda versions in +all three. If Pytorch related parts fail on runtime, it is recommended to +reinstall a new environment and install Pytorch using pip rather than conda. + +```sh +# also spacy supports GPU acceleration +pip install -U spacy[cuda112] #cuda112 depends on your actual cuda version, see: https://spacy.io/usage +``` + +Otherwise Pytorch will fail trying to run on GPU. + +If e2eml shall be installed together with Jupyter core and ipython, please +install with: + +```sh +pip install e2eml[full] +``` + +instead. + +## Usage example + +e2e has been designed to create state-of-the-art machine learning pipelines with +a few lines of code. Basic example of usage: + +```python +import e2eml +from e2eml.classification import classification_blueprints +import pandas as pd +# import data +df = pd.read_csv("Your.csv") + +# split into a test/train & holdout set (holdout for prediction illustration here, but not required at all) +train_df = df.head(1000).copy() +holdout_df = df.tail(200).copy() # make sure +# saving the holdout dataset's target for later and delete it from holdout dataset +target = "target_column" +holdout_target = holdout_df[target].copy() +del holdout_df[target] + +# instantiate the needed blueprints class +from classification import classification_blueprints # regression bps are available with from regression import regression_blueprints +test_class = classification_blueprints.ClassificationBluePrint(datasource=train_df, + target_variable=target, + train_split_type='cross', + rapids_acceleration=True, # if installed into a conda environment with NVIDIA Rapids, this can be used to accelerate preprocessing with GPU + preferred_training_mode='auto', # Auto will automatically identify, if LGBM & Xgboost can use GPU acceleration* + tune_mode='accurate' # hyperparameter sets will be validated with 10-fold CV Set this to 'simple' for 1-fold CV + #categorical_columns=cat_columns # you can define categorical columns, otherwise e2e does this automatically + #date_columns=date_columns # you can also define date columns (expected is YYYY-MM-DD format) + ) + +""" +* +'Auto' is recommended for preferred_training_mode parameter, but with 'CPU' and 'GPU' it can also be controlled manually. +If you install Xgboost & LGBM into the same environment as GPU accelerated versions, you can set preferred_training_mode='gpu'. +This will massively improve training times and speed up SHAP feature importance for LGBM and Xgboost related tasks. +For Xgboost this should work out of the box, if installed into a RAPIDS environment. 
+""" +# run actual blueprint +test_class.ml_bp01_multiclass_full_processing_xgb_prob() + +""" +When choosing blueprints several options are available: + +Multiclass blueprints can handle binary and multiclass tasks: +- ml_bp00_train_test_binary_full_processing_log_reg_prob() +- ml_bp01_multiclass_full_processing_xgb_prob() +- ml_bp02_multiclass_full_processing_lgbm_prob() +- ml_bp03_multiclass_full_processing_sklearn_stacking_ensemble() +- ml_bp04_multiclass_full_processing_ngboost() +- ml_bp05_multiclass_full_processing_vowpal_wabbit +- ml_bp06_multiclass_full_processing_bert_transformer() # for NLP specifically +- ml_bp07_multiclass_full_processing_tabnet() +- ml_bp08_multiclass_full_processing_ridge() +- ml_bp09_multiclass_full_processing_catboost() +- ml_bp10_multiclass_full_processing_sgd() +- ml_bp11_multiclass_full_processing_quadratic_discriminant_analysis() +- ml_bp12_multiclass_full_processing_svm() +- ml_bp13_multiclass_full_processing_multinomial_nb() +- ml_bp14_multiclass_full_processing_lgbm_focal() +- ml_bp16_multiclass_full_processing_neural_network() # offers fully connected ANN & 1D CNN +- ml_special_binary_full_processing_boosting_blender() +- ml_special_multiclass_auto_model_exploration() +- ml_special_multiclass_full_processing_multimodel_max_voting() + +There are regression blueprints as well (in regression module): +- ml_bp10_train_test_regression_full_processing_linear_reg() +- ml_bp11_regression_full_processing_xgboost() +- ml_bp12_regressions_full_processing_lgbm() +- ml_bp13_regression_full_processing_sklearn_stacking_ensemble() +- ml_bp14_regressions_full_processing_ngboost() +- ml_bp15_regression_full_processing_vowpal_wabbit_reg() +- ml_bp16_regressions_full_processing_bert_transformer() +- ml_bp17_regression_full_processing_tabnet_reg() +- ml_bp18_regression_full_processing_ridge_reg() +- ml_bp19_regression_full_processing_elasticnet_reg() +- ml_bp20_regression_full_processing_catboost() +- ml_bp20_regression_full_processing_sgd() +- ml_bp21_regression_full_processing_ransac() +- ml_bp22_regression_full_processing_svm() +- ml_bp23_regressions_full_processing_neural_network() # offers fully connected ANN & 1D CNN +- ml_special_regression_full_processing_multimodel_avg_blender() +- ml_special_regression_auto_model_exploration() + +In the time series module we recently embedded blueprints as well: +- ml_bp100_univariate_timeseries_full_processing_auto_arima() +- ml_bp101_multivariate_timeseries_full_processing_lstm() +- ml_bp102_multivariate_timeseries_full_processing_tabnet() +- ml_bp103_multivariate_timeseries_full_processing_rnn() +- ml_bp104_univariate_timeseries_full_processing_holt_winters() + +Time series blueprints use less preprocessing on default and cannot use all options like +classification and regression models. Non-time series algorithms like TabNet are different +to their regression counterpart as cross validation is replaced by time series splits and +data scaling covers the target variable as well. 
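+
+As a purely illustrative example (the exact class name in the time series module may differ
+from the classification example above), a univariate forecast could be produced by instantiating
+the corresponding time series blueprint class and calling
+ml_bp100_univariate_timeseries_full_processing_auto_arima() on it.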
+
+In ensembles, algorithms can be chosen via the class attribute:
+test_class.special_blueprint_algorithms = {"ridge": True,
+                                           "elasticnet": False,
+                                           "xgboost": True,
+                                           "ngboost": True,
+                                           "lgbm": True,
+                                           "tabnet": False,
+                                           "vowpal_wabbit": True,
+                                           "sklearn_ensemble": True,
+                                           "catboost": False
+                                           }
+
+Preprocessing steps can also be selected:
+test_class.blueprint_step_selection_non_nlp = {
+    "automatic_type_detection_casting": True,
+    "remove_duplicate_column_names": True,
+    "reset_dataframe_index": True,
+    "fill_infinite_values": True,
+    "early_numeric_only_feature_selection": True,
+    "delete_high_null_cols": True,
+    "data_binning": True,
+    "regex_clean_text_data": False,
+    "handle_target_skewness": False,
+    "datetime_converter": True,
+    "pos_tagging_pca": False,  # slow with many categories
+    "append_text_sentiment_score": False,
+    "tfidf_vectorizer_to_pca": False,  # slow with many categories
+    "tfidf_vectorizer": False,
+    "rare_feature_processing": True,
+    "cardinality_remover": True,
+    "categorical_column_embeddings": False,
+    "holistic_null_filling": True,  # slow
+    "numeric_binarizer_pca": True,
+    "onehot_pca": True,
+    "category_encoding": True,
+    "fill_nulls_static": True,
+    "autoencoder_outlier_detection": True,
+    "outlier_care": True,
+    "delete_outliers": False,
+    "remove_collinearity": True,
+    "skewness_removal": True,
+    "automated_feature_transformation": False,
+    "random_trees_embedding": False,
+    "clustering_as_a_feature_dbscan": True,
+    "clustering_as_a_feature_kmeans_loop": True,
+    "clustering_as_a_feature_gaussian_mixture_loop": True,
+    "pca_clustering_results": True,
+    "svm_outlier_detection_loop": False,
+    "autotuned_clustering": False,
+    "reduce_memory_footprint": False,
+    "scale_data": True,
+    "smote": False,
+    "automated_feature_selection": True,
+    "bruteforce_random_feature_selection": False,  # slow
+    "autoencoder_based_oversampling": False,
+    "synthetic_data_augmentation": False,
+    "final_pca_dimensionality_reduction": False,
+    "final_kernel_pca_dimensionality_reduction": False,
+    "delete_low_variance_features": False,
+    "shap_based_feature_selection": False,
+    "delete_unpredictable_training_rows": False,
+    "trained_tokenizer_embedding": False,
+    "sort_columns_alphabetically": True,
+    "use_tabular_gan": False,
+    }
+
+The bruteforce_random_feature_selection step is experimental, but has shown promising results.
+It is useful if the model overfitted (which should happen rarely) because too many features with
+too little feature importance have been considered. The number of trials can be controlled,
+e.g. test_class.hyperparameter_tuning_rounds["bruteforce_random"] = 400.
+
+Generally the class instance is a control center and gives room for plenty of customization.
+Never update the class attributes wholesale as shown below; the assignments below only show
+what these attributes contain.
+
+test_class.tabnet_settings = {"batch_size": rec_batch_size,
+                              "virtual_batch_size": virtual_batch_size,
+                              # pred batch size?
+                              "num_workers": 0,
+                              "max_epochs": 1000}
+
+test_class.hyperparameter_tuning_rounds = {
+    "xgboost": 100,
+    "lgbm": 500,
+    "lgbm_focal": 50,
+    "tabnet": 25,
+    "ngboost": 25,
+    "sklearn_ensemble": 10,
+    "ridge": 500,
+    "elasticnet": 100,
+    "catboost": 25,
+    "sgd": 2000,
+    "svm": 50,
+    "svm_regression": 50,
+    "ransac": 50,
+    "multinomial_nb": 100,
+    "bruteforce_random": 400,
+    "synthetic_data_augmentation": 100,
+    "autoencoder_based_oversampling": 200,
+    "final_kernel_pca_dimensionality_reduction": 50,
+    "final_pca_dimensionality_reduction": 50,
+    "auto_arima": 50,
+    "holt_winters": 50,
+    }
+
+test_class.hyperparameter_tuning_max_runtime_secs = {
+    "xgboost": 2 * 60 * 60,
+    "lgbm": 2 * 60 * 60,
+    "lgbm_focal": 2 * 60 * 60,
+    "tabnet": 2 * 60 * 60,
+    "ngboost": 2 * 60 * 60,
+    "sklearn_ensemble": 2 * 60 * 60,
+    "ridge": 2 * 60 * 60,
+    "elasticnet": 2 * 60 * 60,
+    "catboost": 2 * 60 * 60,
+    "sgd": 2 * 60 * 60,
+    "svm": 2 * 60 * 60,
+    "svm_regression": 2 * 60 * 60,
+    "ransac": 2 * 60 * 60,
+    "multinomial_nb": 2 * 60 * 60,
+    "bruteforce_random": 2 * 60 * 60,
+    "synthetic_data_augmentation": 1 * 60 * 60,
+    "autoencoder_based_oversampling": 2 * 60 * 60,
+    "final_kernel_pca_dimensionality_reduction": 4 * 60 * 60,
+    "final_pca_dimensionality_reduction": 2 * 60 * 60,
+    "auto_arima": 2 * 60 * 60,
+    "holt_winters": 2 * 60 * 60,
+    }
+
+When these parameters have to be updated, please overwrite the keys individually rather than
+re-assigning the whole dictionary, so the blueprints do not break.
+E.g.: test_class.hyperparameter_tuning_max_runtime_secs["xgboost"] = 12*60*60 would work fine
+(a short sketch of this key-wise customization follows after this usage example).
+
+Working with big data can push any hardware to its limits. e2eml has been tested with:
+- Ryzen 5950x (16 cores CPU)
+- Geforce RTX 3090 (24GB VRAM)
+- 64GB RAM
+With these specs e2eml has been able to stably process roughly 100k rows with 200 columns for
+non-blended blueprints. Blended blueprints consume more resources, as e2eml keeps the trained
+models in memory as of now.
+
+For data bigger than 100k rows it is possible to limit the amount of data for various preprocessing steps:
+- test_class.feature_selection_sample_size = 100000 # for feature selection
+- test_class.hyperparameter_tuning_sample_size = 100000 # for model hyperparameter optimization
+- test_class.brute_force_selection_sample_size = 15000 # for an experimental feature selection
+
+For binary classification a sample size of 100k datapoints is sufficient in most cases. The
+hyperparameter tuning sample size can be much smaller, depending on class imbalance.
+
+For multiclass tasks we recommend starting with small samples, as algorithms like Xgboost and
+LGBM will easily grow in memory consumption with a growing number of classes. LGBM focal or
+neural networks are good starting points here.
+
+Whenever classes are imbalanced (binary & multiclass) we recommend using the preprocessing step
+"autoencoder_based_oversampling".
+"""
+# After running the blueprint the pipeline is done. It can be saved with:
+save_to_production(test_class, file_name='automl_instance')
+
+# The blueprint can be loaded with
+loaded_test_class = load_for_production(file_name='automl_instance')
+
+# predict on new data (in this case our holdout) with the loaded blueprint
+loaded_test_class.ml_bp01_multiclass_full_processing_xgb_prob(holdout_df)
+
+# predictions can be accessed via a class attribute
+print(loaded_test_class.predicted_classes['xgboost'])
+```
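+
+As a complement to the example above, here is a minimal, hypothetical sketch (the file name,
+target column and chosen keys are placeholders) of the recommended key-wise customization
+before running a blueprint:
+
+```python
+import pandas as pd
+from e2eml.classification import classification_blueprints
+
+df = pd.read_csv("Your.csv")  # placeholder dataset
+test_class = classification_blueprints.ClassificationBluePrint(datasource=df,
+                                                               target_variable="target_column",
+                                                               train_split_type='cross',
+                                                               preferred_training_mode='auto',
+                                                               tune_mode='accurate')
+
+# overwrite single keys instead of re-assigning the whole dictionaries
+test_class.blueprint_step_selection_non_nlp["data_binning"] = False
+test_class.hyperparameter_tuning_rounds["xgboost"] = 100
+test_class.hyperparameter_tuning_max_runtime_secs["xgboost"] = 12 * 60 * 60
+test_class.hyperparameter_tuning_sample_size = 100000
+
+# then run the blueprint as shown above
+test_class.ml_bp01_multiclass_full_processing_xgb_prob()
+```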
+
+## Linting and Pre-Commit
+
+This project uses pre-commit to enforce style.
+
+To install the pre-commit hooks, first install pre-commit into the project's
+virtual environment:
+
+```sh
+pip install pre-commit
+```
+
+Then install the project hooks:
+
+```sh
+pre-commit install
+```
+
+Now, whenever you make a commit, the linting and autoformatting will run
+automatically.
+
+## Disclaimer
+
+e2e is not designed to quickly iterate over several algorithms and suggest the
+best one to you. It is made to deliver state-of-the-art performance as ready-to-go
+blueprints. e2e-ml blueprints contain:
+
+* preprocessing (outlier, rare feature, datetime, categorical and NLP handling)
+* feature creation (binning, clustering, categorical and NLP features)
+* automated feature selection
+* model training (with crossfold validation)
+* automated hyperparameter tuning
+* model evaluation
+
+This comes at the cost of runtime. Depending on your data we recommend strong
+hardware.
+
+## Development
+
+This project uses [poetry](https://python-poetry.org/).
+
+To install the project for development, run:
+
+```sh
+poetry install
+```
+
+This will install all dependencies and development dependencies into a virtual
+environment.
+
+### Adding or Removing Dependencies
+
+To add or remove a dependency, use `poetry add <package>` or
+`poetry remove <package>` respectively. Use the `--dev` flag for development
+dependencies.
+
+### Building and Publishing
+
+To build and publish the project, run:
+
+```sh
+poetry publish --build
+```
+
+### Documentation
+
+This project comes with documentation. To build the docs, run:
+
+```sh
+cd docs
+make docs
+```
+
+You may then browse the HTML docs at `docs/build/docs/index.html`.
+
+### Pull Requests
+
+We welcome Pull Requests! Please make a PR against the `develop` branch.
+
+## Release History
+
+* 4.14.0
+  * Update Python version to also support 3.9
+  * Updated import for Pandas' SettingWithCopyWarning warnings
+* 4.12.00
+  * Added fully connected NN for regression with quantile loss
+  * Fixed wrong assignment in RNN model
+  * Adjusted default preprocessing steps for regression tasks
+  * Shuffling is disabled automatically for all time_series ml_task instances
+  * LSTM & RNN default settings will automatically adjust to a more complex architecture
+    if more than 50 features have been detected
+* 4.00.50
+  * Added AutoARIMA & Holt-Winters for univariate time series predictions
+  * Added LSTM & RNN for uni- & multivariate time series prediction
+  * Autotuned NNs, LSTM and NLP transformers got an extra setting to set how
+    many models shall be created
+  * All tabular NNs (except NLPs) store predicted probabilities now
+    (binary classifiers will blend them when creation of multiple models has
+    been specified)
+  * Optimized preprocessing order
+* 3.02.00
+  * Refined GAN architectures
+  * Categorical encoding can be chosen via the cat_encoder_model attribute now
+  * Fixed a bug when choosing onehot encoding
+  * Optimized autoencoder based oversampling for regression
+  * Added Autoencoder based oversampling
+  * Optimized clustering performance
+* 2.50
+  * Added tabular GAN (experimental)
+  * Minor bug fixes
+* 2.13
+  * Added neural networks (ANN & soft ordered 1d-CNN) for tabular data
+  * Added attribute global_random_state to set state for all instances
+  * Added attribute shuffle_during_training to be able to disable shuffling
+    during model training (does not apply to all models)
+* 2.12
+  * Added RAPIDS support for SVM regression
+  * Updated Xgboost loss function for regression
+  * Fixed a bug in cardinality removal
+* 2.11
+  * Added datasets library to dependencies
+  * Calculation of feature importance can be controlled via the class instance now.
+    This is helpful when using TF-IDF matrices, where the 10-fold permutation test
+    runs out of memory
+  * Fixed loading of BERT weights from a manual path
+  * DEESC parameters can be controlled via class attributes now
+  * Fixed a bug with LGBM on regression tasks
+  * Adjusted RAPIDS based clustering for use with RAPIDS version 21.12
+  * Added RAPIDS as accelerator for feature transformation exploration
+  * Performance optimization for clustering & numerical binarizer
+  * Added random states to clustering & PCA implementations
+  * Improved scaling
+  * Stabilized TabNet for regression
+* 2.10.04
+  * Adjusted dependency for SHAP
+  * Fixed a bug where early numeric feature selection failed due to
+    the absence of numerical features
+* 2.10.03
+  * Adjusted dependencies for Pandas, Spacy, Optuna, Setuptools, Transformers
+* 2.10.01
+  * Added references & citations to Readme
+  * Added is_imbalanced flag to Timewalk
+  * Removed babel from dependencies & updated some of them
+* 2.9.96
+  * Timewalk got adjustments
+  * Fixed a bug where row deletion was incompatible with Tabnet
+* 2.9.95
+  * SHAP based feature selection increased to 20 folds (from 10)
+  * fewer unnecessary print outs
+* 2.9.93
+  * Added SHAP based feature selection
+  * Removed Xgboost from Timewalk as default due to computational and runtime costs
+  * Suppress all warnings of LGBM focal during multiclass tasks
+* 2.9.92
+  * e2eml uses poetry
+  * introduction of Github actions to check linting
+  * bug fix of LGBM focal failing due to missing hyperparameter tuning specifications
+  * preparation for Readthedocs implementation
+* 2.9.9
+  * Added Multinomial Naive Bayes classifier
+  * Added SVM for regression
+  * Refined Sklearn ensembles
+* 2.9.8
+  * Added Quadratic Discriminant Analysis
+  * Added Support Vector Machines
+  * Added RANSAC regressor
+* 2.9.7
+  * updated Plotly dependency to 5.4.0
+  * Improved Xgboost for imbalanced data
+* 2.9.6
+  * Added TimeTravel and timewalk: TimeTravel will save the class instance after
+    each preprocessing step, timewalk will automatically try different
+    preprocessing steps with different algorithms to find the best combination
+  * Updated dependencies to use newest versions of scikit-learn and
+    category-encoders
+* 2.9.0
+  * bug fixes with synthetic data augmentation for regression
+  * bug fix of target encoding during regression
+  * enhanced hyperparameter space for autoencoder based oversampling
+  * added final PCA dimensionality reduction as optional preprocessing step
+* 2.8.1
+  * autoencoder based oversampling will go through hyperparameter tuning first
+    (for each class individually)
+  * optimized TabNet performance
+* 2.7.5
+  * added oversampling based on variational autoencoder (experimental)
+* 2.7.4
+  * fixed target encoding for multiclass classification
+  * improved performance on multiclass tasks
+  * improved Xgboost & TabNet performance on binary classification
+  * added auto-tuned clustering as a feature
+* 2.6.3
+  * small bugfixes
+* 2.6.1
+  * Hyperparameter tuning does happen on a sample of the train data from now on
+    (sample size can be controlled)
+  * An experimental feature has been added, which tries to find unpredictable
+    training data rows to delete them from the training (this accelerates
+    training, but costs a bit of model performance)
+  * Blueprints can be accelerated with Nvidia RAPIDS (works on clustering only
+    for now)
+* 2.5.9
+  * optimized loss function for TabNet
+* 2.5.1
+  * Optimized loss function for synthetic data augmentation
+  * Adjusted library dependencies
+  * Improved target encoding
+* 2.3.1
+  * Changed feature selection backend from Xgboost to LGBM
+  * POS tagging is off by default from this version
+* 2.2.9
+  * bug fixes
+  * added an experimental feature to optimize training data with synthetic data
+  * added optional early feature selection (numeric only)
+* 2.2.2
+  * transformers can be loaded into Google Colab from Gdrive
+* 2.1.2
+  * Improved TFIDF vectorizer performance & non transformer NLP applications
+  * Improved POS tagging stability
+* 2.1.1
+  * Completely overworked preprocessing setup (changed API). Preprocessing
+    blueprints can be customized through a class attribute now
+  * Completely overworked special multimodel blueprints. The participating
+    algorithms can be customized through a class attribute now
+  * Improved NULL handling & regression performance
+  * Added Catboost & Elasticnet
+  * Updated Readme
+  * First unittests
+  * Added Stochastic Gradient classifier & regressor
+* 1.8.2
+  * Added Ridge classifier and regression as new blueprints
+* 1.8.1
+  * Added another layer of feature selection
+* 1.8.0
+  * Transformer padding length will be max text length + 20% instead of a static
+    300
+  * Transformers use AutoModelForSequenceClassification instead of hardcoded
+    transformers now
+  * Hyperparameter tuning rounds and timeout can be controlled globally via
+    class attribute now
+* 1.7.8
+  * Instead of a global probability threshold, e2eml stores a threshold for each
+    tested model
+  * Deprecated binary boosting blender due to lack of performance
+  * Added filling of inf values
+* 1.7.3
+  * Improved preprocessing
+  * Improved regression performance
+  * Deprecated regression boosting blender and replaced it by a multi
+    model/architecture blender
+  * Transformers can optionally discard worst models, but will keep all 5 by
+    default
+  * e2eml should be installable on Amazon Sagemaker now
+* 1.7.0
+  * Added TabNet classifier and regressor with automated hyperparameter
+    optimization
+* 1.6.5
+  * improvements of NLP transformers
+* 1.5.8
+  * Fixed a bug around preprocessing_type='nlp'
+  * replaced pickle with dill for saving and loading objects
+* 1.5.3
+  * Added transformer blueprints for NLP classification and regression
+  * renamed Vowpal Wabbit blueprint to fit into blueprint naming convention
+  * Created "extras" options for library installation: 'rapids' installs extras,
+    so e2eml can be installed into a RAPIDS environment while 'jupyter'
+    adds jupyter core and ipython. 'full' installs all of them.
+* 1.3.9
+  * Fixed issue with automated GPU-acceleration detection and flagging
+  * Fixed avg regression blueprint where eval function tried to call
+    classification evaluation
+  * Moved POS tagging + PCA step into non-NLP pipeline as it showed good results
+    in general
+  * improved NLP part (more and better feature engineering and preprocessing) of
+    blueprints for better performance
+  * Added Vowpal Wabbit for classification and regression and replaced stacking
+    ensemble in automated model exploration by Vowpal Wabbit as well
+  * Set random_state for train_test splits for consistency
+  * Fixed sklearn dependency to 0.22.0 due to a six import error
+* 1.0.1
+  * Optimized package requirements
+  * Pinned LGBM requirement to version 3.1.0 due to the bug "LightGBMError: bin
+    size 257 cannot run on GPU #3339"
+* 0.9.9
+  * Enabled tune_mode parameter during class instantiation.
+  * Updated docstrings across all functions and changed model defaults.
+ * Multiple bug fixes (LGBM regression accurate mode, label encoding and + permutation tests). + * Enhanced user information & better ROC_AUC display + * Added automated GPU detection for LGBM and Xgboost. + * Added functions to save and load blueprints + * architectural changes (preprocessing organized in blueprints as well) +* 0.9.4 + * First release with classification and regression blueprints. (not available + anymore) + +## References + +* Focal loss + * [Focal loss for LGBM](https://maxhalford.github.io/blog/lightgbm-focal-loss/#first-order-derivative) + * [Focal loss for LGBM multiclass](https://towardsdatascience.com/multi-class-classification-using-focal-loss-and-lightgbm-a6a6dec28872) +* Autoencoder + * [Variational Autoencoder for imbalanced data](https://github.com/lschmiddey/Autoencoder/blob/master/VAE_for_imbalanced_data.ipynb) +* Target Encoding + * [Target encoding for multiclass](https://towardsdatascience.com/target-encoding-for-multi-class-classification-c9a7bcb1a53) +* Pytorch-TabNet + * [Arik, S. O., & Pfister, T. (2019). TabNet: Attentive Interpretable Tabular Learning. arXiv preprint arXiv:1908.07442.](https://arxiv.org/pdf/1908.07442.pdf) + * [Implementing TabNet in Pytorch](https://towardsdatascience.com/implementing-tabnet-in-pytorch-fc977c383279) +* Ngboost + * [NGBoost: Natural Gradient Boosting for Probabilistic Prediction, arXiv:1910.03225](https://arxiv.org/abs/1910.03225) +* Vowpal Wabbit + * [Vowpal Wabbit Research overview](https://vowpalwabbit.org/research.html) + +## Meta + +Creator: Thomas Meißner – [LinkedIn](https://www.linkedin.com/in/thomas-mei%C3%9Fner-m-a-3808b346) + +Consultant: Gabriel Stephen Alexander – [Github](https://github.com/bitsofsteve) + +Special thanks to: Alex McKenzie - [LinkedIn](https://de.linkedin.com/in/alex-mckenzie) + +[e2eml Github repository](https://github.com/ThomasMeissnerDS/e2e_ml) + + +%prep +%autosetup -n e2eml-4.14.20 + +%build +%py3_build + +%install +%py3_install +install -d -m755 %{buildroot}/%{_pkgdocdir} +if [ -d doc ]; then cp -arf doc %{buildroot}/%{_pkgdocdir}; fi +if [ -d docs ]; then cp -arf docs %{buildroot}/%{_pkgdocdir}; fi +if [ -d example ]; then cp -arf example %{buildroot}/%{_pkgdocdir}; fi +if [ -d examples ]; then cp -arf examples %{buildroot}/%{_pkgdocdir}; fi +pushd %{buildroot} +if [ -d usr/lib ]; then + find usr/lib -type f -printf "/%h/%f\n" >> filelist.lst +fi +if [ -d usr/lib64 ]; then + find usr/lib64 -type f -printf "/%h/%f\n" >> filelist.lst +fi +if [ -d usr/bin ]; then + find usr/bin -type f -printf "/%h/%f\n" >> filelist.lst +fi +if [ -d usr/sbin ]; then + find usr/sbin -type f -printf "/%h/%f\n" >> filelist.lst +fi +touch doclist.lst +if [ -d usr/share/man ]; then + find usr/share/man -type f -printf "/%h/%f.gz\n" >> doclist.lst +fi +popd +mv %{buildroot}/filelist.lst . +mv %{buildroot}/doclist.lst . + +%files -n python3-e2eml -f filelist.lst +%dir %{python3_sitelib}/* + +%files help -f doclist.lst +%{_docdir}/* + +%changelog +* Fri May 05 2023 Python_Bot - 4.14.20-1 +- Package Spec generated diff --git a/sources b/sources new file mode 100644 index 0000000..d5a3abb --- /dev/null +++ b/sources @@ -0,0 +1 @@ +96889c329c66125b9e33a1da9858a3f9 e2eml-4.14.20.tar.gz -- cgit v1.2.3