diff options
author | CoprDistGit <infra@openeuler.org> | 2023-05-31 03:14:01 +0000 |
---|---|---|
committer | CoprDistGit <infra@openeuler.org> | 2023-05-31 03:14:01 +0000 |
commit | 95e14620f51ced7d4d299a68b214d6a01b183f92 (patch) | |
tree | a6c1c0e23853f88aa12e4b9f73e335c3d3bd9f79 | |
parent | 3acf606755cc639eec50667e570f263520529812 (diff) |
automatic import of python-jmi-mvm
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | python-jmi-mvm.spec | 1793 | ||||
-rw-r--r-- | sources | 1 |
3 files changed, 1795 insertions, 0 deletions
@@ -0,0 +1 @@ +/JMI_MVM-0.3.4.tar.gz diff --git a/python-jmi-mvm.spec b/python-jmi-mvm.spec new file mode 100644 index 0000000..da60e6d --- /dev/null +++ b/python-jmi-mvm.spec @@ -0,0 +1,1793 @@ +%global _empty_manifest_terminate_build 0 +Name: python-JMI-MVM +Version: 0.3.4 +Release: 1 +Summary: A collection of our functions and classes from bootcamp. +License: MIT License +URL: https://github.com/jirvingphd/JMI_MVM +Source0: https://mirrors.nju.edu.cn/pypi/web/packages/2c/81/93d0dcde8641c7785f13d50de7ad1117ebf74856ad41f2790e6b31e80883/JMI_MVM-0.3.4.tar.gz +BuildArch: noarch + +Requires: python3-numpy +Requires: python3-pandas +Requires: python3-seaborn +Requires: python3-matplotlib +Requires: python3-sklearn +Requires: python3-pydotplus +Requires: python3-scipy +Requires: python3-xgboost + +%description +# JMI_MVM + +- A collection of tools created for botcmap. +- More information to be added later. + + +<h1>Table of Contents<span class="tocSkip"></span></h1> +<div class="toc"><ul class="toc-item"></ul></div> + + +```python +name = "JMI_MVM" +help_ = " Recommended Functions to try: \n calc_roc_auc & tune_params\n plot_hist_scat_sns & multiplot\n list2df & df_drop_regex\n plot_wide_kde_thin_bar & make_violinplot\n" +#functions.py + +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +import matplotlib as mpl +import seaborn as sns + + +def calc_roc_auc(X_test,y_test,dtc,verbose=False): + """Tests the results of an already-fit classifer. + Takes X_test, y_test, classifer, verbose (True" print result) + Returns the AUC for the roc_curve as a %""" + y_pred = dtc.predict(X_test) + + FP_rate, TP_rate, thresh = roc_curve(y_test,y_pred) + roc_auc = auc(FP_rate,TP_rate) + roc_auc_perc = round(roc_auc*100,3) + # Your code here + if verbose: + print(f"roc_curve's auc = {roc_auc_perc}%") + return roc_auc_perc + +def tune_params(param_name, param_values): + """Takes in param_name to tune with param_values, plots train vs test AUC's. + Returns df_results and df_style with color coded results""" + res_list = [[param_name,'train_roc_auc','test_roc_auc']] + + # Loop through all values in param_values + for value in param_values: + # Create Model, set params + dtc_temp = DecisionTreeClassifier(criterion='entropy') + params={param_name:value} + dtc_temp.set_params(**params) + + # Fit model + dtc_temp.fit(X_train, y_train) + + # Get roc_auc for training data + train_roc_auc = calc_roc_auc(X_train,y_train,dtc_temp) + # Get roc_auc for test data + test_res_roc_auc = calc_roc_auc(X_test,y_test,dtc_temp) + # Append value and results to res_list + res_list.append([value,train_roc_auc,test_res_roc_auc]) + + # Turn results into df_results (basically same as using list2df) + df_results = pd.DataFrame(res_list[1:],columns=res_list[0]) + df_results.set_index(param_name,inplace=True) + + # Plot df_results + df_results.plot() + + # Color-coded dataframe s + import seaborn as sns + cm = sns.light_palette("green", as_cmap=True) + df_syle = df_results.style.background_gradient(cmap=cm)#,low=results.min(),high=results.max()) + + return df_results, df_syle + + +# MULTIPLOT +from string import ascii_letters +import numpy as np +import pandas as pd +import seaborn as sns +import matplotlib.pyplot as plt + + +def multiplot(df): + """Plots results from df.corr() in a correlation heat map for multicollinearity. + Returns fig, ax objects""" + sns.set(style="white") + + # Compute the correlation matrix + corr = df.corr() + + # Generate a mask for the upper triangle + mask = np.zeros_like(corr, dtype=np.bool) + mask[np.triu_indices_from(mask)] = True + + # Set up the matplotlib figure + f, ax = plt.subplots(figsize=(16, 16)) + + # Generate a custom diverging colormap + cmap = sns.diverging_palette(220, 10, as_cmap=True) + + # Draw the heatmap with the mask and correct aspect ratio + sns.heatmap(corr, mask=mask, annot=True, cmap=cmap, center=0, + + square=True, linewidths=.5, cbar_kws={"shrink": .5}) + return f, ax + + + +# Plots histogram and scatter (vs price) side by side +# Plots histogram and scatter (vs price) side by side +def plot_hist_scat_sns(df, target='index'): + """Plots seaborne distplots and regplots for columns im datamframe vs target. + + Parameters: + df (DataFrame): DataFrame.describe() columns will be used. + target = name of column containing target variable.assume first coluumn. + + Returns: + Figures for each column vs target with 2 subplots. + """ + import matplotlib.ticker as mtick + import matplotlib.pyplot as plt + import seaborn as sns + + with plt.style.context(('dark_background')): + ### DEFINE AESTHETIC CUSTOMIZATIONS -------------------------------## + + +# plt.style.use('dark_background') + figsize=(9,7) + + # Axis Label fonts + fontTitle = {'fontsize': 14, + 'fontweight': 'bold', + 'fontfamily':'serif'} + + fontAxis = {'fontsize': 12, + 'fontweight': 'medium', + 'fontfamily':'serif'} + + fontTicks = {'fontsize': 8, + 'fontweight':'medium', + 'fontfamily':'serif'} + + # Formatting dollar sign labels + fmtPrice = '${x:,.0f}' + tickPrice = mtick.StrMethodFormatter(fmtPrice) + + + ### PLOTTING ----------------------------- ------------------------ ## + + # Loop through dataframe to plot + for column in df.describe(): +# print(f'\nCurrent column: {column}') + + # Create figure with subplots for current column + fig, ax = plt.subplots(figsize=figsize, ncols=2, nrows=2) + + ## SUBPLOT 1 --------------------------------------------------## + i,j = 0,0 + ax[i,j].set_title(column.capitalize(),fontdict=fontTitle) + + # Define graphing keyword dictionaries for distplot (Subplot 1) + hist_kws = {"linewidth": 1, "alpha": 1, "color": 'blue','edgecolor':'w'} + kde_kws = {"color": "white", "linewidth": 1, "label": "KDE"} + + # Plot distplot on ax[i,j] using hist_kws and kde_kws + sns.distplot(df[column], norm_hist=True, kde=True, + hist_kws = hist_kws, kde_kws = kde_kws, + label=column+' histogram', ax=ax[i,j]) + + + # Set x axis label + ax[i,j].set_xlabel(column.title(),fontdict=fontAxis) + + # Get x-ticks, rotate labels, and return + xticklab1 = ax[i,j].get_xticklabels(which = 'both') + ax[i,j].set_xticklabels(labels=xticklab1, fontdict=fontTicks, rotation=0) + ax[i,j].xaxis.set_major_formatter(mtick.ScalarFormatter()) + + + # Set y-label + ax[i,j].set_ylabel('Density',fontdict=fontAxis) + yticklab1=ax[i,j].get_yticklabels(which='both') + ax[i,j].set_yticklabels(labels=yticklab1,fontdict=fontTicks) + ax[i,j].yaxis.set_major_formatter(mtick.ScalarFormatter()) + + + # Set y-grid + ax[i, j].set_axisbelow(True) + ax[i, j].grid(axis='y',ls='--') + + + + + ## SUBPLOT 2-------------------------------------------------- ## + i,j = 0,1 + ax[i,j].set_title(column.capitalize(),fontdict=fontTitle) + + # Define the kwd dictionaries for scatter and regression line (subplot 2) + line_kws={"color":"white","alpha":0.5,"lw":4,"ls":":"} + scatter_kws={'s': 2, 'alpha': 0.5,'marker':'.','color':'blue'} + + # Plot regplot on ax[i,j] using line_kws and scatter_kws + sns.regplot(df[column], df[target], + line_kws = line_kws, + scatter_kws = scatter_kws, + ax=ax[i,j]) + + # Set x-axis label + ax[i,j].set_xlabel(column.title(),fontdict=fontAxis) + + # Get x ticks, rotate labels, and return + xticklab2=ax[i,j].get_xticklabels(which='both') + ax[i,j].set_xticklabels(labels=xticklab2,fontdict=fontTicks, rotation=0) + ax[i,j].xaxis.set_major_formatter(mtick.ScalarFormatter()) + + # Set y-axis label + ax[i,j].set_ylabel(target,fontdict=fontAxis) + + # Get, set, and format y-axis Price labels + yticklab = ax[i,j].get_yticklabels() + ax[i,j].set_yticklabels(yticklab,fontdict=fontTicks) + ax[i,j].yaxis.set_major_formatter(mtick.ScalarFormatter()) + + # ax[i,j].get_yaxis().set_major_formatter(tickPrice) + + # Set y-grid + ax[i, j].set_axisbelow(True) + ax[i, j].grid(axis='y',ls='--') + + ## ---------- Final layout adjustments ----------- ## + # Deleted unused subplots + fig.delaxes(ax[1,1]) + fig.delaxes(ax[1,0]) + + # Optimizing spatial layout + fig.tight_layout() + figtitle=column+'_dist_regr_plots.png' +# plt.savefig(figtitle) + return + +# Tukey's method using IQR to eliminate +def detect_outliers(df, n, features): + """Uses Tukey's method to return outer of interquartile ranges to return indices if outliers in a dataframe. + Parameters: + df (DataFrame): DataFrane containing columns of features + n: default is 0, multiple outlier cutoff + + Returns: + Index of outliers for .loc + + Examples: + Outliers_to_drop = detect_outliers(data,2,["col1","col2"]) Returning value + df.loc[Outliers_to_drop] # Show the outliers rows + data= data.drop(Outliers_to_drop, axis = 0).reset_index(drop=True) +""" + +# Drop outliers + + outlier_indices = [] + # iterate over features(columns) + for col in features: + + # 1st quartile (25%) + Q1 = np.percentile(df[col], 25) + # 3rd quartile (75%) + Q3 = np.percentile(df[col],75) + + # Interquartile range (IQR) + IQR = Q3 - Q1 + # outlier step + outlier_step = 1.5 * IQR + + # Determine a list of indices of outliers for feature col + outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step )].index + + # append the found outlier indices for col to the list of outlier indices + outlier_indices.extend(outlier_list_col) + + # select observations containing more than 2 outliers + outlier_indices = Counter(outlier_indices) + multiple_outliers = list( k for k, v in outlier_indices.items() if v > n ) + return multiple_outliers + + +# describe_outliers -- calls detect_outliers +def describe_outliers(df): + """ Returns a new_df of outliers, and % outliers each col using detect_outliers. + """ + out_count = 0 + new_df = pd.DataFrame(columns=['total_outliers', 'percent_total']) + for col in df.columns: + outies = detect_outliers(df[col]) + out_count += len(outies) + new_df.loc[col] = [len(outies), round((len(outies)/len(df.index))*100, 2)] + new_df.loc['grand_total'] = [sum(new_df['total_outliers']), sum(new_df['percent_total'])] + return new_df + + +#### Cohen's d +def Cohen_d(group1, group2): + '''Compute Cohen's d. + # group1: Series or NumPy array + # group2: Series or NumPy array + # returns a floating point number + ''' + diff = group1.mean() - group2.mean() + + n1, n2 = len(group1), len(group2) + var1 = group1.var() + var2 = group2.var() + + # Calculate the pooled threshold as shown earlier + pooled_var = (n1 * var1 + n2 * var2) / (n1 + n2) + + # Calculate Cohen's d statistic + d = diff / np.sqrt(pooled_var) + + return d + + +def plot_pdfs(cohen_d=2): + """Plot PDFs for distributions that differ by some number of stds. + + cohen_d: number of standard deviations between the means + """ + group1 = scipy.stats.norm(0, 1) + group2 = scipy.stats.norm(cohen_d, 1) + xs, ys = evaluate_PDF(group1) + pyplot.fill_between(xs, ys, label='Group1', color='#ff2289', alpha=0.7) + + xs, ys = evaluate_PDF(group2) + pyplot.fill_between(xs, ys, label='Group2', color='#376cb0', alpha=0.7) + + o, s = overlap_superiority(group1, group2) + print('overlap', o) + print('superiority', s) + +def list2df(list):#, sort_values='index'): + """ Take in a list where row[0] = column_names and outputs a dataframe. + + Keyword arguments: + set_index -- df.set_index(set_index) + sortby -- df.sorted() + """ + + df_list = pd.DataFrame(list[1:],columns=list[0]) +# df_list = df_list[1:] + + return df_list + +def df_drop_regex(DF, regex_list): + '''Use a list of regex to remove columns names. Returns new df. + + Parameters: + DF -- input dataframe to remove columns from. + regex_list -- list of string patterns or regexp to remove. + + Returns: + df_cut -- input df without the dropped columns. + ''' + df_cut = DF.copy() + + for r in regex_list: + + df_cut = df_cut[df_cut.columns.drop(list(df_cut.filter(regex=r)))] + print(f'Removed {r}\n') + + return df_cut + + + +####### MIKE's PLOTTING +# plotting order totals per month in violin plots + +def make_violinplot(x,y, title=None, hue=None, ticklabels=None): + + '''Plots a violin plot with horizontal mean line, inner stick lines''' + + plt.style.use('dark_background') + fig,ax =plt.subplots(figsize=(12,10)) + + + sns.violinplot(x, y,cut=2,split=True, scale='count', scale_hue=True, + saturation=.5, alpha=.9,bw=.25, palette='Dark2',inner='stick', hue=hue).set_title(title) + + ax.axhline(y.mean(),label='total mean', ls=':', alpha=.5, color='xkcd:yellow') + ax.set_xticklabels(ticklabels) + + plt.legend() + plt.show() + x= df_year_orders['month'] + y= df_year_orders['order_total'] + title = 'Order totals per month with or without discounts' + hue=df_year_orders['Discount']>0 + + +### Example usage +# #First, declare variables to be plotted +# x = df_year_orders['month'] +# y = df_year_orders['order_total'] +# ticks = [v for v in month_dict.values()] +# title = 'Order totals per month with or without discounts' +# hue = df_year_orders['Discount']>0 + +### Then call function +# make_violinplot(x,y,title,hue, ticks), + + +########### +def plot_wide_kde_thin_bar(series1,sname1, series2, sname2): + '''Plot series1 and series 2 on wide kde plot with small mean+sem bar plot.''' + + ## ADDING add_gridspec usage + import pandas as pd + import numpy as np + from scipy.stats import sem + + import matplotlib.pyplot as plt + import matplotlib as mpl + import matplotlib.ticker as ticker + + import seaborn as sns + + from matplotlib import rcParams + from matplotlib import rc + rcParams['font.family'] = 'serif' + + + + + # Plot distributions of discounted vs full price groups + plt.style.use('default') + # with plt.style.context(('tableau-colorblind10')): + with plt.style.context(('seaborn-notebook')): + + + + ## ----------- DEFINE AESTHETIC CUSTOMIZATIONS ----------- ## + # Axis Label fonts + fontSuptitle ={'fontsize': 22, + 'fontweight': 'bold', + 'fontfamily':'serif'} + + fontTitle = {'fontsize': 10, + 'fontweight': 'medium', + 'fontfamily':'serif'} + + fontAxis = {'fontsize': 10, + 'fontweight': 'medium', + 'fontfamily':'serif'} + + fontTicks = {'fontsize': 8, + 'fontweight':'medium', + 'fontfamily':'serif'} + + + ## --------- CREATE FIG BASED ON GRIDSPEC --------- ## + + plt.suptitle('Quantity of Units Sold', fontdict = fontSuptitle) + + # Create fig object and declare figsize + fig = plt.figure(constrained_layout=True, figsize=(8,3)) + + + # Define gridspec to create grid coordinates + gs = fig.add_gridspec(nrows=1,ncols=10) + + # Assign grid space to ax with add_subplot + ax0 = fig.add_subplot(gs[0,0:7]) + ax1 = fig.add_subplot(gs[0,7:10]) + + #Combine into 1 list + ax = [ax0,ax1] + + ### ------------------ SUBPLOT 1 ------------------ ### + + ## --------- Defining series1 and 2 for subplot 1------- ## + ax[0].set_title('Histogram + KDE',fontdict=fontTitle) + + # Group 1: data, label, hist_kws and kde_kws + plotS1 = {'data': series1, 'label': sname1.title(), + + 'hist_kws' : + {'edgecolor': 'black', 'color':'darkgray','alpha': 0.8, 'lw':0.5}, + + 'kde_kws': + {'color':'gray', 'linestyle': '--', 'linewidth':2, + 'label':'kde'}} + + # Group 2: data, label, hist_kws and kde_kws + plotS2 = {'data': series2, + 'label': sname2.title(), + + 'hist_kws' : + {'edgecolor': 'black','color':'green','alpha':0.8 ,'lw':0.5}, + + + 'kde_kws': + {'color':'darkgreen','linestyle':':','linewidth':3,'label':'kde'}} + + # plot group 1 + sns.distplot(plotS1['data'], label=plotS1['label'], + + hist_kws = plotS1['hist_kws'], kde_kws = plotS1['kde_kws'], + + ax=ax[0]) + + + # plot group 2 + sns.distplot(plotS2['data'], label=plotS2['label'], + + hist_kws=plotS2['hist_kws'], kde_kws = plotS2['kde_kws'], + + ax=ax[0]) + + + ax[0].set_xlabel(series1.name, fontdict=fontAxis) + ax[0].set_ylabel('Kernel Density Estimation',fontdict=fontAxis) + + ax[0].tick_params(axis='both',labelsize=fontTicks['fontsize']) + ax[0].legend() + + + ### ------------------ SUBPLOT 2 ------------------ ### + + # Import scipy for error bars + from scipy.stats import sem + + # Declare x y group labels(x) and bar heights(y) + x = [plotS1['label'], plotS2['label']] + y = [np.mean(plotS1['data']), np.mean(plotS2['data'])] + + yerr = [sem(plotS1['data']), sem(plotS2['data'])] + err_kws = {'ecolor':'black','capsize':5,'capthick':1,'elinewidth':1} + + # Create the bar plot + ax[1].bar(x,y,align='center', edgecolor='black', yerr=yerr,error_kw=err_kws,width=0.6) + + + # Customize subplot 2 + ax[1].set_title('Average Quantities Sold',fontdict=fontTitle) + ax[1].set_ylabel('Mean +/- SEM ',fontdict=fontAxis) + ax[1].set_xlabel('') + + ax[1].tick_params(axis=y,labelsize=fontTicks['fontsize']) + ax[1].tick_params(axis=x,labelsize=fontTicks['fontsize']) + + ax1=ax[1] + test = ax1.get_xticklabels() + labels = [x.get_text() for x in test] + ax1.set_xticklabels([plotS1['label'],plotS2['label']], rotation=45,ha='center') + +# xlab = [x.get_text() for x in xlablist] +# ax[1].set_xticklabels(xlab,rotation=45) + +# fig.savefig('H1_EDA_using_gridspec.png') +# plt.tight_layout() + # print(f') + plt.show() + + return fig,ax + +``` + + +```python + +``` + + + + +%package -n python3-JMI-MVM +Summary: A collection of our functions and classes from bootcamp. +Provides: python-JMI-MVM +BuildRequires: python3-devel +BuildRequires: python3-setuptools +BuildRequires: python3-pip +%description -n python3-JMI-MVM +# JMI_MVM + +- A collection of tools created for botcmap. +- More information to be added later. + + +<h1>Table of Contents<span class="tocSkip"></span></h1> +<div class="toc"><ul class="toc-item"></ul></div> + + +```python +name = "JMI_MVM" +help_ = " Recommended Functions to try: \n calc_roc_auc & tune_params\n plot_hist_scat_sns & multiplot\n list2df & df_drop_regex\n plot_wide_kde_thin_bar & make_violinplot\n" +#functions.py + +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +import matplotlib as mpl +import seaborn as sns + + +def calc_roc_auc(X_test,y_test,dtc,verbose=False): + """Tests the results of an already-fit classifer. + Takes X_test, y_test, classifer, verbose (True" print result) + Returns the AUC for the roc_curve as a %""" + y_pred = dtc.predict(X_test) + + FP_rate, TP_rate, thresh = roc_curve(y_test,y_pred) + roc_auc = auc(FP_rate,TP_rate) + roc_auc_perc = round(roc_auc*100,3) + # Your code here + if verbose: + print(f"roc_curve's auc = {roc_auc_perc}%") + return roc_auc_perc + +def tune_params(param_name, param_values): + """Takes in param_name to tune with param_values, plots train vs test AUC's. + Returns df_results and df_style with color coded results""" + res_list = [[param_name,'train_roc_auc','test_roc_auc']] + + # Loop through all values in param_values + for value in param_values: + # Create Model, set params + dtc_temp = DecisionTreeClassifier(criterion='entropy') + params={param_name:value} + dtc_temp.set_params(**params) + + # Fit model + dtc_temp.fit(X_train, y_train) + + # Get roc_auc for training data + train_roc_auc = calc_roc_auc(X_train,y_train,dtc_temp) + # Get roc_auc for test data + test_res_roc_auc = calc_roc_auc(X_test,y_test,dtc_temp) + # Append value and results to res_list + res_list.append([value,train_roc_auc,test_res_roc_auc]) + + # Turn results into df_results (basically same as using list2df) + df_results = pd.DataFrame(res_list[1:],columns=res_list[0]) + df_results.set_index(param_name,inplace=True) + + # Plot df_results + df_results.plot() + + # Color-coded dataframe s + import seaborn as sns + cm = sns.light_palette("green", as_cmap=True) + df_syle = df_results.style.background_gradient(cmap=cm)#,low=results.min(),high=results.max()) + + return df_results, df_syle + + +# MULTIPLOT +from string import ascii_letters +import numpy as np +import pandas as pd +import seaborn as sns +import matplotlib.pyplot as plt + + +def multiplot(df): + """Plots results from df.corr() in a correlation heat map for multicollinearity. + Returns fig, ax objects""" + sns.set(style="white") + + # Compute the correlation matrix + corr = df.corr() + + # Generate a mask for the upper triangle + mask = np.zeros_like(corr, dtype=np.bool) + mask[np.triu_indices_from(mask)] = True + + # Set up the matplotlib figure + f, ax = plt.subplots(figsize=(16, 16)) + + # Generate a custom diverging colormap + cmap = sns.diverging_palette(220, 10, as_cmap=True) + + # Draw the heatmap with the mask and correct aspect ratio + sns.heatmap(corr, mask=mask, annot=True, cmap=cmap, center=0, + + square=True, linewidths=.5, cbar_kws={"shrink": .5}) + return f, ax + + + +# Plots histogram and scatter (vs price) side by side +# Plots histogram and scatter (vs price) side by side +def plot_hist_scat_sns(df, target='index'): + """Plots seaborne distplots and regplots for columns im datamframe vs target. + + Parameters: + df (DataFrame): DataFrame.describe() columns will be used. + target = name of column containing target variable.assume first coluumn. + + Returns: + Figures for each column vs target with 2 subplots. + """ + import matplotlib.ticker as mtick + import matplotlib.pyplot as plt + import seaborn as sns + + with plt.style.context(('dark_background')): + ### DEFINE AESTHETIC CUSTOMIZATIONS -------------------------------## + + +# plt.style.use('dark_background') + figsize=(9,7) + + # Axis Label fonts + fontTitle = {'fontsize': 14, + 'fontweight': 'bold', + 'fontfamily':'serif'} + + fontAxis = {'fontsize': 12, + 'fontweight': 'medium', + 'fontfamily':'serif'} + + fontTicks = {'fontsize': 8, + 'fontweight':'medium', + 'fontfamily':'serif'} + + # Formatting dollar sign labels + fmtPrice = '${x:,.0f}' + tickPrice = mtick.StrMethodFormatter(fmtPrice) + + + ### PLOTTING ----------------------------- ------------------------ ## + + # Loop through dataframe to plot + for column in df.describe(): +# print(f'\nCurrent column: {column}') + + # Create figure with subplots for current column + fig, ax = plt.subplots(figsize=figsize, ncols=2, nrows=2) + + ## SUBPLOT 1 --------------------------------------------------## + i,j = 0,0 + ax[i,j].set_title(column.capitalize(),fontdict=fontTitle) + + # Define graphing keyword dictionaries for distplot (Subplot 1) + hist_kws = {"linewidth": 1, "alpha": 1, "color": 'blue','edgecolor':'w'} + kde_kws = {"color": "white", "linewidth": 1, "label": "KDE"} + + # Plot distplot on ax[i,j] using hist_kws and kde_kws + sns.distplot(df[column], norm_hist=True, kde=True, + hist_kws = hist_kws, kde_kws = kde_kws, + label=column+' histogram', ax=ax[i,j]) + + + # Set x axis label + ax[i,j].set_xlabel(column.title(),fontdict=fontAxis) + + # Get x-ticks, rotate labels, and return + xticklab1 = ax[i,j].get_xticklabels(which = 'both') + ax[i,j].set_xticklabels(labels=xticklab1, fontdict=fontTicks, rotation=0) + ax[i,j].xaxis.set_major_formatter(mtick.ScalarFormatter()) + + + # Set y-label + ax[i,j].set_ylabel('Density',fontdict=fontAxis) + yticklab1=ax[i,j].get_yticklabels(which='both') + ax[i,j].set_yticklabels(labels=yticklab1,fontdict=fontTicks) + ax[i,j].yaxis.set_major_formatter(mtick.ScalarFormatter()) + + + # Set y-grid + ax[i, j].set_axisbelow(True) + ax[i, j].grid(axis='y',ls='--') + + + + + ## SUBPLOT 2-------------------------------------------------- ## + i,j = 0,1 + ax[i,j].set_title(column.capitalize(),fontdict=fontTitle) + + # Define the kwd dictionaries for scatter and regression line (subplot 2) + line_kws={"color":"white","alpha":0.5,"lw":4,"ls":":"} + scatter_kws={'s': 2, 'alpha': 0.5,'marker':'.','color':'blue'} + + # Plot regplot on ax[i,j] using line_kws and scatter_kws + sns.regplot(df[column], df[target], + line_kws = line_kws, + scatter_kws = scatter_kws, + ax=ax[i,j]) + + # Set x-axis label + ax[i,j].set_xlabel(column.title(),fontdict=fontAxis) + + # Get x ticks, rotate labels, and return + xticklab2=ax[i,j].get_xticklabels(which='both') + ax[i,j].set_xticklabels(labels=xticklab2,fontdict=fontTicks, rotation=0) + ax[i,j].xaxis.set_major_formatter(mtick.ScalarFormatter()) + + # Set y-axis label + ax[i,j].set_ylabel(target,fontdict=fontAxis) + + # Get, set, and format y-axis Price labels + yticklab = ax[i,j].get_yticklabels() + ax[i,j].set_yticklabels(yticklab,fontdict=fontTicks) + ax[i,j].yaxis.set_major_formatter(mtick.ScalarFormatter()) + + # ax[i,j].get_yaxis().set_major_formatter(tickPrice) + + # Set y-grid + ax[i, j].set_axisbelow(True) + ax[i, j].grid(axis='y',ls='--') + + ## ---------- Final layout adjustments ----------- ## + # Deleted unused subplots + fig.delaxes(ax[1,1]) + fig.delaxes(ax[1,0]) + + # Optimizing spatial layout + fig.tight_layout() + figtitle=column+'_dist_regr_plots.png' +# plt.savefig(figtitle) + return + +# Tukey's method using IQR to eliminate +def detect_outliers(df, n, features): + """Uses Tukey's method to return outer of interquartile ranges to return indices if outliers in a dataframe. + Parameters: + df (DataFrame): DataFrane containing columns of features + n: default is 0, multiple outlier cutoff + + Returns: + Index of outliers for .loc + + Examples: + Outliers_to_drop = detect_outliers(data,2,["col1","col2"]) Returning value + df.loc[Outliers_to_drop] # Show the outliers rows + data= data.drop(Outliers_to_drop, axis = 0).reset_index(drop=True) +""" + +# Drop outliers + + outlier_indices = [] + # iterate over features(columns) + for col in features: + + # 1st quartile (25%) + Q1 = np.percentile(df[col], 25) + # 3rd quartile (75%) + Q3 = np.percentile(df[col],75) + + # Interquartile range (IQR) + IQR = Q3 - Q1 + # outlier step + outlier_step = 1.5 * IQR + + # Determine a list of indices of outliers for feature col + outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step )].index + + # append the found outlier indices for col to the list of outlier indices + outlier_indices.extend(outlier_list_col) + + # select observations containing more than 2 outliers + outlier_indices = Counter(outlier_indices) + multiple_outliers = list( k for k, v in outlier_indices.items() if v > n ) + return multiple_outliers + + +# describe_outliers -- calls detect_outliers +def describe_outliers(df): + """ Returns a new_df of outliers, and % outliers each col using detect_outliers. + """ + out_count = 0 + new_df = pd.DataFrame(columns=['total_outliers', 'percent_total']) + for col in df.columns: + outies = detect_outliers(df[col]) + out_count += len(outies) + new_df.loc[col] = [len(outies), round((len(outies)/len(df.index))*100, 2)] + new_df.loc['grand_total'] = [sum(new_df['total_outliers']), sum(new_df['percent_total'])] + return new_df + + +#### Cohen's d +def Cohen_d(group1, group2): + '''Compute Cohen's d. + # group1: Series or NumPy array + # group2: Series or NumPy array + # returns a floating point number + ''' + diff = group1.mean() - group2.mean() + + n1, n2 = len(group1), len(group2) + var1 = group1.var() + var2 = group2.var() + + # Calculate the pooled threshold as shown earlier + pooled_var = (n1 * var1 + n2 * var2) / (n1 + n2) + + # Calculate Cohen's d statistic + d = diff / np.sqrt(pooled_var) + + return d + + +def plot_pdfs(cohen_d=2): + """Plot PDFs for distributions that differ by some number of stds. + + cohen_d: number of standard deviations between the means + """ + group1 = scipy.stats.norm(0, 1) + group2 = scipy.stats.norm(cohen_d, 1) + xs, ys = evaluate_PDF(group1) + pyplot.fill_between(xs, ys, label='Group1', color='#ff2289', alpha=0.7) + + xs, ys = evaluate_PDF(group2) + pyplot.fill_between(xs, ys, label='Group2', color='#376cb0', alpha=0.7) + + o, s = overlap_superiority(group1, group2) + print('overlap', o) + print('superiority', s) + +def list2df(list):#, sort_values='index'): + """ Take in a list where row[0] = column_names and outputs a dataframe. + + Keyword arguments: + set_index -- df.set_index(set_index) + sortby -- df.sorted() + """ + + df_list = pd.DataFrame(list[1:],columns=list[0]) +# df_list = df_list[1:] + + return df_list + +def df_drop_regex(DF, regex_list): + '''Use a list of regex to remove columns names. Returns new df. + + Parameters: + DF -- input dataframe to remove columns from. + regex_list -- list of string patterns or regexp to remove. + + Returns: + df_cut -- input df without the dropped columns. + ''' + df_cut = DF.copy() + + for r in regex_list: + + df_cut = df_cut[df_cut.columns.drop(list(df_cut.filter(regex=r)))] + print(f'Removed {r}\n') + + return df_cut + + + +####### MIKE's PLOTTING +# plotting order totals per month in violin plots + +def make_violinplot(x,y, title=None, hue=None, ticklabels=None): + + '''Plots a violin plot with horizontal mean line, inner stick lines''' + + plt.style.use('dark_background') + fig,ax =plt.subplots(figsize=(12,10)) + + + sns.violinplot(x, y,cut=2,split=True, scale='count', scale_hue=True, + saturation=.5, alpha=.9,bw=.25, palette='Dark2',inner='stick', hue=hue).set_title(title) + + ax.axhline(y.mean(),label='total mean', ls=':', alpha=.5, color='xkcd:yellow') + ax.set_xticklabels(ticklabels) + + plt.legend() + plt.show() + x= df_year_orders['month'] + y= df_year_orders['order_total'] + title = 'Order totals per month with or without discounts' + hue=df_year_orders['Discount']>0 + + +### Example usage +# #First, declare variables to be plotted +# x = df_year_orders['month'] +# y = df_year_orders['order_total'] +# ticks = [v for v in month_dict.values()] +# title = 'Order totals per month with or without discounts' +# hue = df_year_orders['Discount']>0 + +### Then call function +# make_violinplot(x,y,title,hue, ticks), + + +########### +def plot_wide_kde_thin_bar(series1,sname1, series2, sname2): + '''Plot series1 and series 2 on wide kde plot with small mean+sem bar plot.''' + + ## ADDING add_gridspec usage + import pandas as pd + import numpy as np + from scipy.stats import sem + + import matplotlib.pyplot as plt + import matplotlib as mpl + import matplotlib.ticker as ticker + + import seaborn as sns + + from matplotlib import rcParams + from matplotlib import rc + rcParams['font.family'] = 'serif' + + + + + # Plot distributions of discounted vs full price groups + plt.style.use('default') + # with plt.style.context(('tableau-colorblind10')): + with plt.style.context(('seaborn-notebook')): + + + + ## ----------- DEFINE AESTHETIC CUSTOMIZATIONS ----------- ## + # Axis Label fonts + fontSuptitle ={'fontsize': 22, + 'fontweight': 'bold', + 'fontfamily':'serif'} + + fontTitle = {'fontsize': 10, + 'fontweight': 'medium', + 'fontfamily':'serif'} + + fontAxis = {'fontsize': 10, + 'fontweight': 'medium', + 'fontfamily':'serif'} + + fontTicks = {'fontsize': 8, + 'fontweight':'medium', + 'fontfamily':'serif'} + + + ## --------- CREATE FIG BASED ON GRIDSPEC --------- ## + + plt.suptitle('Quantity of Units Sold', fontdict = fontSuptitle) + + # Create fig object and declare figsize + fig = plt.figure(constrained_layout=True, figsize=(8,3)) + + + # Define gridspec to create grid coordinates + gs = fig.add_gridspec(nrows=1,ncols=10) + + # Assign grid space to ax with add_subplot + ax0 = fig.add_subplot(gs[0,0:7]) + ax1 = fig.add_subplot(gs[0,7:10]) + + #Combine into 1 list + ax = [ax0,ax1] + + ### ------------------ SUBPLOT 1 ------------------ ### + + ## --------- Defining series1 and 2 for subplot 1------- ## + ax[0].set_title('Histogram + KDE',fontdict=fontTitle) + + # Group 1: data, label, hist_kws and kde_kws + plotS1 = {'data': series1, 'label': sname1.title(), + + 'hist_kws' : + {'edgecolor': 'black', 'color':'darkgray','alpha': 0.8, 'lw':0.5}, + + 'kde_kws': + {'color':'gray', 'linestyle': '--', 'linewidth':2, + 'label':'kde'}} + + # Group 2: data, label, hist_kws and kde_kws + plotS2 = {'data': series2, + 'label': sname2.title(), + + 'hist_kws' : + {'edgecolor': 'black','color':'green','alpha':0.8 ,'lw':0.5}, + + + 'kde_kws': + {'color':'darkgreen','linestyle':':','linewidth':3,'label':'kde'}} + + # plot group 1 + sns.distplot(plotS1['data'], label=plotS1['label'], + + hist_kws = plotS1['hist_kws'], kde_kws = plotS1['kde_kws'], + + ax=ax[0]) + + + # plot group 2 + sns.distplot(plotS2['data'], label=plotS2['label'], + + hist_kws=plotS2['hist_kws'], kde_kws = plotS2['kde_kws'], + + ax=ax[0]) + + + ax[0].set_xlabel(series1.name, fontdict=fontAxis) + ax[0].set_ylabel('Kernel Density Estimation',fontdict=fontAxis) + + ax[0].tick_params(axis='both',labelsize=fontTicks['fontsize']) + ax[0].legend() + + + ### ------------------ SUBPLOT 2 ------------------ ### + + # Import scipy for error bars + from scipy.stats import sem + + # Declare x y group labels(x) and bar heights(y) + x = [plotS1['label'], plotS2['label']] + y = [np.mean(plotS1['data']), np.mean(plotS2['data'])] + + yerr = [sem(plotS1['data']), sem(plotS2['data'])] + err_kws = {'ecolor':'black','capsize':5,'capthick':1,'elinewidth':1} + + # Create the bar plot + ax[1].bar(x,y,align='center', edgecolor='black', yerr=yerr,error_kw=err_kws,width=0.6) + + + # Customize subplot 2 + ax[1].set_title('Average Quantities Sold',fontdict=fontTitle) + ax[1].set_ylabel('Mean +/- SEM ',fontdict=fontAxis) + ax[1].set_xlabel('') + + ax[1].tick_params(axis=y,labelsize=fontTicks['fontsize']) + ax[1].tick_params(axis=x,labelsize=fontTicks['fontsize']) + + ax1=ax[1] + test = ax1.get_xticklabels() + labels = [x.get_text() for x in test] + ax1.set_xticklabels([plotS1['label'],plotS2['label']], rotation=45,ha='center') + +# xlab = [x.get_text() for x in xlablist] +# ax[1].set_xticklabels(xlab,rotation=45) + +# fig.savefig('H1_EDA_using_gridspec.png') +# plt.tight_layout() + # print(f') + plt.show() + + return fig,ax + +``` + + +```python + +``` + + + + +%package help +Summary: Development documents and examples for JMI-MVM +Provides: python3-JMI-MVM-doc +%description help +# JMI_MVM + +- A collection of tools created for botcmap. +- More information to be added later. + + +<h1>Table of Contents<span class="tocSkip"></span></h1> +<div class="toc"><ul class="toc-item"></ul></div> + + +```python +name = "JMI_MVM" +help_ = " Recommended Functions to try: \n calc_roc_auc & tune_params\n plot_hist_scat_sns & multiplot\n list2df & df_drop_regex\n plot_wide_kde_thin_bar & make_violinplot\n" +#functions.py + +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +import matplotlib as mpl +import seaborn as sns + + +def calc_roc_auc(X_test,y_test,dtc,verbose=False): + """Tests the results of an already-fit classifer. + Takes X_test, y_test, classifer, verbose (True" print result) + Returns the AUC for the roc_curve as a %""" + y_pred = dtc.predict(X_test) + + FP_rate, TP_rate, thresh = roc_curve(y_test,y_pred) + roc_auc = auc(FP_rate,TP_rate) + roc_auc_perc = round(roc_auc*100,3) + # Your code here + if verbose: + print(f"roc_curve's auc = {roc_auc_perc}%") + return roc_auc_perc + +def tune_params(param_name, param_values): + """Takes in param_name to tune with param_values, plots train vs test AUC's. + Returns df_results and df_style with color coded results""" + res_list = [[param_name,'train_roc_auc','test_roc_auc']] + + # Loop through all values in param_values + for value in param_values: + # Create Model, set params + dtc_temp = DecisionTreeClassifier(criterion='entropy') + params={param_name:value} + dtc_temp.set_params(**params) + + # Fit model + dtc_temp.fit(X_train, y_train) + + # Get roc_auc for training data + train_roc_auc = calc_roc_auc(X_train,y_train,dtc_temp) + # Get roc_auc for test data + test_res_roc_auc = calc_roc_auc(X_test,y_test,dtc_temp) + # Append value and results to res_list + res_list.append([value,train_roc_auc,test_res_roc_auc]) + + # Turn results into df_results (basically same as using list2df) + df_results = pd.DataFrame(res_list[1:],columns=res_list[0]) + df_results.set_index(param_name,inplace=True) + + # Plot df_results + df_results.plot() + + # Color-coded dataframe s + import seaborn as sns + cm = sns.light_palette("green", as_cmap=True) + df_syle = df_results.style.background_gradient(cmap=cm)#,low=results.min(),high=results.max()) + + return df_results, df_syle + + +# MULTIPLOT +from string import ascii_letters +import numpy as np +import pandas as pd +import seaborn as sns +import matplotlib.pyplot as plt + + +def multiplot(df): + """Plots results from df.corr() in a correlation heat map for multicollinearity. + Returns fig, ax objects""" + sns.set(style="white") + + # Compute the correlation matrix + corr = df.corr() + + # Generate a mask for the upper triangle + mask = np.zeros_like(corr, dtype=np.bool) + mask[np.triu_indices_from(mask)] = True + + # Set up the matplotlib figure + f, ax = plt.subplots(figsize=(16, 16)) + + # Generate a custom diverging colormap + cmap = sns.diverging_palette(220, 10, as_cmap=True) + + # Draw the heatmap with the mask and correct aspect ratio + sns.heatmap(corr, mask=mask, annot=True, cmap=cmap, center=0, + + square=True, linewidths=.5, cbar_kws={"shrink": .5}) + return f, ax + + + +# Plots histogram and scatter (vs price) side by side +# Plots histogram and scatter (vs price) side by side +def plot_hist_scat_sns(df, target='index'): + """Plots seaborne distplots and regplots for columns im datamframe vs target. + + Parameters: + df (DataFrame): DataFrame.describe() columns will be used. + target = name of column containing target variable.assume first coluumn. + + Returns: + Figures for each column vs target with 2 subplots. + """ + import matplotlib.ticker as mtick + import matplotlib.pyplot as plt + import seaborn as sns + + with plt.style.context(('dark_background')): + ### DEFINE AESTHETIC CUSTOMIZATIONS -------------------------------## + + +# plt.style.use('dark_background') + figsize=(9,7) + + # Axis Label fonts + fontTitle = {'fontsize': 14, + 'fontweight': 'bold', + 'fontfamily':'serif'} + + fontAxis = {'fontsize': 12, + 'fontweight': 'medium', + 'fontfamily':'serif'} + + fontTicks = {'fontsize': 8, + 'fontweight':'medium', + 'fontfamily':'serif'} + + # Formatting dollar sign labels + fmtPrice = '${x:,.0f}' + tickPrice = mtick.StrMethodFormatter(fmtPrice) + + + ### PLOTTING ----------------------------- ------------------------ ## + + # Loop through dataframe to plot + for column in df.describe(): +# print(f'\nCurrent column: {column}') + + # Create figure with subplots for current column + fig, ax = plt.subplots(figsize=figsize, ncols=2, nrows=2) + + ## SUBPLOT 1 --------------------------------------------------## + i,j = 0,0 + ax[i,j].set_title(column.capitalize(),fontdict=fontTitle) + + # Define graphing keyword dictionaries for distplot (Subplot 1) + hist_kws = {"linewidth": 1, "alpha": 1, "color": 'blue','edgecolor':'w'} + kde_kws = {"color": "white", "linewidth": 1, "label": "KDE"} + + # Plot distplot on ax[i,j] using hist_kws and kde_kws + sns.distplot(df[column], norm_hist=True, kde=True, + hist_kws = hist_kws, kde_kws = kde_kws, + label=column+' histogram', ax=ax[i,j]) + + + # Set x axis label + ax[i,j].set_xlabel(column.title(),fontdict=fontAxis) + + # Get x-ticks, rotate labels, and return + xticklab1 = ax[i,j].get_xticklabels(which = 'both') + ax[i,j].set_xticklabels(labels=xticklab1, fontdict=fontTicks, rotation=0) + ax[i,j].xaxis.set_major_formatter(mtick.ScalarFormatter()) + + + # Set y-label + ax[i,j].set_ylabel('Density',fontdict=fontAxis) + yticklab1=ax[i,j].get_yticklabels(which='both') + ax[i,j].set_yticklabels(labels=yticklab1,fontdict=fontTicks) + ax[i,j].yaxis.set_major_formatter(mtick.ScalarFormatter()) + + + # Set y-grid + ax[i, j].set_axisbelow(True) + ax[i, j].grid(axis='y',ls='--') + + + + + ## SUBPLOT 2-------------------------------------------------- ## + i,j = 0,1 + ax[i,j].set_title(column.capitalize(),fontdict=fontTitle) + + # Define the kwd dictionaries for scatter and regression line (subplot 2) + line_kws={"color":"white","alpha":0.5,"lw":4,"ls":":"} + scatter_kws={'s': 2, 'alpha': 0.5,'marker':'.','color':'blue'} + + # Plot regplot on ax[i,j] using line_kws and scatter_kws + sns.regplot(df[column], df[target], + line_kws = line_kws, + scatter_kws = scatter_kws, + ax=ax[i,j]) + + # Set x-axis label + ax[i,j].set_xlabel(column.title(),fontdict=fontAxis) + + # Get x ticks, rotate labels, and return + xticklab2=ax[i,j].get_xticklabels(which='both') + ax[i,j].set_xticklabels(labels=xticklab2,fontdict=fontTicks, rotation=0) + ax[i,j].xaxis.set_major_formatter(mtick.ScalarFormatter()) + + # Set y-axis label + ax[i,j].set_ylabel(target,fontdict=fontAxis) + + # Get, set, and format y-axis Price labels + yticklab = ax[i,j].get_yticklabels() + ax[i,j].set_yticklabels(yticklab,fontdict=fontTicks) + ax[i,j].yaxis.set_major_formatter(mtick.ScalarFormatter()) + + # ax[i,j].get_yaxis().set_major_formatter(tickPrice) + + # Set y-grid + ax[i, j].set_axisbelow(True) + ax[i, j].grid(axis='y',ls='--') + + ## ---------- Final layout adjustments ----------- ## + # Deleted unused subplots + fig.delaxes(ax[1,1]) + fig.delaxes(ax[1,0]) + + # Optimizing spatial layout + fig.tight_layout() + figtitle=column+'_dist_regr_plots.png' +# plt.savefig(figtitle) + return + +# Tukey's method using IQR to eliminate +def detect_outliers(df, n, features): + """Uses Tukey's method to return outer of interquartile ranges to return indices if outliers in a dataframe. + Parameters: + df (DataFrame): DataFrane containing columns of features + n: default is 0, multiple outlier cutoff + + Returns: + Index of outliers for .loc + + Examples: + Outliers_to_drop = detect_outliers(data,2,["col1","col2"]) Returning value + df.loc[Outliers_to_drop] # Show the outliers rows + data= data.drop(Outliers_to_drop, axis = 0).reset_index(drop=True) +""" + +# Drop outliers + + outlier_indices = [] + # iterate over features(columns) + for col in features: + + # 1st quartile (25%) + Q1 = np.percentile(df[col], 25) + # 3rd quartile (75%) + Q3 = np.percentile(df[col],75) + + # Interquartile range (IQR) + IQR = Q3 - Q1 + # outlier step + outlier_step = 1.5 * IQR + + # Determine a list of indices of outliers for feature col + outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step )].index + + # append the found outlier indices for col to the list of outlier indices + outlier_indices.extend(outlier_list_col) + + # select observations containing more than 2 outliers + outlier_indices = Counter(outlier_indices) + multiple_outliers = list( k for k, v in outlier_indices.items() if v > n ) + return multiple_outliers + + +# describe_outliers -- calls detect_outliers +def describe_outliers(df): + """ Returns a new_df of outliers, and % outliers each col using detect_outliers. + """ + out_count = 0 + new_df = pd.DataFrame(columns=['total_outliers', 'percent_total']) + for col in df.columns: + outies = detect_outliers(df[col]) + out_count += len(outies) + new_df.loc[col] = [len(outies), round((len(outies)/len(df.index))*100, 2)] + new_df.loc['grand_total'] = [sum(new_df['total_outliers']), sum(new_df['percent_total'])] + return new_df + + +#### Cohen's d +def Cohen_d(group1, group2): + '''Compute Cohen's d. + # group1: Series or NumPy array + # group2: Series or NumPy array + # returns a floating point number + ''' + diff = group1.mean() - group2.mean() + + n1, n2 = len(group1), len(group2) + var1 = group1.var() + var2 = group2.var() + + # Calculate the pooled threshold as shown earlier + pooled_var = (n1 * var1 + n2 * var2) / (n1 + n2) + + # Calculate Cohen's d statistic + d = diff / np.sqrt(pooled_var) + + return d + + +def plot_pdfs(cohen_d=2): + """Plot PDFs for distributions that differ by some number of stds. + + cohen_d: number of standard deviations between the means + """ + group1 = scipy.stats.norm(0, 1) + group2 = scipy.stats.norm(cohen_d, 1) + xs, ys = evaluate_PDF(group1) + pyplot.fill_between(xs, ys, label='Group1', color='#ff2289', alpha=0.7) + + xs, ys = evaluate_PDF(group2) + pyplot.fill_between(xs, ys, label='Group2', color='#376cb0', alpha=0.7) + + o, s = overlap_superiority(group1, group2) + print('overlap', o) + print('superiority', s) + +def list2df(list):#, sort_values='index'): + """ Take in a list where row[0] = column_names and outputs a dataframe. + + Keyword arguments: + set_index -- df.set_index(set_index) + sortby -- df.sorted() + """ + + df_list = pd.DataFrame(list[1:],columns=list[0]) +# df_list = df_list[1:] + + return df_list + +def df_drop_regex(DF, regex_list): + '''Use a list of regex to remove columns names. Returns new df. + + Parameters: + DF -- input dataframe to remove columns from. + regex_list -- list of string patterns or regexp to remove. + + Returns: + df_cut -- input df without the dropped columns. + ''' + df_cut = DF.copy() + + for r in regex_list: + + df_cut = df_cut[df_cut.columns.drop(list(df_cut.filter(regex=r)))] + print(f'Removed {r}\n') + + return df_cut + + + +####### MIKE's PLOTTING +# plotting order totals per month in violin plots + +def make_violinplot(x,y, title=None, hue=None, ticklabels=None): + + '''Plots a violin plot with horizontal mean line, inner stick lines''' + + plt.style.use('dark_background') + fig,ax =plt.subplots(figsize=(12,10)) + + + sns.violinplot(x, y,cut=2,split=True, scale='count', scale_hue=True, + saturation=.5, alpha=.9,bw=.25, palette='Dark2',inner='stick', hue=hue).set_title(title) + + ax.axhline(y.mean(),label='total mean', ls=':', alpha=.5, color='xkcd:yellow') + ax.set_xticklabels(ticklabels) + + plt.legend() + plt.show() + x= df_year_orders['month'] + y= df_year_orders['order_total'] + title = 'Order totals per month with or without discounts' + hue=df_year_orders['Discount']>0 + + +### Example usage +# #First, declare variables to be plotted +# x = df_year_orders['month'] +# y = df_year_orders['order_total'] +# ticks = [v for v in month_dict.values()] +# title = 'Order totals per month with or without discounts' +# hue = df_year_orders['Discount']>0 + +### Then call function +# make_violinplot(x,y,title,hue, ticks), + + +########### +def plot_wide_kde_thin_bar(series1,sname1, series2, sname2): + '''Plot series1 and series 2 on wide kde plot with small mean+sem bar plot.''' + + ## ADDING add_gridspec usage + import pandas as pd + import numpy as np + from scipy.stats import sem + + import matplotlib.pyplot as plt + import matplotlib as mpl + import matplotlib.ticker as ticker + + import seaborn as sns + + from matplotlib import rcParams + from matplotlib import rc + rcParams['font.family'] = 'serif' + + + + + # Plot distributions of discounted vs full price groups + plt.style.use('default') + # with plt.style.context(('tableau-colorblind10')): + with plt.style.context(('seaborn-notebook')): + + + + ## ----------- DEFINE AESTHETIC CUSTOMIZATIONS ----------- ## + # Axis Label fonts + fontSuptitle ={'fontsize': 22, + 'fontweight': 'bold', + 'fontfamily':'serif'} + + fontTitle = {'fontsize': 10, + 'fontweight': 'medium', + 'fontfamily':'serif'} + + fontAxis = {'fontsize': 10, + 'fontweight': 'medium', + 'fontfamily':'serif'} + + fontTicks = {'fontsize': 8, + 'fontweight':'medium', + 'fontfamily':'serif'} + + + ## --------- CREATE FIG BASED ON GRIDSPEC --------- ## + + plt.suptitle('Quantity of Units Sold', fontdict = fontSuptitle) + + # Create fig object and declare figsize + fig = plt.figure(constrained_layout=True, figsize=(8,3)) + + + # Define gridspec to create grid coordinates + gs = fig.add_gridspec(nrows=1,ncols=10) + + # Assign grid space to ax with add_subplot + ax0 = fig.add_subplot(gs[0,0:7]) + ax1 = fig.add_subplot(gs[0,7:10]) + + #Combine into 1 list + ax = [ax0,ax1] + + ### ------------------ SUBPLOT 1 ------------------ ### + + ## --------- Defining series1 and 2 for subplot 1------- ## + ax[0].set_title('Histogram + KDE',fontdict=fontTitle) + + # Group 1: data, label, hist_kws and kde_kws + plotS1 = {'data': series1, 'label': sname1.title(), + + 'hist_kws' : + {'edgecolor': 'black', 'color':'darkgray','alpha': 0.8, 'lw':0.5}, + + 'kde_kws': + {'color':'gray', 'linestyle': '--', 'linewidth':2, + 'label':'kde'}} + + # Group 2: data, label, hist_kws and kde_kws + plotS2 = {'data': series2, + 'label': sname2.title(), + + 'hist_kws' : + {'edgecolor': 'black','color':'green','alpha':0.8 ,'lw':0.5}, + + + 'kde_kws': + {'color':'darkgreen','linestyle':':','linewidth':3,'label':'kde'}} + + # plot group 1 + sns.distplot(plotS1['data'], label=plotS1['label'], + + hist_kws = plotS1['hist_kws'], kde_kws = plotS1['kde_kws'], + + ax=ax[0]) + + + # plot group 2 + sns.distplot(plotS2['data'], label=plotS2['label'], + + hist_kws=plotS2['hist_kws'], kde_kws = plotS2['kde_kws'], + + ax=ax[0]) + + + ax[0].set_xlabel(series1.name, fontdict=fontAxis) + ax[0].set_ylabel('Kernel Density Estimation',fontdict=fontAxis) + + ax[0].tick_params(axis='both',labelsize=fontTicks['fontsize']) + ax[0].legend() + + + ### ------------------ SUBPLOT 2 ------------------ ### + + # Import scipy for error bars + from scipy.stats import sem + + # Declare x y group labels(x) and bar heights(y) + x = [plotS1['label'], plotS2['label']] + y = [np.mean(plotS1['data']), np.mean(plotS2['data'])] + + yerr = [sem(plotS1['data']), sem(plotS2['data'])] + err_kws = {'ecolor':'black','capsize':5,'capthick':1,'elinewidth':1} + + # Create the bar plot + ax[1].bar(x,y,align='center', edgecolor='black', yerr=yerr,error_kw=err_kws,width=0.6) + + + # Customize subplot 2 + ax[1].set_title('Average Quantities Sold',fontdict=fontTitle) + ax[1].set_ylabel('Mean +/- SEM ',fontdict=fontAxis) + ax[1].set_xlabel('') + + ax[1].tick_params(axis=y,labelsize=fontTicks['fontsize']) + ax[1].tick_params(axis=x,labelsize=fontTicks['fontsize']) + + ax1=ax[1] + test = ax1.get_xticklabels() + labels = [x.get_text() for x in test] + ax1.set_xticklabels([plotS1['label'],plotS2['label']], rotation=45,ha='center') + +# xlab = [x.get_text() for x in xlablist] +# ax[1].set_xticklabels(xlab,rotation=45) + +# fig.savefig('H1_EDA_using_gridspec.png') +# plt.tight_layout() + # print(f') + plt.show() + + return fig,ax + +``` + + +```python + +``` + + + + +%prep +%autosetup -n JMI-MVM-0.3.4 + +%build +%py3_build + +%install +%py3_install +install -d -m755 %{buildroot}/%{_pkgdocdir} +if [ -d doc ]; then cp -arf doc %{buildroot}/%{_pkgdocdir}; fi +if [ -d docs ]; then cp -arf docs %{buildroot}/%{_pkgdocdir}; fi +if [ -d example ]; then cp -arf example %{buildroot}/%{_pkgdocdir}; fi +if [ -d examples ]; then cp -arf examples %{buildroot}/%{_pkgdocdir}; fi +pushd %{buildroot} +if [ -d usr/lib ]; then + find usr/lib -type f -printf "/%h/%f\n" >> filelist.lst +fi +if [ -d usr/lib64 ]; then + find usr/lib64 -type f -printf "/%h/%f\n" >> filelist.lst +fi +if [ -d usr/bin ]; then + find usr/bin -type f -printf "/%h/%f\n" >> filelist.lst +fi +if [ -d usr/sbin ]; then + find usr/sbin -type f -printf "/%h/%f\n" >> filelist.lst +fi +touch doclist.lst +if [ -d usr/share/man ]; then + find usr/share/man -type f -printf "/%h/%f.gz\n" >> doclist.lst +fi +popd +mv %{buildroot}/filelist.lst . +mv %{buildroot}/doclist.lst . + +%files -n python3-JMI-MVM -f filelist.lst +%dir %{python3_sitelib}/* + +%files help -f doclist.lst +%{_docdir}/* + +%changelog +* Wed May 31 2023 Python_Bot <Python_Bot@openeuler.org> - 0.3.4-1 +- Package Spec generated @@ -0,0 +1 @@ +add6261816be7d1396c08729e51554ca JMI_MVM-0.3.4.tar.gz |