summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCoprDistGit <infra@openeuler.org>2023-05-31 03:14:01 +0000
committerCoprDistGit <infra@openeuler.org>2023-05-31 03:14:01 +0000
commit95e14620f51ced7d4d299a68b214d6a01b183f92 (patch)
treea6c1c0e23853f88aa12e4b9f73e335c3d3bd9f79
parent3acf606755cc639eec50667e570f263520529812 (diff)
automatic import of python-jmi-mvm
-rw-r--r--.gitignore1
-rw-r--r--python-jmi-mvm.spec1793
-rw-r--r--sources1
3 files changed, 1795 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
index e69de29..fdc752e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1 @@
+/JMI_MVM-0.3.4.tar.gz
diff --git a/python-jmi-mvm.spec b/python-jmi-mvm.spec
new file mode 100644
index 0000000..da60e6d
--- /dev/null
+++ b/python-jmi-mvm.spec
@@ -0,0 +1,1793 @@
+%global _empty_manifest_terminate_build 0
+Name: python-JMI-MVM
+Version: 0.3.4
+Release: 1
+Summary: A collection of our functions and classes from bootcamp.
+License: MIT License
+URL: https://github.com/jirvingphd/JMI_MVM
+Source0: https://mirrors.nju.edu.cn/pypi/web/packages/2c/81/93d0dcde8641c7785f13d50de7ad1117ebf74856ad41f2790e6b31e80883/JMI_MVM-0.3.4.tar.gz
+BuildArch: noarch
+
+Requires: python3-numpy
+Requires: python3-pandas
+Requires: python3-seaborn
+Requires: python3-matplotlib
+Requires: python3-sklearn
+Requires: python3-pydotplus
+Requires: python3-scipy
+Requires: python3-xgboost
+
+%description
+# JMI_MVM
+
+- A collection of tools created for botcmap.
+- More information to be added later.
+
+
+<h1>Table of Contents<span class="tocSkip"></span></h1>
+<div class="toc"><ul class="toc-item"></ul></div>
+
+
+```python
+name = "JMI_MVM"
+help_ = " Recommended Functions to try: \n calc_roc_auc & tune_params\n plot_hist_scat_sns & multiplot\n list2df & df_drop_regex\n plot_wide_kde_thin_bar & make_violinplot\n"
+#functions.py
+
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib as mpl
+import seaborn as sns
+
+
+def calc_roc_auc(X_test,y_test,dtc,verbose=False):
+ """Tests the results of an already-fit classifer.
+ Takes X_test, y_test, classifer, verbose (True" print result)
+ Returns the AUC for the roc_curve as a %"""
+ y_pred = dtc.predict(X_test)
+
+ FP_rate, TP_rate, thresh = roc_curve(y_test,y_pred)
+ roc_auc = auc(FP_rate,TP_rate)
+ roc_auc_perc = round(roc_auc*100,3)
+ # Your code here
+ if verbose:
+ print(f"roc_curve's auc = {roc_auc_perc}%")
+ return roc_auc_perc
+
+def tune_params(param_name, param_values):
+ """Takes in param_name to tune with param_values, plots train vs test AUC's.
+ Returns df_results and df_style with color coded results"""
+ res_list = [[param_name,'train_roc_auc','test_roc_auc']]
+
+ # Loop through all values in param_values
+ for value in param_values:
+ # Create Model, set params
+ dtc_temp = DecisionTreeClassifier(criterion='entropy')
+ params={param_name:value}
+ dtc_temp.set_params(**params)
+
+ # Fit model
+ dtc_temp.fit(X_train, y_train)
+
+ # Get roc_auc for training data
+ train_roc_auc = calc_roc_auc(X_train,y_train,dtc_temp)
+ # Get roc_auc for test data
+ test_res_roc_auc = calc_roc_auc(X_test,y_test,dtc_temp)
+ # Append value and results to res_list
+ res_list.append([value,train_roc_auc,test_res_roc_auc])
+
+ # Turn results into df_results (basically same as using list2df)
+ df_results = pd.DataFrame(res_list[1:],columns=res_list[0])
+ df_results.set_index(param_name,inplace=True)
+
+ # Plot df_results
+ df_results.plot()
+
+ # Color-coded dataframe s
+ import seaborn as sns
+ cm = sns.light_palette("green", as_cmap=True)
+ df_syle = df_results.style.background_gradient(cmap=cm)#,low=results.min(),high=results.max())
+
+ return df_results, df_syle
+
+
+# MULTIPLOT
+from string import ascii_letters
+import numpy as np
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+
+def multiplot(df):
+ """Plots results from df.corr() in a correlation heat map for multicollinearity.
+ Returns fig, ax objects"""
+ sns.set(style="white")
+
+ # Compute the correlation matrix
+ corr = df.corr()
+
+ # Generate a mask for the upper triangle
+ mask = np.zeros_like(corr, dtype=np.bool)
+ mask[np.triu_indices_from(mask)] = True
+
+ # Set up the matplotlib figure
+ f, ax = plt.subplots(figsize=(16, 16))
+
+ # Generate a custom diverging colormap
+ cmap = sns.diverging_palette(220, 10, as_cmap=True)
+
+ # Draw the heatmap with the mask and correct aspect ratio
+ sns.heatmap(corr, mask=mask, annot=True, cmap=cmap, center=0,
+
+ square=True, linewidths=.5, cbar_kws={"shrink": .5})
+ return f, ax
+
+
+
+# Plots histogram and scatter (vs price) side by side
+# Plots histogram and scatter (vs price) side by side
+def plot_hist_scat_sns(df, target='index'):
+ """Plots seaborne distplots and regplots for columns im datamframe vs target.
+
+ Parameters:
+ df (DataFrame): DataFrame.describe() columns will be used.
+ target = name of column containing target variable.assume first coluumn.
+
+ Returns:
+ Figures for each column vs target with 2 subplots.
+ """
+ import matplotlib.ticker as mtick
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+
+ with plt.style.context(('dark_background')):
+ ### DEFINE AESTHETIC CUSTOMIZATIONS -------------------------------##
+
+
+# plt.style.use('dark_background')
+ figsize=(9,7)
+
+ # Axis Label fonts
+ fontTitle = {'fontsize': 14,
+ 'fontweight': 'bold',
+ 'fontfamily':'serif'}
+
+ fontAxis = {'fontsize': 12,
+ 'fontweight': 'medium',
+ 'fontfamily':'serif'}
+
+ fontTicks = {'fontsize': 8,
+ 'fontweight':'medium',
+ 'fontfamily':'serif'}
+
+ # Formatting dollar sign labels
+ fmtPrice = '${x:,.0f}'
+ tickPrice = mtick.StrMethodFormatter(fmtPrice)
+
+
+ ### PLOTTING ----------------------------- ------------------------ ##
+
+ # Loop through dataframe to plot
+ for column in df.describe():
+# print(f'\nCurrent column: {column}')
+
+ # Create figure with subplots for current column
+ fig, ax = plt.subplots(figsize=figsize, ncols=2, nrows=2)
+
+ ## SUBPLOT 1 --------------------------------------------------##
+ i,j = 0,0
+ ax[i,j].set_title(column.capitalize(),fontdict=fontTitle)
+
+ # Define graphing keyword dictionaries for distplot (Subplot 1)
+ hist_kws = {"linewidth": 1, "alpha": 1, "color": 'blue','edgecolor':'w'}
+ kde_kws = {"color": "white", "linewidth": 1, "label": "KDE"}
+
+ # Plot distplot on ax[i,j] using hist_kws and kde_kws
+ sns.distplot(df[column], norm_hist=True, kde=True,
+ hist_kws = hist_kws, kde_kws = kde_kws,
+ label=column+' histogram', ax=ax[i,j])
+
+
+ # Set x axis label
+ ax[i,j].set_xlabel(column.title(),fontdict=fontAxis)
+
+ # Get x-ticks, rotate labels, and return
+ xticklab1 = ax[i,j].get_xticklabels(which = 'both')
+ ax[i,j].set_xticklabels(labels=xticklab1, fontdict=fontTicks, rotation=0)
+ ax[i,j].xaxis.set_major_formatter(mtick.ScalarFormatter())
+
+
+ # Set y-label
+ ax[i,j].set_ylabel('Density',fontdict=fontAxis)
+ yticklab1=ax[i,j].get_yticklabels(which='both')
+ ax[i,j].set_yticklabels(labels=yticklab1,fontdict=fontTicks)
+ ax[i,j].yaxis.set_major_formatter(mtick.ScalarFormatter())
+
+
+ # Set y-grid
+ ax[i, j].set_axisbelow(True)
+ ax[i, j].grid(axis='y',ls='--')
+
+
+
+
+ ## SUBPLOT 2-------------------------------------------------- ##
+ i,j = 0,1
+ ax[i,j].set_title(column.capitalize(),fontdict=fontTitle)
+
+ # Define the kwd dictionaries for scatter and regression line (subplot 2)
+ line_kws={"color":"white","alpha":0.5,"lw":4,"ls":":"}
+ scatter_kws={'s': 2, 'alpha': 0.5,'marker':'.','color':'blue'}
+
+ # Plot regplot on ax[i,j] using line_kws and scatter_kws
+ sns.regplot(df[column], df[target],
+ line_kws = line_kws,
+ scatter_kws = scatter_kws,
+ ax=ax[i,j])
+
+ # Set x-axis label
+ ax[i,j].set_xlabel(column.title(),fontdict=fontAxis)
+
+ # Get x ticks, rotate labels, and return
+ xticklab2=ax[i,j].get_xticklabels(which='both')
+ ax[i,j].set_xticklabels(labels=xticklab2,fontdict=fontTicks, rotation=0)
+ ax[i,j].xaxis.set_major_formatter(mtick.ScalarFormatter())
+
+ # Set y-axis label
+ ax[i,j].set_ylabel(target,fontdict=fontAxis)
+
+ # Get, set, and format y-axis Price labels
+ yticklab = ax[i,j].get_yticklabels()
+ ax[i,j].set_yticklabels(yticklab,fontdict=fontTicks)
+ ax[i,j].yaxis.set_major_formatter(mtick.ScalarFormatter())
+
+ # ax[i,j].get_yaxis().set_major_formatter(tickPrice)
+
+ # Set y-grid
+ ax[i, j].set_axisbelow(True)
+ ax[i, j].grid(axis='y',ls='--')
+
+ ## ---------- Final layout adjustments ----------- ##
+ # Deleted unused subplots
+ fig.delaxes(ax[1,1])
+ fig.delaxes(ax[1,0])
+
+ # Optimizing spatial layout
+ fig.tight_layout()
+ figtitle=column+'_dist_regr_plots.png'
+# plt.savefig(figtitle)
+ return
+
+# Tukey's method using IQR to eliminate
+def detect_outliers(df, n, features):
+ """Uses Tukey's method to return outer of interquartile ranges to return indices if outliers in a dataframe.
+ Parameters:
+ df (DataFrame): DataFrane containing columns of features
+ n: default is 0, multiple outlier cutoff
+
+ Returns:
+ Index of outliers for .loc
+
+ Examples:
+ Outliers_to_drop = detect_outliers(data,2,["col1","col2"]) Returning value
+ df.loc[Outliers_to_drop] # Show the outliers rows
+ data= data.drop(Outliers_to_drop, axis = 0).reset_index(drop=True)
+"""
+
+# Drop outliers
+
+ outlier_indices = []
+ # iterate over features(columns)
+ for col in features:
+
+ # 1st quartile (25%)
+ Q1 = np.percentile(df[col], 25)
+ # 3rd quartile (75%)
+ Q3 = np.percentile(df[col],75)
+
+ # Interquartile range (IQR)
+ IQR = Q3 - Q1
+ # outlier step
+ outlier_step = 1.5 * IQR
+
+ # Determine a list of indices of outliers for feature col
+ outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step )].index
+
+ # append the found outlier indices for col to the list of outlier indices
+ outlier_indices.extend(outlier_list_col)
+
+ # select observations containing more than 2 outliers
+ outlier_indices = Counter(outlier_indices)
+ multiple_outliers = list( k for k, v in outlier_indices.items() if v > n )
+ return multiple_outliers
+
+
+# describe_outliers -- calls detect_outliers
+def describe_outliers(df):
+ """ Returns a new_df of outliers, and % outliers each col using detect_outliers.
+ """
+ out_count = 0
+ new_df = pd.DataFrame(columns=['total_outliers', 'percent_total'])
+ for col in df.columns:
+ outies = detect_outliers(df[col])
+ out_count += len(outies)
+ new_df.loc[col] = [len(outies), round((len(outies)/len(df.index))*100, 2)]
+ new_df.loc['grand_total'] = [sum(new_df['total_outliers']), sum(new_df['percent_total'])]
+ return new_df
+
+
+#### Cohen's d
+def Cohen_d(group1, group2):
+ '''Compute Cohen's d.
+ # group1: Series or NumPy array
+ # group2: Series or NumPy array
+ # returns a floating point number
+ '''
+ diff = group1.mean() - group2.mean()
+
+ n1, n2 = len(group1), len(group2)
+ var1 = group1.var()
+ var2 = group2.var()
+
+ # Calculate the pooled threshold as shown earlier
+ pooled_var = (n1 * var1 + n2 * var2) / (n1 + n2)
+
+ # Calculate Cohen's d statistic
+ d = diff / np.sqrt(pooled_var)
+
+ return d
+
+
+def plot_pdfs(cohen_d=2):
+ """Plot PDFs for distributions that differ by some number of stds.
+
+ cohen_d: number of standard deviations between the means
+ """
+ group1 = scipy.stats.norm(0, 1)
+ group2 = scipy.stats.norm(cohen_d, 1)
+ xs, ys = evaluate_PDF(group1)
+ pyplot.fill_between(xs, ys, label='Group1', color='#ff2289', alpha=0.7)
+
+ xs, ys = evaluate_PDF(group2)
+ pyplot.fill_between(xs, ys, label='Group2', color='#376cb0', alpha=0.7)
+
+ o, s = overlap_superiority(group1, group2)
+ print('overlap', o)
+ print('superiority', s)
+
+def list2df(list):#, sort_values='index'):
+ """ Take in a list where row[0] = column_names and outputs a dataframe.
+
+ Keyword arguments:
+ set_index -- df.set_index(set_index)
+ sortby -- df.sorted()
+ """
+
+ df_list = pd.DataFrame(list[1:],columns=list[0])
+# df_list = df_list[1:]
+
+ return df_list
+
+def df_drop_regex(DF, regex_list):
+ '''Use a list of regex to remove columns names. Returns new df.
+
+ Parameters:
+ DF -- input dataframe to remove columns from.
+ regex_list -- list of string patterns or regexp to remove.
+
+ Returns:
+ df_cut -- input df without the dropped columns.
+ '''
+ df_cut = DF.copy()
+
+ for r in regex_list:
+
+ df_cut = df_cut[df_cut.columns.drop(list(df_cut.filter(regex=r)))]
+ print(f'Removed {r}\n')
+
+ return df_cut
+
+
+
+####### MIKE's PLOTTING
+# plotting order totals per month in violin plots
+
+def make_violinplot(x,y, title=None, hue=None, ticklabels=None):
+
+ '''Plots a violin plot with horizontal mean line, inner stick lines'''
+
+ plt.style.use('dark_background')
+ fig,ax =plt.subplots(figsize=(12,10))
+
+
+ sns.violinplot(x, y,cut=2,split=True, scale='count', scale_hue=True,
+ saturation=.5, alpha=.9,bw=.25, palette='Dark2',inner='stick', hue=hue).set_title(title)
+
+ ax.axhline(y.mean(),label='total mean', ls=':', alpha=.5, color='xkcd:yellow')
+ ax.set_xticklabels(ticklabels)
+
+ plt.legend()
+ plt.show()
+ x= df_year_orders['month']
+ y= df_year_orders['order_total']
+ title = 'Order totals per month with or without discounts'
+ hue=df_year_orders['Discount']>0
+
+
+### Example usage
+# #First, declare variables to be plotted
+# x = df_year_orders['month']
+# y = df_year_orders['order_total']
+# ticks = [v for v in month_dict.values()]
+# title = 'Order totals per month with or without discounts'
+# hue = df_year_orders['Discount']>0
+
+### Then call function
+# make_violinplot(x,y,title,hue, ticks),
+
+
+###########
+def plot_wide_kde_thin_bar(series1,sname1, series2, sname2):
+ '''Plot series1 and series 2 on wide kde plot with small mean+sem bar plot.'''
+
+ ## ADDING add_gridspec usage
+ import pandas as pd
+ import numpy as np
+ from scipy.stats import sem
+
+ import matplotlib.pyplot as plt
+ import matplotlib as mpl
+ import matplotlib.ticker as ticker
+
+ import seaborn as sns
+
+ from matplotlib import rcParams
+ from matplotlib import rc
+ rcParams['font.family'] = 'serif'
+
+
+
+
+ # Plot distributions of discounted vs full price groups
+ plt.style.use('default')
+ # with plt.style.context(('tableau-colorblind10')):
+ with plt.style.context(('seaborn-notebook')):
+
+
+
+ ## ----------- DEFINE AESTHETIC CUSTOMIZATIONS ----------- ##
+ # Axis Label fonts
+ fontSuptitle ={'fontsize': 22,
+ 'fontweight': 'bold',
+ 'fontfamily':'serif'}
+
+ fontTitle = {'fontsize': 10,
+ 'fontweight': 'medium',
+ 'fontfamily':'serif'}
+
+ fontAxis = {'fontsize': 10,
+ 'fontweight': 'medium',
+ 'fontfamily':'serif'}
+
+ fontTicks = {'fontsize': 8,
+ 'fontweight':'medium',
+ 'fontfamily':'serif'}
+
+
+ ## --------- CREATE FIG BASED ON GRIDSPEC --------- ##
+
+ plt.suptitle('Quantity of Units Sold', fontdict = fontSuptitle)
+
+ # Create fig object and declare figsize
+ fig = plt.figure(constrained_layout=True, figsize=(8,3))
+
+
+ # Define gridspec to create grid coordinates
+ gs = fig.add_gridspec(nrows=1,ncols=10)
+
+ # Assign grid space to ax with add_subplot
+ ax0 = fig.add_subplot(gs[0,0:7])
+ ax1 = fig.add_subplot(gs[0,7:10])
+
+ #Combine into 1 list
+ ax = [ax0,ax1]
+
+ ### ------------------ SUBPLOT 1 ------------------ ###
+
+ ## --------- Defining series1 and 2 for subplot 1------- ##
+ ax[0].set_title('Histogram + KDE',fontdict=fontTitle)
+
+ # Group 1: data, label, hist_kws and kde_kws
+ plotS1 = {'data': series1, 'label': sname1.title(),
+
+ 'hist_kws' :
+ {'edgecolor': 'black', 'color':'darkgray','alpha': 0.8, 'lw':0.5},
+
+ 'kde_kws':
+ {'color':'gray', 'linestyle': '--', 'linewidth':2,
+ 'label':'kde'}}
+
+ # Group 2: data, label, hist_kws and kde_kws
+ plotS2 = {'data': series2,
+ 'label': sname2.title(),
+
+ 'hist_kws' :
+ {'edgecolor': 'black','color':'green','alpha':0.8 ,'lw':0.5},
+
+
+ 'kde_kws':
+ {'color':'darkgreen','linestyle':':','linewidth':3,'label':'kde'}}
+
+ # plot group 1
+ sns.distplot(plotS1['data'], label=plotS1['label'],
+
+ hist_kws = plotS1['hist_kws'], kde_kws = plotS1['kde_kws'],
+
+ ax=ax[0])
+
+
+ # plot group 2
+ sns.distplot(plotS2['data'], label=plotS2['label'],
+
+ hist_kws=plotS2['hist_kws'], kde_kws = plotS2['kde_kws'],
+
+ ax=ax[0])
+
+
+ ax[0].set_xlabel(series1.name, fontdict=fontAxis)
+ ax[0].set_ylabel('Kernel Density Estimation',fontdict=fontAxis)
+
+ ax[0].tick_params(axis='both',labelsize=fontTicks['fontsize'])
+ ax[0].legend()
+
+
+ ### ------------------ SUBPLOT 2 ------------------ ###
+
+ # Import scipy for error bars
+ from scipy.stats import sem
+
+ # Declare x y group labels(x) and bar heights(y)
+ x = [plotS1['label'], plotS2['label']]
+ y = [np.mean(plotS1['data']), np.mean(plotS2['data'])]
+
+ yerr = [sem(plotS1['data']), sem(plotS2['data'])]
+ err_kws = {'ecolor':'black','capsize':5,'capthick':1,'elinewidth':1}
+
+ # Create the bar plot
+ ax[1].bar(x,y,align='center', edgecolor='black', yerr=yerr,error_kw=err_kws,width=0.6)
+
+
+ # Customize subplot 2
+ ax[1].set_title('Average Quantities Sold',fontdict=fontTitle)
+ ax[1].set_ylabel('Mean +/- SEM ',fontdict=fontAxis)
+ ax[1].set_xlabel('')
+
+ ax[1].tick_params(axis=y,labelsize=fontTicks['fontsize'])
+ ax[1].tick_params(axis=x,labelsize=fontTicks['fontsize'])
+
+ ax1=ax[1]
+ test = ax1.get_xticklabels()
+ labels = [x.get_text() for x in test]
+ ax1.set_xticklabels([plotS1['label'],plotS2['label']], rotation=45,ha='center')
+
+# xlab = [x.get_text() for x in xlablist]
+# ax[1].set_xticklabels(xlab,rotation=45)
+
+# fig.savefig('H1_EDA_using_gridspec.png')
+# plt.tight_layout()
+ # print(f')
+ plt.show()
+
+ return fig,ax
+
+```
+
+
+```python
+
+```
+
+
+
+
+%package -n python3-JMI-MVM
+Summary: A collection of our functions and classes from bootcamp.
+Provides: python-JMI-MVM
+BuildRequires: python3-devel
+BuildRequires: python3-setuptools
+BuildRequires: python3-pip
+%description -n python3-JMI-MVM
+# JMI_MVM
+
+- A collection of tools created for botcmap.
+- More information to be added later.
+
+
+<h1>Table of Contents<span class="tocSkip"></span></h1>
+<div class="toc"><ul class="toc-item"></ul></div>
+
+
+```python
+name = "JMI_MVM"
+help_ = " Recommended Functions to try: \n calc_roc_auc & tune_params\n plot_hist_scat_sns & multiplot\n list2df & df_drop_regex\n plot_wide_kde_thin_bar & make_violinplot\n"
+#functions.py
+
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib as mpl
+import seaborn as sns
+
+
+def calc_roc_auc(X_test,y_test,dtc,verbose=False):
+ """Tests the results of an already-fit classifer.
+ Takes X_test, y_test, classifer, verbose (True" print result)
+ Returns the AUC for the roc_curve as a %"""
+ y_pred = dtc.predict(X_test)
+
+ FP_rate, TP_rate, thresh = roc_curve(y_test,y_pred)
+ roc_auc = auc(FP_rate,TP_rate)
+ roc_auc_perc = round(roc_auc*100,3)
+ # Your code here
+ if verbose:
+ print(f"roc_curve's auc = {roc_auc_perc}%")
+ return roc_auc_perc
+
+def tune_params(param_name, param_values):
+ """Takes in param_name to tune with param_values, plots train vs test AUC's.
+ Returns df_results and df_style with color coded results"""
+ res_list = [[param_name,'train_roc_auc','test_roc_auc']]
+
+ # Loop through all values in param_values
+ for value in param_values:
+ # Create Model, set params
+ dtc_temp = DecisionTreeClassifier(criterion='entropy')
+ params={param_name:value}
+ dtc_temp.set_params(**params)
+
+ # Fit model
+ dtc_temp.fit(X_train, y_train)
+
+ # Get roc_auc for training data
+ train_roc_auc = calc_roc_auc(X_train,y_train,dtc_temp)
+ # Get roc_auc for test data
+ test_res_roc_auc = calc_roc_auc(X_test,y_test,dtc_temp)
+ # Append value and results to res_list
+ res_list.append([value,train_roc_auc,test_res_roc_auc])
+
+ # Turn results into df_results (basically same as using list2df)
+ df_results = pd.DataFrame(res_list[1:],columns=res_list[0])
+ df_results.set_index(param_name,inplace=True)
+
+ # Plot df_results
+ df_results.plot()
+
+ # Color-coded dataframe s
+ import seaborn as sns
+ cm = sns.light_palette("green", as_cmap=True)
+ df_syle = df_results.style.background_gradient(cmap=cm)#,low=results.min(),high=results.max())
+
+ return df_results, df_syle
+
+
+# MULTIPLOT
+from string import ascii_letters
+import numpy as np
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+
+def multiplot(df):
+ """Plots results from df.corr() in a correlation heat map for multicollinearity.
+ Returns fig, ax objects"""
+ sns.set(style="white")
+
+ # Compute the correlation matrix
+ corr = df.corr()
+
+ # Generate a mask for the upper triangle
+ mask = np.zeros_like(corr, dtype=np.bool)
+ mask[np.triu_indices_from(mask)] = True
+
+ # Set up the matplotlib figure
+ f, ax = plt.subplots(figsize=(16, 16))
+
+ # Generate a custom diverging colormap
+ cmap = sns.diverging_palette(220, 10, as_cmap=True)
+
+ # Draw the heatmap with the mask and correct aspect ratio
+ sns.heatmap(corr, mask=mask, annot=True, cmap=cmap, center=0,
+
+ square=True, linewidths=.5, cbar_kws={"shrink": .5})
+ return f, ax
+
+
+
+# Plots histogram and scatter (vs price) side by side
+# Plots histogram and scatter (vs price) side by side
+def plot_hist_scat_sns(df, target='index'):
+ """Plots seaborne distplots and regplots for columns im datamframe vs target.
+
+ Parameters:
+ df (DataFrame): DataFrame.describe() columns will be used.
+ target = name of column containing target variable.assume first coluumn.
+
+ Returns:
+ Figures for each column vs target with 2 subplots.
+ """
+ import matplotlib.ticker as mtick
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+
+ with plt.style.context(('dark_background')):
+ ### DEFINE AESTHETIC CUSTOMIZATIONS -------------------------------##
+
+
+# plt.style.use('dark_background')
+ figsize=(9,7)
+
+ # Axis Label fonts
+ fontTitle = {'fontsize': 14,
+ 'fontweight': 'bold',
+ 'fontfamily':'serif'}
+
+ fontAxis = {'fontsize': 12,
+ 'fontweight': 'medium',
+ 'fontfamily':'serif'}
+
+ fontTicks = {'fontsize': 8,
+ 'fontweight':'medium',
+ 'fontfamily':'serif'}
+
+ # Formatting dollar sign labels
+ fmtPrice = '${x:,.0f}'
+ tickPrice = mtick.StrMethodFormatter(fmtPrice)
+
+
+ ### PLOTTING ----------------------------- ------------------------ ##
+
+ # Loop through dataframe to plot
+ for column in df.describe():
+# print(f'\nCurrent column: {column}')
+
+ # Create figure with subplots for current column
+ fig, ax = plt.subplots(figsize=figsize, ncols=2, nrows=2)
+
+ ## SUBPLOT 1 --------------------------------------------------##
+ i,j = 0,0
+ ax[i,j].set_title(column.capitalize(),fontdict=fontTitle)
+
+ # Define graphing keyword dictionaries for distplot (Subplot 1)
+ hist_kws = {"linewidth": 1, "alpha": 1, "color": 'blue','edgecolor':'w'}
+ kde_kws = {"color": "white", "linewidth": 1, "label": "KDE"}
+
+ # Plot distplot on ax[i,j] using hist_kws and kde_kws
+ sns.distplot(df[column], norm_hist=True, kde=True,
+ hist_kws = hist_kws, kde_kws = kde_kws,
+ label=column+' histogram', ax=ax[i,j])
+
+
+ # Set x axis label
+ ax[i,j].set_xlabel(column.title(),fontdict=fontAxis)
+
+ # Get x-ticks, rotate labels, and return
+ xticklab1 = ax[i,j].get_xticklabels(which = 'both')
+ ax[i,j].set_xticklabels(labels=xticklab1, fontdict=fontTicks, rotation=0)
+ ax[i,j].xaxis.set_major_formatter(mtick.ScalarFormatter())
+
+
+ # Set y-label
+ ax[i,j].set_ylabel('Density',fontdict=fontAxis)
+ yticklab1=ax[i,j].get_yticklabels(which='both')
+ ax[i,j].set_yticklabels(labels=yticklab1,fontdict=fontTicks)
+ ax[i,j].yaxis.set_major_formatter(mtick.ScalarFormatter())
+
+
+ # Set y-grid
+ ax[i, j].set_axisbelow(True)
+ ax[i, j].grid(axis='y',ls='--')
+
+
+
+
+ ## SUBPLOT 2-------------------------------------------------- ##
+ i,j = 0,1
+ ax[i,j].set_title(column.capitalize(),fontdict=fontTitle)
+
+ # Define the kwd dictionaries for scatter and regression line (subplot 2)
+ line_kws={"color":"white","alpha":0.5,"lw":4,"ls":":"}
+ scatter_kws={'s': 2, 'alpha': 0.5,'marker':'.','color':'blue'}
+
+ # Plot regplot on ax[i,j] using line_kws and scatter_kws
+ sns.regplot(df[column], df[target],
+ line_kws = line_kws,
+ scatter_kws = scatter_kws,
+ ax=ax[i,j])
+
+ # Set x-axis label
+ ax[i,j].set_xlabel(column.title(),fontdict=fontAxis)
+
+ # Get x ticks, rotate labels, and return
+ xticklab2=ax[i,j].get_xticklabels(which='both')
+ ax[i,j].set_xticklabels(labels=xticklab2,fontdict=fontTicks, rotation=0)
+ ax[i,j].xaxis.set_major_formatter(mtick.ScalarFormatter())
+
+ # Set y-axis label
+ ax[i,j].set_ylabel(target,fontdict=fontAxis)
+
+ # Get, set, and format y-axis Price labels
+ yticklab = ax[i,j].get_yticklabels()
+ ax[i,j].set_yticklabels(yticklab,fontdict=fontTicks)
+ ax[i,j].yaxis.set_major_formatter(mtick.ScalarFormatter())
+
+ # ax[i,j].get_yaxis().set_major_formatter(tickPrice)
+
+ # Set y-grid
+ ax[i, j].set_axisbelow(True)
+ ax[i, j].grid(axis='y',ls='--')
+
+ ## ---------- Final layout adjustments ----------- ##
+ # Deleted unused subplots
+ fig.delaxes(ax[1,1])
+ fig.delaxes(ax[1,0])
+
+ # Optimizing spatial layout
+ fig.tight_layout()
+ figtitle=column+'_dist_regr_plots.png'
+# plt.savefig(figtitle)
+ return
+
+# Tukey's method using IQR to eliminate
+def detect_outliers(df, n, features):
+ """Uses Tukey's method to return outer of interquartile ranges to return indices if outliers in a dataframe.
+ Parameters:
+ df (DataFrame): DataFrane containing columns of features
+ n: default is 0, multiple outlier cutoff
+
+ Returns:
+ Index of outliers for .loc
+
+ Examples:
+ Outliers_to_drop = detect_outliers(data,2,["col1","col2"]) Returning value
+ df.loc[Outliers_to_drop] # Show the outliers rows
+ data= data.drop(Outliers_to_drop, axis = 0).reset_index(drop=True)
+"""
+
+# Drop outliers
+
+ outlier_indices = []
+ # iterate over features(columns)
+ for col in features:
+
+ # 1st quartile (25%)
+ Q1 = np.percentile(df[col], 25)
+ # 3rd quartile (75%)
+ Q3 = np.percentile(df[col],75)
+
+ # Interquartile range (IQR)
+ IQR = Q3 - Q1
+ # outlier step
+ outlier_step = 1.5 * IQR
+
+ # Determine a list of indices of outliers for feature col
+ outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step )].index
+
+ # append the found outlier indices for col to the list of outlier indices
+ outlier_indices.extend(outlier_list_col)
+
+ # select observations containing more than 2 outliers
+ outlier_indices = Counter(outlier_indices)
+ multiple_outliers = list( k for k, v in outlier_indices.items() if v > n )
+ return multiple_outliers
+
+
+# describe_outliers -- calls detect_outliers
+def describe_outliers(df):
+ """ Returns a new_df of outliers, and % outliers each col using detect_outliers.
+ """
+ out_count = 0
+ new_df = pd.DataFrame(columns=['total_outliers', 'percent_total'])
+ for col in df.columns:
+ outies = detect_outliers(df[col])
+ out_count += len(outies)
+ new_df.loc[col] = [len(outies), round((len(outies)/len(df.index))*100, 2)]
+ new_df.loc['grand_total'] = [sum(new_df['total_outliers']), sum(new_df['percent_total'])]
+ return new_df
+
+
+#### Cohen's d
+def Cohen_d(group1, group2):
+ '''Compute Cohen's d.
+ # group1: Series or NumPy array
+ # group2: Series or NumPy array
+ # returns a floating point number
+ '''
+ diff = group1.mean() - group2.mean()
+
+ n1, n2 = len(group1), len(group2)
+ var1 = group1.var()
+ var2 = group2.var()
+
+ # Calculate the pooled threshold as shown earlier
+ pooled_var = (n1 * var1 + n2 * var2) / (n1 + n2)
+
+ # Calculate Cohen's d statistic
+ d = diff / np.sqrt(pooled_var)
+
+ return d
+
+
+def plot_pdfs(cohen_d=2):
+ """Plot PDFs for distributions that differ by some number of stds.
+
+ cohen_d: number of standard deviations between the means
+ """
+ group1 = scipy.stats.norm(0, 1)
+ group2 = scipy.stats.norm(cohen_d, 1)
+ xs, ys = evaluate_PDF(group1)
+ pyplot.fill_between(xs, ys, label='Group1', color='#ff2289', alpha=0.7)
+
+ xs, ys = evaluate_PDF(group2)
+ pyplot.fill_between(xs, ys, label='Group2', color='#376cb0', alpha=0.7)
+
+ o, s = overlap_superiority(group1, group2)
+ print('overlap', o)
+ print('superiority', s)
+
+def list2df(list):#, sort_values='index'):
+ """ Take in a list where row[0] = column_names and outputs a dataframe.
+
+ Keyword arguments:
+ set_index -- df.set_index(set_index)
+ sortby -- df.sorted()
+ """
+
+ df_list = pd.DataFrame(list[1:],columns=list[0])
+# df_list = df_list[1:]
+
+ return df_list
+
+def df_drop_regex(DF, regex_list):
+ '''Use a list of regex to remove columns names. Returns new df.
+
+ Parameters:
+ DF -- input dataframe to remove columns from.
+ regex_list -- list of string patterns or regexp to remove.
+
+ Returns:
+ df_cut -- input df without the dropped columns.
+ '''
+ df_cut = DF.copy()
+
+ for r in regex_list:
+
+ df_cut = df_cut[df_cut.columns.drop(list(df_cut.filter(regex=r)))]
+ print(f'Removed {r}\n')
+
+ return df_cut
+
+
+
+####### MIKE's PLOTTING
+# plotting order totals per month in violin plots
+
+def make_violinplot(x,y, title=None, hue=None, ticklabels=None):
+
+ '''Plots a violin plot with horizontal mean line, inner stick lines'''
+
+ plt.style.use('dark_background')
+ fig,ax =plt.subplots(figsize=(12,10))
+
+
+ sns.violinplot(x, y,cut=2,split=True, scale='count', scale_hue=True,
+ saturation=.5, alpha=.9,bw=.25, palette='Dark2',inner='stick', hue=hue).set_title(title)
+
+ ax.axhline(y.mean(),label='total mean', ls=':', alpha=.5, color='xkcd:yellow')
+ ax.set_xticklabels(ticklabels)
+
+ plt.legend()
+ plt.show()
+ x= df_year_orders['month']
+ y= df_year_orders['order_total']
+ title = 'Order totals per month with or without discounts'
+ hue=df_year_orders['Discount']>0
+
+
+### Example usage
+# #First, declare variables to be plotted
+# x = df_year_orders['month']
+# y = df_year_orders['order_total']
+# ticks = [v for v in month_dict.values()]
+# title = 'Order totals per month with or without discounts'
+# hue = df_year_orders['Discount']>0
+
+### Then call function
+# make_violinplot(x,y,title,hue, ticks),
+
+
+###########
+def plot_wide_kde_thin_bar(series1,sname1, series2, sname2):
+ '''Plot series1 and series 2 on wide kde plot with small mean+sem bar plot.'''
+
+ ## ADDING add_gridspec usage
+ import pandas as pd
+ import numpy as np
+ from scipy.stats import sem
+
+ import matplotlib.pyplot as plt
+ import matplotlib as mpl
+ import matplotlib.ticker as ticker
+
+ import seaborn as sns
+
+ from matplotlib import rcParams
+ from matplotlib import rc
+ rcParams['font.family'] = 'serif'
+
+
+
+
+ # Plot distributions of discounted vs full price groups
+ plt.style.use('default')
+ # with plt.style.context(('tableau-colorblind10')):
+ with plt.style.context(('seaborn-notebook')):
+
+
+
+ ## ----------- DEFINE AESTHETIC CUSTOMIZATIONS ----------- ##
+ # Axis Label fonts
+ fontSuptitle ={'fontsize': 22,
+ 'fontweight': 'bold',
+ 'fontfamily':'serif'}
+
+ fontTitle = {'fontsize': 10,
+ 'fontweight': 'medium',
+ 'fontfamily':'serif'}
+
+ fontAxis = {'fontsize': 10,
+ 'fontweight': 'medium',
+ 'fontfamily':'serif'}
+
+ fontTicks = {'fontsize': 8,
+ 'fontweight':'medium',
+ 'fontfamily':'serif'}
+
+
+ ## --------- CREATE FIG BASED ON GRIDSPEC --------- ##
+
+ plt.suptitle('Quantity of Units Sold', fontdict = fontSuptitle)
+
+ # Create fig object and declare figsize
+ fig = plt.figure(constrained_layout=True, figsize=(8,3))
+
+
+ # Define gridspec to create grid coordinates
+ gs = fig.add_gridspec(nrows=1,ncols=10)
+
+ # Assign grid space to ax with add_subplot
+ ax0 = fig.add_subplot(gs[0,0:7])
+ ax1 = fig.add_subplot(gs[0,7:10])
+
+ #Combine into 1 list
+ ax = [ax0,ax1]
+
+ ### ------------------ SUBPLOT 1 ------------------ ###
+
+ ## --------- Defining series1 and 2 for subplot 1------- ##
+ ax[0].set_title('Histogram + KDE',fontdict=fontTitle)
+
+ # Group 1: data, label, hist_kws and kde_kws
+ plotS1 = {'data': series1, 'label': sname1.title(),
+
+ 'hist_kws' :
+ {'edgecolor': 'black', 'color':'darkgray','alpha': 0.8, 'lw':0.5},
+
+ 'kde_kws':
+ {'color':'gray', 'linestyle': '--', 'linewidth':2,
+ 'label':'kde'}}
+
+ # Group 2: data, label, hist_kws and kde_kws
+ plotS2 = {'data': series2,
+ 'label': sname2.title(),
+
+ 'hist_kws' :
+ {'edgecolor': 'black','color':'green','alpha':0.8 ,'lw':0.5},
+
+
+ 'kde_kws':
+ {'color':'darkgreen','linestyle':':','linewidth':3,'label':'kde'}}
+
+ # plot group 1
+ sns.distplot(plotS1['data'], label=plotS1['label'],
+
+ hist_kws = plotS1['hist_kws'], kde_kws = plotS1['kde_kws'],
+
+ ax=ax[0])
+
+
+ # plot group 2
+ sns.distplot(plotS2['data'], label=plotS2['label'],
+
+ hist_kws=plotS2['hist_kws'], kde_kws = plotS2['kde_kws'],
+
+ ax=ax[0])
+
+
+ ax[0].set_xlabel(series1.name, fontdict=fontAxis)
+ ax[0].set_ylabel('Kernel Density Estimation',fontdict=fontAxis)
+
+ ax[0].tick_params(axis='both',labelsize=fontTicks['fontsize'])
+ ax[0].legend()
+
+
+ ### ------------------ SUBPLOT 2 ------------------ ###
+
+ # Import scipy for error bars
+ from scipy.stats import sem
+
+ # Declare x y group labels(x) and bar heights(y)
+ x = [plotS1['label'], plotS2['label']]
+ y = [np.mean(plotS1['data']), np.mean(plotS2['data'])]
+
+ yerr = [sem(plotS1['data']), sem(plotS2['data'])]
+ err_kws = {'ecolor':'black','capsize':5,'capthick':1,'elinewidth':1}
+
+ # Create the bar plot
+ ax[1].bar(x,y,align='center', edgecolor='black', yerr=yerr,error_kw=err_kws,width=0.6)
+
+
+ # Customize subplot 2
+ ax[1].set_title('Average Quantities Sold',fontdict=fontTitle)
+ ax[1].set_ylabel('Mean +/- SEM ',fontdict=fontAxis)
+ ax[1].set_xlabel('')
+
+ ax[1].tick_params(axis=y,labelsize=fontTicks['fontsize'])
+ ax[1].tick_params(axis=x,labelsize=fontTicks['fontsize'])
+
+ ax1=ax[1]
+ test = ax1.get_xticklabels()
+ labels = [x.get_text() for x in test]
+ ax1.set_xticklabels([plotS1['label'],plotS2['label']], rotation=45,ha='center')
+
+# xlab = [x.get_text() for x in xlablist]
+# ax[1].set_xticklabels(xlab,rotation=45)
+
+# fig.savefig('H1_EDA_using_gridspec.png')
+# plt.tight_layout()
+ # print(f')
+ plt.show()
+
+ return fig,ax
+
+```
+
+
+```python
+
+```
+
+
+
+
+%package help
+Summary: Development documents and examples for JMI-MVM
+Provides: python3-JMI-MVM-doc
+%description help
+# JMI_MVM
+
+- A collection of tools created for botcmap.
+- More information to be added later.
+
+
+<h1>Table of Contents<span class="tocSkip"></span></h1>
+<div class="toc"><ul class="toc-item"></ul></div>
+
+
+```python
+name = "JMI_MVM"
+help_ = " Recommended Functions to try: \n calc_roc_auc & tune_params\n plot_hist_scat_sns & multiplot\n list2df & df_drop_regex\n plot_wide_kde_thin_bar & make_violinplot\n"
+#functions.py
+
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib as mpl
+import seaborn as sns
+
+
+def calc_roc_auc(X_test,y_test,dtc,verbose=False):
+ """Tests the results of an already-fit classifer.
+ Takes X_test, y_test, classifer, verbose (True" print result)
+ Returns the AUC for the roc_curve as a %"""
+ y_pred = dtc.predict(X_test)
+
+ FP_rate, TP_rate, thresh = roc_curve(y_test,y_pred)
+ roc_auc = auc(FP_rate,TP_rate)
+ roc_auc_perc = round(roc_auc*100,3)
+ # Your code here
+ if verbose:
+ print(f"roc_curve's auc = {roc_auc_perc}%")
+ return roc_auc_perc
+
+def tune_params(param_name, param_values):
+ """Takes in param_name to tune with param_values, plots train vs test AUC's.
+ Returns df_results and df_style with color coded results"""
+ res_list = [[param_name,'train_roc_auc','test_roc_auc']]
+
+ # Loop through all values in param_values
+ for value in param_values:
+ # Create Model, set params
+ dtc_temp = DecisionTreeClassifier(criterion='entropy')
+ params={param_name:value}
+ dtc_temp.set_params(**params)
+
+ # Fit model
+ dtc_temp.fit(X_train, y_train)
+
+ # Get roc_auc for training data
+ train_roc_auc = calc_roc_auc(X_train,y_train,dtc_temp)
+ # Get roc_auc for test data
+ test_res_roc_auc = calc_roc_auc(X_test,y_test,dtc_temp)
+ # Append value and results to res_list
+ res_list.append([value,train_roc_auc,test_res_roc_auc])
+
+ # Turn results into df_results (basically same as using list2df)
+ df_results = pd.DataFrame(res_list[1:],columns=res_list[0])
+ df_results.set_index(param_name,inplace=True)
+
+ # Plot df_results
+ df_results.plot()
+
+ # Color-coded dataframe s
+ import seaborn as sns
+ cm = sns.light_palette("green", as_cmap=True)
+ df_syle = df_results.style.background_gradient(cmap=cm)#,low=results.min(),high=results.max())
+
+ return df_results, df_syle
+
+
+# MULTIPLOT
+from string import ascii_letters
+import numpy as np
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+
+def multiplot(df):
+ """Plots results from df.corr() in a correlation heat map for multicollinearity.
+ Returns fig, ax objects"""
+ sns.set(style="white")
+
+ # Compute the correlation matrix
+ corr = df.corr()
+
+ # Generate a mask for the upper triangle
+ mask = np.zeros_like(corr, dtype=np.bool)
+ mask[np.triu_indices_from(mask)] = True
+
+ # Set up the matplotlib figure
+ f, ax = plt.subplots(figsize=(16, 16))
+
+ # Generate a custom diverging colormap
+ cmap = sns.diverging_palette(220, 10, as_cmap=True)
+
+ # Draw the heatmap with the mask and correct aspect ratio
+ sns.heatmap(corr, mask=mask, annot=True, cmap=cmap, center=0,
+
+ square=True, linewidths=.5, cbar_kws={"shrink": .5})
+ return f, ax
+
+
+
+# Plots histogram and scatter (vs price) side by side
+# Plots histogram and scatter (vs price) side by side
+def plot_hist_scat_sns(df, target='index'):
+ """Plots seaborne distplots and regplots for columns im datamframe vs target.
+
+ Parameters:
+ df (DataFrame): DataFrame.describe() columns will be used.
+ target = name of column containing target variable.assume first coluumn.
+
+ Returns:
+ Figures for each column vs target with 2 subplots.
+ """
+ import matplotlib.ticker as mtick
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+
+ with plt.style.context(('dark_background')):
+ ### DEFINE AESTHETIC CUSTOMIZATIONS -------------------------------##
+
+
+# plt.style.use('dark_background')
+ figsize=(9,7)
+
+ # Axis Label fonts
+ fontTitle = {'fontsize': 14,
+ 'fontweight': 'bold',
+ 'fontfamily':'serif'}
+
+ fontAxis = {'fontsize': 12,
+ 'fontweight': 'medium',
+ 'fontfamily':'serif'}
+
+ fontTicks = {'fontsize': 8,
+ 'fontweight':'medium',
+ 'fontfamily':'serif'}
+
+ # Formatting dollar sign labels
+ fmtPrice = '${x:,.0f}'
+ tickPrice = mtick.StrMethodFormatter(fmtPrice)
+
+
+ ### PLOTTING ----------------------------- ------------------------ ##
+
+ # Loop through dataframe to plot
+ for column in df.describe():
+# print(f'\nCurrent column: {column}')
+
+ # Create figure with subplots for current column
+ fig, ax = plt.subplots(figsize=figsize, ncols=2, nrows=2)
+
+ ## SUBPLOT 1 --------------------------------------------------##
+ i,j = 0,0
+ ax[i,j].set_title(column.capitalize(),fontdict=fontTitle)
+
+ # Define graphing keyword dictionaries for distplot (Subplot 1)
+ hist_kws = {"linewidth": 1, "alpha": 1, "color": 'blue','edgecolor':'w'}
+ kde_kws = {"color": "white", "linewidth": 1, "label": "KDE"}
+
+ # Plot distplot on ax[i,j] using hist_kws and kde_kws
+ sns.distplot(df[column], norm_hist=True, kde=True,
+ hist_kws = hist_kws, kde_kws = kde_kws,
+ label=column+' histogram', ax=ax[i,j])
+
+
+ # Set x axis label
+ ax[i,j].set_xlabel(column.title(),fontdict=fontAxis)
+
+ # Get x-ticks, rotate labels, and return
+ xticklab1 = ax[i,j].get_xticklabels(which = 'both')
+ ax[i,j].set_xticklabels(labels=xticklab1, fontdict=fontTicks, rotation=0)
+ ax[i,j].xaxis.set_major_formatter(mtick.ScalarFormatter())
+
+
+ # Set y-label
+ ax[i,j].set_ylabel('Density',fontdict=fontAxis)
+ yticklab1=ax[i,j].get_yticklabels(which='both')
+ ax[i,j].set_yticklabels(labels=yticklab1,fontdict=fontTicks)
+ ax[i,j].yaxis.set_major_formatter(mtick.ScalarFormatter())
+
+
+ # Set y-grid
+ ax[i, j].set_axisbelow(True)
+ ax[i, j].grid(axis='y',ls='--')
+
+
+
+
+ ## SUBPLOT 2-------------------------------------------------- ##
+ i,j = 0,1
+ ax[i,j].set_title(column.capitalize(),fontdict=fontTitle)
+
+ # Define the kwd dictionaries for scatter and regression line (subplot 2)
+ line_kws={"color":"white","alpha":0.5,"lw":4,"ls":":"}
+ scatter_kws={'s': 2, 'alpha': 0.5,'marker':'.','color':'blue'}
+
+ # Plot regplot on ax[i,j] using line_kws and scatter_kws
+ sns.regplot(df[column], df[target],
+ line_kws = line_kws,
+ scatter_kws = scatter_kws,
+ ax=ax[i,j])
+
+ # Set x-axis label
+ ax[i,j].set_xlabel(column.title(),fontdict=fontAxis)
+
+ # Get x ticks, rotate labels, and return
+ xticklab2=ax[i,j].get_xticklabels(which='both')
+ ax[i,j].set_xticklabels(labels=xticklab2,fontdict=fontTicks, rotation=0)
+ ax[i,j].xaxis.set_major_formatter(mtick.ScalarFormatter())
+
+ # Set y-axis label
+ ax[i,j].set_ylabel(target,fontdict=fontAxis)
+
+ # Get, set, and format y-axis Price labels
+ yticklab = ax[i,j].get_yticklabels()
+ ax[i,j].set_yticklabels(yticklab,fontdict=fontTicks)
+ ax[i,j].yaxis.set_major_formatter(mtick.ScalarFormatter())
+
+ # ax[i,j].get_yaxis().set_major_formatter(tickPrice)
+
+ # Set y-grid
+ ax[i, j].set_axisbelow(True)
+ ax[i, j].grid(axis='y',ls='--')
+
+ ## ---------- Final layout adjustments ----------- ##
+ # Deleted unused subplots
+ fig.delaxes(ax[1,1])
+ fig.delaxes(ax[1,0])
+
+ # Optimizing spatial layout
+ fig.tight_layout()
+ figtitle=column+'_dist_regr_plots.png'
+# plt.savefig(figtitle)
+ return
+
+# Tukey's method using IQR to eliminate
+def detect_outliers(df, n, features):
+ """Uses Tukey's method to return outer of interquartile ranges to return indices if outliers in a dataframe.
+ Parameters:
+ df (DataFrame): DataFrane containing columns of features
+ n: default is 0, multiple outlier cutoff
+
+ Returns:
+ Index of outliers for .loc
+
+ Examples:
+ Outliers_to_drop = detect_outliers(data,2,["col1","col2"]) Returning value
+ df.loc[Outliers_to_drop] # Show the outliers rows
+ data= data.drop(Outliers_to_drop, axis = 0).reset_index(drop=True)
+"""
+
+# Drop outliers
+
+ outlier_indices = []
+ # iterate over features(columns)
+ for col in features:
+
+ # 1st quartile (25%)
+ Q1 = np.percentile(df[col], 25)
+ # 3rd quartile (75%)
+ Q3 = np.percentile(df[col],75)
+
+ # Interquartile range (IQR)
+ IQR = Q3 - Q1
+ # outlier step
+ outlier_step = 1.5 * IQR
+
+ # Determine a list of indices of outliers for feature col
+ outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step )].index
+
+ # append the found outlier indices for col to the list of outlier indices
+ outlier_indices.extend(outlier_list_col)
+
+ # select observations containing more than 2 outliers
+ outlier_indices = Counter(outlier_indices)
+ multiple_outliers = list( k for k, v in outlier_indices.items() if v > n )
+ return multiple_outliers
+
+
+# describe_outliers -- calls detect_outliers
+def describe_outliers(df):
+ """ Returns a new_df of outliers, and % outliers each col using detect_outliers.
+ """
+ out_count = 0
+ new_df = pd.DataFrame(columns=['total_outliers', 'percent_total'])
+ for col in df.columns:
+ outies = detect_outliers(df[col])
+ out_count += len(outies)
+ new_df.loc[col] = [len(outies), round((len(outies)/len(df.index))*100, 2)]
+ new_df.loc['grand_total'] = [sum(new_df['total_outliers']), sum(new_df['percent_total'])]
+ return new_df
+
+
+#### Cohen's d
+def Cohen_d(group1, group2):
+ '''Compute Cohen's d.
+ # group1: Series or NumPy array
+ # group2: Series or NumPy array
+ # returns a floating point number
+ '''
+ diff = group1.mean() - group2.mean()
+
+ n1, n2 = len(group1), len(group2)
+ var1 = group1.var()
+ var2 = group2.var()
+
+ # Calculate the pooled threshold as shown earlier
+ pooled_var = (n1 * var1 + n2 * var2) / (n1 + n2)
+
+ # Calculate Cohen's d statistic
+ d = diff / np.sqrt(pooled_var)
+
+ return d
+
+
+def plot_pdfs(cohen_d=2):
+ """Plot PDFs for distributions that differ by some number of stds.
+
+ cohen_d: number of standard deviations between the means
+ """
+ group1 = scipy.stats.norm(0, 1)
+ group2 = scipy.stats.norm(cohen_d, 1)
+ xs, ys = evaluate_PDF(group1)
+ pyplot.fill_between(xs, ys, label='Group1', color='#ff2289', alpha=0.7)
+
+ xs, ys = evaluate_PDF(group2)
+ pyplot.fill_between(xs, ys, label='Group2', color='#376cb0', alpha=0.7)
+
+ o, s = overlap_superiority(group1, group2)
+ print('overlap', o)
+ print('superiority', s)
+
+def list2df(list):#, sort_values='index'):
+ """ Take in a list where row[0] = column_names and outputs a dataframe.
+
+ Keyword arguments:
+ set_index -- df.set_index(set_index)
+ sortby -- df.sorted()
+ """
+
+ df_list = pd.DataFrame(list[1:],columns=list[0])
+# df_list = df_list[1:]
+
+ return df_list
+
+def df_drop_regex(DF, regex_list):
+ '''Use a list of regex to remove columns names. Returns new df.
+
+ Parameters:
+ DF -- input dataframe to remove columns from.
+ regex_list -- list of string patterns or regexp to remove.
+
+ Returns:
+ df_cut -- input df without the dropped columns.
+ '''
+ df_cut = DF.copy()
+
+ for r in regex_list:
+
+ df_cut = df_cut[df_cut.columns.drop(list(df_cut.filter(regex=r)))]
+ print(f'Removed {r}\n')
+
+ return df_cut
+
+
+
+####### MIKE's PLOTTING
+# plotting order totals per month in violin plots
+
+def make_violinplot(x,y, title=None, hue=None, ticklabels=None):
+
+ '''Plots a violin plot with horizontal mean line, inner stick lines'''
+
+ plt.style.use('dark_background')
+ fig,ax =plt.subplots(figsize=(12,10))
+
+
+ sns.violinplot(x, y,cut=2,split=True, scale='count', scale_hue=True,
+ saturation=.5, alpha=.9,bw=.25, palette='Dark2',inner='stick', hue=hue).set_title(title)
+
+ ax.axhline(y.mean(),label='total mean', ls=':', alpha=.5, color='xkcd:yellow')
+ ax.set_xticklabels(ticklabels)
+
+ plt.legend()
+ plt.show()
+ x= df_year_orders['month']
+ y= df_year_orders['order_total']
+ title = 'Order totals per month with or without discounts'
+ hue=df_year_orders['Discount']>0
+
+
+### Example usage
+# #First, declare variables to be plotted
+# x = df_year_orders['month']
+# y = df_year_orders['order_total']
+# ticks = [v for v in month_dict.values()]
+# title = 'Order totals per month with or without discounts'
+# hue = df_year_orders['Discount']>0
+
+### Then call function
+# make_violinplot(x,y,title,hue, ticks),
+
+
+###########
+def plot_wide_kde_thin_bar(series1,sname1, series2, sname2):
+ '''Plot series1 and series 2 on wide kde plot with small mean+sem bar plot.'''
+
+ ## ADDING add_gridspec usage
+ import pandas as pd
+ import numpy as np
+ from scipy.stats import sem
+
+ import matplotlib.pyplot as plt
+ import matplotlib as mpl
+ import matplotlib.ticker as ticker
+
+ import seaborn as sns
+
+ from matplotlib import rcParams
+ from matplotlib import rc
+ rcParams['font.family'] = 'serif'
+
+
+
+
+ # Plot distributions of discounted vs full price groups
+ plt.style.use('default')
+ # with plt.style.context(('tableau-colorblind10')):
+ with plt.style.context(('seaborn-notebook')):
+
+
+
+ ## ----------- DEFINE AESTHETIC CUSTOMIZATIONS ----------- ##
+ # Axis Label fonts
+ fontSuptitle ={'fontsize': 22,
+ 'fontweight': 'bold',
+ 'fontfamily':'serif'}
+
+ fontTitle = {'fontsize': 10,
+ 'fontweight': 'medium',
+ 'fontfamily':'serif'}
+
+ fontAxis = {'fontsize': 10,
+ 'fontweight': 'medium',
+ 'fontfamily':'serif'}
+
+ fontTicks = {'fontsize': 8,
+ 'fontweight':'medium',
+ 'fontfamily':'serif'}
+
+
+ ## --------- CREATE FIG BASED ON GRIDSPEC --------- ##
+
+ plt.suptitle('Quantity of Units Sold', fontdict = fontSuptitle)
+
+ # Create fig object and declare figsize
+ fig = plt.figure(constrained_layout=True, figsize=(8,3))
+
+
+ # Define gridspec to create grid coordinates
+ gs = fig.add_gridspec(nrows=1,ncols=10)
+
+ # Assign grid space to ax with add_subplot
+ ax0 = fig.add_subplot(gs[0,0:7])
+ ax1 = fig.add_subplot(gs[0,7:10])
+
+ #Combine into 1 list
+ ax = [ax0,ax1]
+
+ ### ------------------ SUBPLOT 1 ------------------ ###
+
+ ## --------- Defining series1 and 2 for subplot 1------- ##
+ ax[0].set_title('Histogram + KDE',fontdict=fontTitle)
+
+ # Group 1: data, label, hist_kws and kde_kws
+ plotS1 = {'data': series1, 'label': sname1.title(),
+
+ 'hist_kws' :
+ {'edgecolor': 'black', 'color':'darkgray','alpha': 0.8, 'lw':0.5},
+
+ 'kde_kws':
+ {'color':'gray', 'linestyle': '--', 'linewidth':2,
+ 'label':'kde'}}
+
+ # Group 2: data, label, hist_kws and kde_kws
+ plotS2 = {'data': series2,
+ 'label': sname2.title(),
+
+ 'hist_kws' :
+ {'edgecolor': 'black','color':'green','alpha':0.8 ,'lw':0.5},
+
+
+ 'kde_kws':
+ {'color':'darkgreen','linestyle':':','linewidth':3,'label':'kde'}}
+
+ # plot group 1
+ sns.distplot(plotS1['data'], label=plotS1['label'],
+
+ hist_kws = plotS1['hist_kws'], kde_kws = plotS1['kde_kws'],
+
+ ax=ax[0])
+
+
+ # plot group 2
+ sns.distplot(plotS2['data'], label=plotS2['label'],
+
+ hist_kws=plotS2['hist_kws'], kde_kws = plotS2['kde_kws'],
+
+ ax=ax[0])
+
+
+ ax[0].set_xlabel(series1.name, fontdict=fontAxis)
+ ax[0].set_ylabel('Kernel Density Estimation',fontdict=fontAxis)
+
+ ax[0].tick_params(axis='both',labelsize=fontTicks['fontsize'])
+ ax[0].legend()
+
+
+ ### ------------------ SUBPLOT 2 ------------------ ###
+
+ # Import scipy for error bars
+ from scipy.stats import sem
+
+ # Declare x y group labels(x) and bar heights(y)
+ x = [plotS1['label'], plotS2['label']]
+ y = [np.mean(plotS1['data']), np.mean(plotS2['data'])]
+
+ yerr = [sem(plotS1['data']), sem(plotS2['data'])]
+ err_kws = {'ecolor':'black','capsize':5,'capthick':1,'elinewidth':1}
+
+ # Create the bar plot
+ ax[1].bar(x,y,align='center', edgecolor='black', yerr=yerr,error_kw=err_kws,width=0.6)
+
+
+ # Customize subplot 2
+ ax[1].set_title('Average Quantities Sold',fontdict=fontTitle)
+ ax[1].set_ylabel('Mean +/- SEM ',fontdict=fontAxis)
+ ax[1].set_xlabel('')
+
+ ax[1].tick_params(axis=y,labelsize=fontTicks['fontsize'])
+ ax[1].tick_params(axis=x,labelsize=fontTicks['fontsize'])
+
+ ax1=ax[1]
+ test = ax1.get_xticklabels()
+ labels = [x.get_text() for x in test]
+ ax1.set_xticklabels([plotS1['label'],plotS2['label']], rotation=45,ha='center')
+
+# xlab = [x.get_text() for x in xlablist]
+# ax[1].set_xticklabels(xlab,rotation=45)
+
+# fig.savefig('H1_EDA_using_gridspec.png')
+# plt.tight_layout()
+ # print(f')
+ plt.show()
+
+ return fig,ax
+
+```
+
+
+```python
+
+```
+
+
+
+
+%prep
+%autosetup -n JMI-MVM-0.3.4
+
+%build
+%py3_build
+
+%install
+%py3_install
+install -d -m755 %{buildroot}/%{_pkgdocdir}
+if [ -d doc ]; then cp -arf doc %{buildroot}/%{_pkgdocdir}; fi
+if [ -d docs ]; then cp -arf docs %{buildroot}/%{_pkgdocdir}; fi
+if [ -d example ]; then cp -arf example %{buildroot}/%{_pkgdocdir}; fi
+if [ -d examples ]; then cp -arf examples %{buildroot}/%{_pkgdocdir}; fi
+pushd %{buildroot}
+if [ -d usr/lib ]; then
+ find usr/lib -type f -printf "/%h/%f\n" >> filelist.lst
+fi
+if [ -d usr/lib64 ]; then
+ find usr/lib64 -type f -printf "/%h/%f\n" >> filelist.lst
+fi
+if [ -d usr/bin ]; then
+ find usr/bin -type f -printf "/%h/%f\n" >> filelist.lst
+fi
+if [ -d usr/sbin ]; then
+ find usr/sbin -type f -printf "/%h/%f\n" >> filelist.lst
+fi
+touch doclist.lst
+if [ -d usr/share/man ]; then
+ find usr/share/man -type f -printf "/%h/%f.gz\n" >> doclist.lst
+fi
+popd
+mv %{buildroot}/filelist.lst .
+mv %{buildroot}/doclist.lst .
+
+%files -n python3-JMI-MVM -f filelist.lst
+%dir %{python3_sitelib}/*
+
+%files help -f doclist.lst
+%{_docdir}/*
+
+%changelog
+* Wed May 31 2023 Python_Bot <Python_Bot@openeuler.org> - 0.3.4-1
+- Package Spec generated
diff --git a/sources b/sources
new file mode 100644
index 0000000..9e6f7f1
--- /dev/null
+++ b/sources
@@ -0,0 +1 @@
+add6261816be7d1396c08729e51554ca JMI_MVM-0.3.4.tar.gz