automatic import of python-jmi-mvm

author: CoprDistGit <infra@openeuler.org> 2023-05-31 03:14:01 +0000
committer: CoprDistGit <infra@openeuler.org> 2023-05-31 03:14:01 +0000
commit: 95e14620f51ced7d4d299a68b214d6a01b183f92 (patch)
tree: a6c1c0e23853f88aa12e4b9f73e335c3d3bd9f79
parent: 3acf606755cc639eec50667e570f263520529812 (diff)
3 files changed, 1795 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
index e69de29..fdc752e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1 @@
+/JMI_MVM-0.3.4.tar.gz
diff --git a/python-jmi-mvm.spec b/python-jmi-mvm.spec
new file mode 100644
index 0000000..da60e6d
--- /dev/null
+++ b/python-jmi-mvm.spec
@@ -0,0 +1,1793 @@
+%global _empty_manifest_terminate_build 0
+Name:		python-JMI-MVM
+Version:	0.3.4
+Release:	1
+Summary:	A collection of our functions and classes from bootcamp.
+License:	MIT License
+URL:		https://github.com/jirvingphd/JMI_MVM
+Source0:	https://mirrors.nju.edu.cn/pypi/web/packages/2c/81/93d0dcde8641c7785f13d50de7ad1117ebf74856ad41f2790e6b31e80883/JMI_MVM-0.3.4.tar.gz
+BuildArch:	noarch
+
+Requires:	python3-numpy
+Requires:	python3-pandas
+Requires:	python3-seaborn
+Requires:	python3-matplotlib
+Requires:	python3-sklearn
+Requires:	python3-pydotplus
+Requires:	python3-scipy
+Requires:	python3-xgboost
+
+%description
+# JMI_MVM
+
+- A collection of tools created for botcmap. 
+- More information to be added later.
+
+
+<h1>Table of Contents<span class="tocSkip"></span></h1>
+<div class="toc"><ul class="toc-item"></ul></div>
+
+
+```python
+name = "JMI_MVM"
+help_ = " Recommended Functions to try: \n calc_roc_auc & tune_params\n plot_hist_scat_sns & multiplot\n list2df & df_drop_regex\n plot_wide_kde_thin_bar & make_violinplot\n"
+#functions.py
+
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib as mpl
+import seaborn as sns
+
+
+def calc_roc_auc(X_test,y_test,dtc,verbose=False):
+    """Tests the results of an already-fit classifer. 
+    Takes X_test, y_test, classifer, verbose (True" print result)
+    Returns the AUC for the roc_curve as a %"""
+    y_pred = dtc.predict(X_test)
+
+    FP_rate, TP_rate, thresh = roc_curve(y_test,y_pred)
+    roc_auc = auc(FP_rate,TP_rate)
+    roc_auc_perc = round(roc_auc*100,3)
+    # Your code here 
+    if verbose:
+        print(f"roc_curve's auc = {roc_auc_perc}%")
+    return roc_auc_perc
+
+def tune_params(param_name, param_values):
+    """Takes in param_name to tune with param_values, plots train vs test AUC's. 
+    Returns df_results and df_style with color coded results"""
+    res_list = [[param_name,'train_roc_auc','test_roc_auc']]
+
+    # Loop through all values in param_values
+    for value in param_values:
+        # Create Model, set params
+        dtc_temp = DecisionTreeClassifier(criterion='entropy')
+        params={param_name:value}
+        dtc_temp.set_params(**params)
+
+        # Fit model
+        dtc_temp.fit(X_train, y_train)
+
+        # Get roc_auc for training data
+        train_roc_auc = calc_roc_auc(X_train,y_train,dtc_temp)
+        # Get roc_auc for test data
+        test_res_roc_auc = calc_roc_auc(X_test,y_test,dtc_temp)
+        # Append value and results to res_list
+        res_list.append([value,train_roc_auc,test_res_roc_auc])
+
+    # Turn results into df_results (basically same as using list2df)
+    df_results = pd.DataFrame(res_list[1:],columns=res_list[0])
+    df_results.set_index(param_name,inplace=True)
+
+    # Plot df_results
+    df_results.plot()
+
+    # Color-coded dataframe s
+    import seaborn as sns
+    cm = sns.light_palette("green", as_cmap=True)
+    df_syle = df_results.style.background_gradient(cmap=cm)#,low=results.min(),high=results.max())
+
+    return df_results, df_syle
+
+
+# MULTIPLOT
+from string import ascii_letters
+import numpy as np
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+
+def multiplot(df):
+    """Plots results from df.corr() in a correlation heat map for multicollinearity.
+    Returns fig, ax objects"""
+    sns.set(style="white")
+
+    # Compute the correlation matrix
+    corr = df.corr()
+
+    # Generate a mask for the upper triangle
+    mask = np.zeros_like(corr, dtype=np.bool)
+    mask[np.triu_indices_from(mask)] = True
+
+    # Set up the matplotlib figure
+    f, ax = plt.subplots(figsize=(16, 16))
+
+    # Generate a custom diverging colormap
+    cmap = sns.diverging_palette(220, 10, as_cmap=True)
+
+    # Draw the heatmap with the mask and correct aspect ratio
+    sns.heatmap(corr, mask=mask, annot=True, cmap=cmap, center=0,
+
+    square=True, linewidths=.5, cbar_kws={"shrink": .5})
+    return f, ax
+
+
+
+# Plots histogram and scatter (vs price) side by side
+# Plots histogram and scatter (vs price) side by side
+def plot_hist_scat_sns(df, target='index'):
+    """Plots seaborne distplots and regplots for columns im datamframe vs target.
+
+    Parameters:
+    df (DataFrame): DataFrame.describe() columns will be used. 
+    target = name of column containing target variable.assume first coluumn. 
+
+    Returns:
+    Figures for each column vs target with 2 subplots.
+   """
+    import matplotlib.ticker as mtick
+    import matplotlib.pyplot as plt
+    import seaborn as sns
+
+    with plt.style.context(('dark_background')):
+        ###  DEFINE AESTHETIC CUSTOMIZATIONS  -------------------------------##
+
+
+#         plt.style.use('dark_background')
+        figsize=(9,7)
+
+        # Axis Label fonts
+        fontTitle = {'fontsize': 14,
+                   'fontweight': 'bold',
+                    'fontfamily':'serif'}
+
+        fontAxis = {'fontsize': 12,
+                   'fontweight': 'medium',
+                    'fontfamily':'serif'}
+
+        fontTicks = {'fontsize': 8,
+                   'fontweight':'medium',
+                    'fontfamily':'serif'}
+
+        # Formatting dollar sign labels
+        fmtPrice = '${x:,.0f}'
+        tickPrice = mtick.StrMethodFormatter(fmtPrice)
+
+
+        ###  PLOTTING ----------------------------- ------------------------ ##
+
+        # Loop through dataframe to plot
+        for column in df.describe():
+#             print(f'\nCurrent column: {column}')
+
+            # Create figure with subplots for current column
+            fig, ax = plt.subplots(figsize=figsize, ncols=2, nrows=2)
+
+            ##  SUBPLOT 1 --------------------------------------------------##
+            i,j = 0,0
+            ax[i,j].set_title(column.capitalize(),fontdict=fontTitle)
+
+            # Define graphing keyword dictionaries for distplot (Subplot 1)
+            hist_kws = {"linewidth": 1, "alpha": 1, "color": 'blue','edgecolor':'w'}
+            kde_kws = {"color": "white", "linewidth": 1, "label": "KDE"}
+
+            # Plot distplot on ax[i,j] using hist_kws and kde_kws
+            sns.distplot(df[column], norm_hist=True, kde=True,
+                         hist_kws = hist_kws, kde_kws = kde_kws,
+                         label=column+' histogram', ax=ax[i,j])
+
+
+            # Set x axis label
+            ax[i,j].set_xlabel(column.title(),fontdict=fontAxis)
+
+            # Get x-ticks, rotate labels, and return
+            xticklab1 = ax[i,j].get_xticklabels(which = 'both')
+            ax[i,j].set_xticklabels(labels=xticklab1, fontdict=fontTicks, rotation=0)
+            ax[i,j].xaxis.set_major_formatter(mtick.ScalarFormatter())
+
+
+            # Set y-label 
+            ax[i,j].set_ylabel('Density',fontdict=fontAxis)
+            yticklab1=ax[i,j].get_yticklabels(which='both')
+            ax[i,j].set_yticklabels(labels=yticklab1,fontdict=fontTicks)
+            ax[i,j].yaxis.set_major_formatter(mtick.ScalarFormatter())
+
+
+            # Set y-grid
+            ax[i, j].set_axisbelow(True)
+            ax[i, j].grid(axis='y',ls='--')
+
+
+
+
+            ##  SUBPLOT 2-------------------------------------------------- ##
+            i,j = 0,1
+            ax[i,j].set_title(column.capitalize(),fontdict=fontTitle)
+
+            # Define the kwd dictionaries for scatter and regression line (subplot 2)
+            line_kws={"color":"white","alpha":0.5,"lw":4,"ls":":"}
+            scatter_kws={'s': 2, 'alpha': 0.5,'marker':'.','color':'blue'}
+
+            # Plot regplot on ax[i,j] using line_kws and scatter_kws
+            sns.regplot(df[column], df[target], 
+                        line_kws = line_kws,
+                        scatter_kws = scatter_kws,
+                        ax=ax[i,j])
+
+            # Set x-axis label
+            ax[i,j].set_xlabel(column.title(),fontdict=fontAxis)
+
+             # Get x ticks, rotate labels, and return
+            xticklab2=ax[i,j].get_xticklabels(which='both')
+            ax[i,j].set_xticklabels(labels=xticklab2,fontdict=fontTicks, rotation=0)
+            ax[i,j].xaxis.set_major_formatter(mtick.ScalarFormatter())
+
+            # Set  y-axis label
+            ax[i,j].set_ylabel(target,fontdict=fontAxis)
+
+            # Get, set, and format y-axis Price labels
+            yticklab = ax[i,j].get_yticklabels()
+            ax[i,j].set_yticklabels(yticklab,fontdict=fontTicks)
+            ax[i,j].yaxis.set_major_formatter(mtick.ScalarFormatter())
+
+    #         ax[i,j].get_yaxis().set_major_formatter(tickPrice) 
+
+            # Set y-grid
+            ax[i, j].set_axisbelow(True)
+            ax[i, j].grid(axis='y',ls='--')       
+
+            ## ---------- Final layout adjustments ----------- ##
+            # Deleted unused subplots 
+            fig.delaxes(ax[1,1])
+            fig.delaxes(ax[1,0])
+
+            # Optimizing spatial layout
+            fig.tight_layout()
+            figtitle=column+'_dist_regr_plots.png'
+#             plt.savefig(figtitle)
+    return 
+
+# Tukey's method using IQR to eliminate 
+def detect_outliers(df, n, features):
+    """Uses Tukey's method to return outer of interquartile ranges to return indices if outliers in a dataframe.
+    Parameters:
+    df (DataFrame): DataFrane containing columns of features
+    n: default is 0, multiple outlier cutoff  
+
+    Returns:
+    Index of outliers for .loc
+
+    Examples:
+    Outliers_to_drop = detect_outliers(data,2,["col1","col2"]) Returning value
+    df.loc[Outliers_to_drop] # Show the outliers rows
+    data= data.drop(Outliers_to_drop, axis = 0).reset_index(drop=True)
+"""
+
+# Drop outliers    
+
+    outlier_indices = []
+    # iterate over features(columns)
+    for col in features:
+
+        # 1st quartile (25%)
+        Q1 = np.percentile(df[col], 25)
+        # 3rd quartile (75%)
+        Q3 = np.percentile(df[col],75)
+
+        # Interquartile range (IQR)
+        IQR = Q3 - Q1
+        # outlier step
+        outlier_step = 1.5 * IQR
+
+        # Determine a list of indices of outliers for feature col
+        outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step )].index
+
+        # append the found outlier indices for col to the list of outlier indices 
+        outlier_indices.extend(outlier_list_col)
+
+        # select observations containing more than 2 outliers
+        outlier_indices = Counter(outlier_indices)        
+        multiple_outliers = list( k for k, v in outlier_indices.items() if v > n )
+    return multiple_outliers 
+
+
+# describe_outliers -- calls detect_outliers
+def describe_outliers(df):
+    """ Returns a new_df of outliers, and % outliers each col using detect_outliers.
+    """
+    out_count = 0
+    new_df = pd.DataFrame(columns=['total_outliers', 'percent_total'])
+    for col in df.columns:
+        outies = detect_outliers(df[col])
+        out_count += len(outies) 
+        new_df.loc[col] = [len(outies), round((len(outies)/len(df.index))*100, 2)]
+    new_df.loc['grand_total'] = [sum(new_df['total_outliers']), sum(new_df['percent_total'])]
+    return new_df
+
+
+#### Cohen's d
+def Cohen_d(group1, group2):
+    '''Compute Cohen's d.
+    # group1: Series or NumPy array
+    # group2: Series or NumPy array
+    # returns a floating point number 
+    '''
+    diff = group1.mean() - group2.mean()
+
+    n1, n2 = len(group1), len(group2)
+    var1 = group1.var()
+    var2 = group2.var()
+
+    # Calculate the pooled threshold as shown earlier
+    pooled_var = (n1 * var1 + n2 * var2) / (n1 + n2)
+
+    # Calculate Cohen's d statistic
+    d = diff / np.sqrt(pooled_var)
+
+    return d
+
+
+def plot_pdfs(cohen_d=2):
+    """Plot PDFs for distributions that differ by some number of stds.
+
+    cohen_d: number of standard deviations between the means
+    """
+    group1 = scipy.stats.norm(0, 1)
+    group2 = scipy.stats.norm(cohen_d, 1)
+    xs, ys = evaluate_PDF(group1)
+    pyplot.fill_between(xs, ys, label='Group1', color='#ff2289', alpha=0.7)
+
+    xs, ys = evaluate_PDF(group2)
+    pyplot.fill_between(xs, ys, label='Group2', color='#376cb0', alpha=0.7)
+
+    o, s = overlap_superiority(group1, group2)
+    print('overlap', o)
+    print('superiority', s)
+
+def list2df(list):#, sort_values='index'):
+    """ Take in a list where row[0] = column_names and outputs a dataframe.
+
+    Keyword arguments:
+    set_index -- df.set_index(set_index)
+    sortby -- df.sorted()
+    """    
+
+    df_list = pd.DataFrame(list[1:],columns=list[0])
+#     df_list = df_list[1:]
+
+    return df_list
+
+def df_drop_regex(DF, regex_list):
+    '''Use a list of regex to remove columns names. Returns new df.
+
+    Parameters:
+        DF -- input dataframe to remove columns from.
+        regex_list -- list of string patterns or regexp to remove.
+
+    Returns:
+        df_cut -- input df without the dropped columns. 
+        '''
+    df_cut = DF.copy()
+
+    for r in regex_list:
+
+        df_cut = df_cut[df_cut.columns.drop(list(df_cut.filter(regex=r)))]
+        print(f'Removed {r}\n')
+
+    return df_cut
+
+
+
+####### MIKE's PLOTTING
+# plotting order totals per month in violin plots
+
+def make_violinplot(x,y, title=None, hue=None, ticklabels=None):
+
+  '''Plots a violin plot with horizontal mean line, inner stick lines'''
+
+  plt.style.use('dark_background')
+  fig,ax =plt.subplots(figsize=(12,10))
+
+
+  sns.violinplot(x, y,cut=2,split=True, scale='count', scale_hue=True,
+                 saturation=.5, alpha=.9,bw=.25, palette='Dark2',inner='stick', hue=hue).set_title(title)
+
+  ax.axhline(y.mean(),label='total mean', ls=':', alpha=.5, color='xkcd:yellow')
+  ax.set_xticklabels(ticklabels)
+
+  plt.legend()
+  plt.show()
+  x= df_year_orders['month']
+  y= df_year_orders['order_total']
+  title = 'Order totals per month with or without discounts'
+  hue=df_year_orders['Discount']>0
+
+
+### Example usage
+# #First, declare variables to be plotted
+# x = df_year_orders['month']
+# y = df_year_orders['order_total']
+# ticks = [v for v in month_dict.values()] 
+# title = 'Order totals per month with or without discounts'
+# hue = df_year_orders['Discount']>0
+
+### Then call function
+# make_violinplot(x,y,title,hue, ticks), 
+
+
+###########
+def plot_wide_kde_thin_bar(series1,sname1, series2, sname2):
+    '''Plot series1 and series 2 on wide kde plot with small mean+sem bar plot.'''
+
+    ## ADDING add_gridspec usage
+    import pandas as pd
+    import numpy as np
+    from scipy.stats import sem
+
+    import matplotlib.pyplot as plt
+    import matplotlib as mpl
+    import matplotlib.ticker as ticker
+
+    import seaborn as sns
+
+    from matplotlib import rcParams
+    from matplotlib import rc
+    rcParams['font.family'] = 'serif'
+
+
+
+
+    # Plot distributions of discounted vs full price groups
+    plt.style.use('default')
+    # with plt.style.context(('tableau-colorblind10')):
+    with plt.style.context(('seaborn-notebook')):
+
+
+
+        ## ----------- DEFINE AESTHETIC CUSTOMIZATIONS ----------- ##
+       # Axis Label fonts
+        fontSuptitle ={'fontsize': 22,
+                   'fontweight': 'bold',
+                    'fontfamily':'serif'}
+
+        fontTitle = {'fontsize': 10,
+                   'fontweight': 'medium',
+                    'fontfamily':'serif'}
+
+        fontAxis = {'fontsize': 10,
+                   'fontweight': 'medium',
+                    'fontfamily':'serif'}
+
+        fontTicks = {'fontsize': 8,
+                   'fontweight':'medium', 
+                    'fontfamily':'serif'}
+
+
+        ## --------- CREATE FIG BASED ON GRIDSPEC --------- ##
+
+        plt.suptitle('Quantity of Units Sold', fontdict = fontSuptitle)
+
+        # Create fig object and declare figsize
+        fig = plt.figure(constrained_layout=True, figsize=(8,3))
+
+
+        # Define gridspec to create grid coordinates             
+        gs = fig.add_gridspec(nrows=1,ncols=10)
+
+        # Assign grid space to ax with add_subplot
+        ax0 = fig.add_subplot(gs[0,0:7])
+        ax1 = fig.add_subplot(gs[0,7:10])
+
+        #Combine into 1 list
+        ax = [ax0,ax1]
+
+        ### ------------------  SUBPLOT 1  ------------------ ###
+
+        ## --------- Defining series1 and 2 for subplot 1------- ##
+        ax[0].set_title('Histogram + KDE',fontdict=fontTitle)
+
+        # Group 1: data, label, hist_kws and kde_kws
+        plotS1 = {'data': series1, 'label': sname1.title(),
+
+                   'hist_kws' :
+                    {'edgecolor': 'black', 'color':'darkgray','alpha': 0.8, 'lw':0.5},
+
+                   'kde_kws':
+                    {'color':'gray', 'linestyle': '--', 'linewidth':2,
+                     'label':'kde'}}
+
+        # Group 2: data, label, hist_kws and kde_kws
+        plotS2 = {'data': series2,
+                    'label': sname2.title(), 
+
+                    'hist_kws' :
+                    {'edgecolor': 'black','color':'green','alpha':0.8 ,'lw':0.5},
+
+
+                    'kde_kws':
+                    {'color':'darkgreen','linestyle':':','linewidth':3,'label':'kde'}}
+
+        # plot group 1
+        sns.distplot(plotS1['data'], label=plotS1['label'],
+
+                     hist_kws = plotS1['hist_kws'], kde_kws = plotS1['kde_kws'],
+
+                     ax=ax[0])   
+
+
+        # plot group 2
+        sns.distplot(plotS2['data'], label=plotS2['label'],
+
+                     hist_kws=plotS2['hist_kws'], kde_kws = plotS2['kde_kws'],
+
+                     ax=ax[0])
+
+
+        ax[0].set_xlabel(series1.name, fontdict=fontAxis)
+        ax[0].set_ylabel('Kernel Density Estimation',fontdict=fontAxis)
+
+        ax[0].tick_params(axis='both',labelsize=fontTicks['fontsize'])   
+        ax[0].legend()
+
+
+        ### ------------------  SUBPLOT 2  ------------------ ###
+
+        # Import scipy for error bars
+        from scipy.stats import sem
+
+        # Declare x y group labels(x) and bar heights(y)
+        x = [plotS1['label'], plotS2['label']]
+        y = [np.mean(plotS1['data']), np.mean(plotS2['data'])]
+
+        yerr = [sem(plotS1['data']), sem(plotS2['data'])]
+        err_kws = {'ecolor':'black','capsize':5,'capthick':1,'elinewidth':1}
+
+        # Create the bar plot
+        ax[1].bar(x,y,align='center', edgecolor='black', yerr=yerr,error_kw=err_kws,width=0.6)
+
+
+        # Customize subplot 2
+        ax[1].set_title('Average Quantities Sold',fontdict=fontTitle)
+        ax[1].set_ylabel('Mean +/- SEM ',fontdict=fontAxis)
+        ax[1].set_xlabel('')
+
+        ax[1].tick_params(axis=y,labelsize=fontTicks['fontsize'])
+        ax[1].tick_params(axis=x,labelsize=fontTicks['fontsize']) 
+
+        ax1=ax[1]
+        test = ax1.get_xticklabels()
+        labels = [x.get_text() for x in test]
+        ax1.set_xticklabels([plotS1['label'],plotS2['label']], rotation=45,ha='center')
+
+#         xlab = [x.get_text() for x in xlablist]
+#         ax[1].set_xticklabels(xlab,rotation=45)
+
+#         fig.savefig('H1_EDA_using_gridspec.png')
+#         plt.tight_layout()
+    #     print(f')
+        plt.show()
+
+        return fig,ax
+
+```
+
+
+```python
+
+```
+
+
+
+
+%package -n python3-JMI-MVM
+Summary:	A collection of our functions and classes from bootcamp.
+Provides:	python-JMI-MVM
+BuildRequires:	python3-devel
+BuildRequires:	python3-setuptools
+BuildRequires:	python3-pip
+%description -n python3-JMI-MVM
+# JMI_MVM
+
+- A collection of tools created for botcmap. 
+- More information to be added later.
+
+
+<h1>Table of Contents<span class="tocSkip"></span></h1>
+<div class="toc"><ul class="toc-item"></ul></div>
+
+
+```python
+name = "JMI_MVM"
+help_ = " Recommended Functions to try: \n calc_roc_auc & tune_params\n plot_hist_scat_sns & multiplot\n list2df & df_drop_regex\n plot_wide_kde_thin_bar & make_violinplot\n"
+#functions.py
+
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib as mpl
+import seaborn as sns
+
+
+def calc_roc_auc(X_test,y_test,dtc,verbose=False):
+    """Tests the results of an already-fit classifer. 
+    Takes X_test, y_test, classifer, verbose (True" print result)
+    Returns the AUC for the roc_curve as a %"""
+    y_pred = dtc.predict(X_test)
+
+    FP_rate, TP_rate, thresh = roc_curve(y_test,y_pred)
+    roc_auc = auc(FP_rate,TP_rate)
+    roc_auc_perc = round(roc_auc*100,3)
+    # Your code here 
+    if verbose:
+        print(f"roc_curve's auc = {roc_auc_perc}%")
+    return roc_auc_perc
+
+def tune_params(param_name, param_values):
+    """Takes in param_name to tune with param_values, plots train vs test AUC's. 
+    Returns df_results and df_style with color coded results"""
+    res_list = [[param_name,'train_roc_auc','test_roc_auc']]
+
+    # Loop through all values in param_values
+    for value in param_values:
+        # Create Model, set params
+        dtc_temp = DecisionTreeClassifier(criterion='entropy')
+        params={param_name:value}
+        dtc_temp.set_params(**params)
+
+        # Fit model
+        dtc_temp.fit(X_train, y_train)
+
+        # Get roc_auc for training data
+        train_roc_auc = calc_roc_auc(X_train,y_train,dtc_temp)
+        # Get roc_auc for test data
+        test_res_roc_auc = calc_roc_auc(X_test,y_test,dtc_temp)
+        # Append value and results to res_list
+        res_list.append([value,train_roc_auc,test_res_roc_auc])
+
+    # Turn results into df_results (basically same as using list2df)
+    df_results = pd.DataFrame(res_list[1:],columns=res_list[0])
+    df_results.set_index(param_name,inplace=True)
+
+    # Plot df_results
+    df_results.plot()
+
+    # Color-coded dataframe s
+    import seaborn as sns
+    cm = sns.light_palette("green", as_cmap=True)
+    df_syle = df_results.style.background_gradient(cmap=cm)#,low=results.min(),high=results.max())
+
+    return df_results, df_syle
+
+
+# MULTIPLOT
+from string import ascii_letters
+import numpy as np
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+
+def multiplot(df):
+    """Plots results from df.corr() in a correlation heat map for multicollinearity.
+    Returns fig, ax objects"""
+    sns.set(style="white")
+
+    # Compute the correlation matrix
+    corr = df.corr()
+
+    # Generate a mask for the upper triangle
+    mask = np.zeros_like(corr, dtype=np.bool)
+    mask[np.triu_indices_from(mask)] = True
+
+    # Set up the matplotlib figure
+    f, ax = plt.subplots(figsize=(16, 16))
+
+    # Generate a custom diverging colormap
+    cmap = sns.diverging_palette(220, 10, as_cmap=True)
+
+    # Draw the heatmap with the mask and correct aspect ratio
+    sns.heatmap(corr, mask=mask, annot=True, cmap=cmap, center=0,
+
+    square=True, linewidths=.5, cbar_kws={"shrink": .5})
+    return f, ax
+
+
+
+# Plots histogram and scatter (vs price) side by side
+# Plots histogram and scatter (vs price) side by side
+def plot_hist_scat_sns(df, target='index'):
+    """Plots seaborne distplots and regplots for columns im datamframe vs target.
+
+    Parameters:
+    df (DataFrame): DataFrame.describe() columns will be used. 
+    target = name of column containing target variable.assume first coluumn. 
+
+    Returns:
+    Figures for each column vs target with 2 subplots.
+   """
+    import matplotlib.ticker as mtick
+    import matplotlib.pyplot as plt
+    import seaborn as sns
+
+    with plt.style.context(('dark_background')):
+        ###  DEFINE AESTHETIC CUSTOMIZATIONS  -------------------------------##
+
+
+#         plt.style.use('dark_background')
+        figsize=(9,7)
+
+        # Axis Label fonts
+        fontTitle = {'fontsize': 14,
+                   'fontweight': 'bold',
+                    'fontfamily':'serif'}
+
+        fontAxis = {'fontsize': 12,
+                   'fontweight': 'medium',
+                    'fontfamily':'serif'}
+
+        fontTicks = {'fontsize': 8,
+                   'fontweight':'medium',
+                    'fontfamily':'serif'}
+
+        # Formatting dollar sign labels
+        fmtPrice = '${x:,.0f}'
+        tickPrice = mtick.StrMethodFormatter(fmtPrice)
+
+
+        ###  PLOTTING ----------------------------- ------------------------ ##
+
+        # Loop through dataframe to plot
+        for column in df.describe():
+#             print(f'\nCurrent column: {column}')
+
+            # Create figure with subplots for current column
+            fig, ax = plt.subplots(figsize=figsize, ncols=2, nrows=2)
+
+            ##  SUBPLOT 1 --------------------------------------------------##
+            i,j = 0,0
+            ax[i,j].set_title(column.capitalize(),fontdict=fontTitle)
+
+            # Define graphing keyword dictionaries for distplot (Subplot 1)
+            hist_kws = {"linewidth": 1, "alpha": 1, "color": 'blue','edgecolor':'w'}
+            kde_kws = {"color": "white", "linewidth": 1, "label": "KDE"}
+
+            # Plot distplot on ax[i,j] using hist_kws and kde_kws
+            sns.distplot(df[column], norm_hist=True, kde=True,
+                         hist_kws = hist_kws, kde_kws = kde_kws,
+                         label=column+' histogram', ax=ax[i,j])
+
+
+            # Set x axis label
+            ax[i,j].set_xlabel(column.title(),fontdict=fontAxis)
+
+            # Get x-ticks, rotate labels, and return
+            xticklab1 = ax[i,j].get_xticklabels(which = 'both')
+            ax[i,j].set_xticklabels(labels=xticklab1, fontdict=fontTicks, rotation=0)
+            ax[i,j].xaxis.set_major_formatter(mtick.ScalarFormatter())
+
+
+            # Set y-label 
+            ax[i,j].set_ylabel('Density',fontdict=fontAxis)
+            yticklab1=ax[i,j].get_yticklabels(which='both')
+            ax[i,j].set_yticklabels(labels=yticklab1,fontdict=fontTicks)
+            ax[i,j].yaxis.set_major_formatter(mtick.ScalarFormatter())
+
+
+            # Set y-grid
+            ax[i, j].set_axisbelow(True)
+            ax[i, j].grid(axis='y',ls='--')
+
+
+
+
+            ##  SUBPLOT 2-------------------------------------------------- ##
+            i,j = 0,1
+            ax[i,j].set_title(column.capitalize(),fontdict=fontTitle)
+
+            # Define the kwd dictionaries for scatter and regression line (subplot 2)
+            line_kws={"color":"white","alpha":0.5,"lw":4,"ls":":"}
+            scatter_kws={'s': 2, 'alpha': 0.5,'marker':'.','color':'blue'}
+
+            # Plot regplot on ax[i,j] using line_kws and scatter_kws
+            sns.regplot(df[column], df[target], 
+                        line_kws = line_kws,
+                        scatter_kws = scatter_kws,
+                        ax=ax[i,j])
+
+            # Set x-axis label
+            ax[i,j].set_xlabel(column.title(),fontdict=fontAxis)
+
+             # Get x ticks, rotate labels, and return
+            xticklab2=ax[i,j].get_xticklabels(which='both')
+            ax[i,j].set_xticklabels(labels=xticklab2,fontdict=fontTicks, rotation=0)
+            ax[i,j].xaxis.set_major_formatter(mtick.ScalarFormatter())
+
+            # Set  y-axis label
+            ax[i,j].set_ylabel(target,fontdict=fontAxis)
+
+            # Get, set, and format y-axis Price labels
+            yticklab = ax[i,j].get_yticklabels()
+            ax[i,j].set_yticklabels(yticklab,fontdict=fontTicks)
+            ax[i,j].yaxis.set_major_formatter(mtick.ScalarFormatter())
+
+    #         ax[i,j].get_yaxis().set_major_formatter(tickPrice) 
+
+            # Set y-grid
+            ax[i, j].set_axisbelow(True)
+            ax[i, j].grid(axis='y',ls='--')       
+
+            ## ---------- Final layout adjustments ----------- ##
+            # Deleted unused subplots 
+            fig.delaxes(ax[1,1])
+            fig.delaxes(ax[1,0])
+
+            # Optimizing spatial layout
+            fig.tight_layout()
+            figtitle=column+'_dist_regr_plots.png'
+#             plt.savefig(figtitle)
+    return 
+
+# Tukey's method using IQR to eliminate 
+def detect_outliers(df, n, features):
+    """Uses Tukey's method to return outer of interquartile ranges to return indices if outliers in a dataframe.
+    Parameters:
+    df (DataFrame): DataFrane containing columns of features
+    n: default is 0, multiple outlier cutoff  
+
+    Returns:
+    Index of outliers for .loc
+
+    Examples:
+    Outliers_to_drop = detect_outliers(data,2,["col1","col2"]) Returning value
+    df.loc[Outliers_to_drop] # Show the outliers rows
+    data= data.drop(Outliers_to_drop, axis = 0).reset_index(drop=True)
+"""
+
+# Drop outliers    
+
+    outlier_indices = []
+    # iterate over features(columns)
+    for col in features:
+
+        # 1st quartile (25%)
+        Q1 = np.percentile(df[col], 25)
+        # 3rd quartile (75%)
+        Q3 = np.percentile(df[col],75)
+
+        # Interquartile range (IQR)
+        IQR = Q3 - Q1
+        # outlier step
+        outlier_step = 1.5 * IQR
+
+        # Determine a list of indices of outliers for feature col
+        outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step )].index
+
+        # append the found outlier indices for col to the list of outlier indices 
+        outlier_indices.extend(outlier_list_col)
+
+        # select observations containing more than 2 outliers
+        outlier_indices = Counter(outlier_indices)        
+        multiple_outliers = list( k for k, v in outlier_indices.items() if v > n )
+    return multiple_outliers 
+
+
+# describe_outliers -- calls detect_outliers
+def describe_outliers(df):
+    """ Returns a new_df of outliers, and % outliers each col using detect_outliers.
+    """
+    out_count = 0
+    new_df = pd.DataFrame(columns=['total_outliers', 'percent_total'])
+    for col in df.columns:
+        outies = detect_outliers(df[col])
+        out_count += len(outies) 
+        new_df.loc[col] = [len(outies), round((len(outies)/len(df.index))*100, 2)]
+    new_df.loc['grand_total'] = [sum(new_df['total_outliers']), sum(new_df['percent_total'])]
+    return new_df
+
+
+#### Cohen's d
+def Cohen_d(group1, group2):
+    '''Compute Cohen's d.
+    # group1: Series or NumPy array
+    # group2: Series or NumPy array
+    # returns a floating point number 
+    '''
+    diff = group1.mean() - group2.mean()
+
+    n1, n2 = len(group1), len(group2)
+    var1 = group1.var()
+    var2 = group2.var()
+
+    # Calculate the pooled threshold as shown earlier
+    pooled_var = (n1 * var1 + n2 * var2) / (n1 + n2)
+
+    # Calculate Cohen's d statistic
+    d = diff / np.sqrt(pooled_var)
+
+    return d
+
+
+def plot_pdfs(cohen_d=2):
+    """Plot PDFs for distributions that differ by some number of stds.
+
+    cohen_d: number of standard deviations between the means
+    """
+    group1 = scipy.stats.norm(0, 1)
+    group2 = scipy.stats.norm(cohen_d, 1)
+    xs, ys = evaluate_PDF(group1)
+    pyplot.fill_between(xs, ys, label='Group1', color='#ff2289', alpha=0.7)
+
+    xs, ys = evaluate_PDF(group2)
+    pyplot.fill_between(xs, ys, label='Group2', color='#376cb0', alpha=0.7)
+
+    o, s = overlap_superiority(group1, group2)
+    print('overlap', o)
+    print('superiority', s)
+
+def list2df(list):#, sort_values='index'):
+    """ Take in a list where row[0] = column_names and outputs a dataframe.
+
+    Keyword arguments:
+    set_index -- df.set_index(set_index)
+    sortby -- df.sorted()
+    """    
+
+    df_list = pd.DataFrame(list[1:],columns=list[0])
+#     df_list = df_list[1:]
+
+    return df_list
+
+def df_drop_regex(DF, regex_list):
+    '''Use a list of regex to remove columns names. Returns new df.
+
+    Parameters:
+        DF -- input dataframe to remove columns from.
+        regex_list -- list of string patterns or regexp to remove.
+
+    Returns:
+        df_cut -- input df without the dropped columns. 
+        '''
+    df_cut = DF.copy()
+
+    for r in regex_list:
+
+        df_cut = df_cut[df_cut.columns.drop(list(df_cut.filter(regex=r)))]
+        print(f'Removed {r}\n')
+
+    return df_cut
+
+
+
+####### MIKE's PLOTTING
+# plotting order totals per month in violin plots
+
+def make_violinplot(x,y, title=None, hue=None, ticklabels=None):
+
+  '''Plots a violin plot with horizontal mean line, inner stick lines'''
+
+  plt.style.use('dark_background')
+  fig,ax =plt.subplots(figsize=(12,10))
+
+
+  sns.violinplot(x, y,cut=2,split=True, scale='count', scale_hue=True,
+                 saturation=.5, alpha=.9,bw=.25, palette='Dark2',inner='stick', hue=hue).set_title(title)
+
+  ax.axhline(y.mean(),label='total mean', ls=':', alpha=.5, color='xkcd:yellow')
+  ax.set_xticklabels(ticklabels)
+
+  plt.legend()
+  plt.show()
+  x= df_year_orders['month']
+  y= df_year_orders['order_total']
+  title = 'Order totals per month with or without discounts'
+  hue=df_year_orders['Discount']>0
+
+
+### Example usage
+# #First, declare variables to be plotted
+# x = df_year_orders['month']
+# y = df_year_orders['order_total']
+# ticks = [v for v in month_dict.values()] 
+# title = 'Order totals per month with or without discounts'
+# hue = df_year_orders['Discount']>0
+
+### Then call function
+# make_violinplot(x,y,title,hue, ticks), 
+
+
+###########
+def plot_wide_kde_thin_bar(series1,sname1, series2, sname2):
+    '''Plot series1 and series 2 on wide kde plot with small mean+sem bar plot.'''
+
+    ## ADDING add_gridspec usage
+    import pandas as pd
+    import numpy as np
+    from scipy.stats import sem
+
+    import matplotlib.pyplot as plt
+    import matplotlib as mpl
+    import matplotlib.ticker as ticker
+
+    import seaborn as sns
+
+    from matplotlib import rcParams
+    from matplotlib import rc
+    rcParams['font.family'] = 'serif'
+
+
+
+
+    # Plot distributions of discounted vs full price groups
+    plt.style.use('default')
+    # with plt.style.context(('tableau-colorblind10')):
+    with plt.style.context(('seaborn-notebook')):
+
+
+
+        ## ----------- DEFINE AESTHETIC CUSTOMIZATIONS ----------- ##
+       # Axis Label fonts
+        fontSuptitle ={'fontsize': 22,
+                   'fontweight': 'bold',
+                    'fontfamily':'serif'}
+
+        fontTitle = {'fontsize': 10,
+                   'fontweight': 'medium',
+                    'fontfamily':'serif'}
+
+        fontAxis = {'fontsize': 10,
+                   'fontweight': 'medium',
+                    'fontfamily':'serif'}
+
+        fontTicks = {'fontsize': 8,
+                   'fontweight':'medium', 
+                    'fontfamily':'serif'}
+
+
+        ## --------- CREATE FIG BASED ON GRIDSPEC --------- ##
+
+        plt.suptitle('Quantity of Units Sold', fontdict = fontSuptitle)
+
+        # Create fig object and declare figsize
+        fig = plt.figure(constrained_layout=True, figsize=(8,3))
+
+
+        # Define gridspec to create grid coordinates             
+        gs = fig.add_gridspec(nrows=1,ncols=10)
+
+        # Assign grid space to ax with add_subplot
+        ax0 = fig.add_subplot(gs[0,0:7])
+        ax1 = fig.add_subplot(gs[0,7:10])
+
+        #Combine into 1 list
+        ax = [ax0,ax1]
+
+        ### ------------------  SUBPLOT 1  ------------------ ###
+
+        ## --------- Defining series1 and 2 for subplot 1------- ##
+        ax[0].set_title('Histogram + KDE',fontdict=fontTitle)
+
+        # Group 1: data, label, hist_kws and kde_kws
+        plotS1 = {'data': series1, 'label': sname1.title(),
+
+                   'hist_kws' :
+                    {'edgecolor': 'black', 'color':'darkgray','alpha': 0.8, 'lw':0.5},
+
+                   'kde_kws':
+                    {'color':'gray', 'linestyle': '--', 'linewidth':2,
+                     'label':'kde'}}
+
+        # Group 2: data, label, hist_kws and kde_kws
+        plotS2 = {'data': series2,
+                    'label': sname2.title(), 
+
+                    'hist_kws' :
+                    {'edgecolor': 'black','color':'green','alpha':0.8 ,'lw':0.5},
+
+
+                    'kde_kws':
+                    {'color':'darkgreen','linestyle':':','linewidth':3,'label':'kde'}}
+
+        # plot group 1
+        sns.distplot(plotS1['data'], label=plotS1['label'],
+
+                     hist_kws = plotS1['hist_kws'], kde_kws = plotS1['kde_kws'],
+
+                     ax=ax[0])   
+
+
+        # plot group 2
+        sns.distplot(plotS2['data'], label=plotS2['label'],
+
+                     hist_kws=plotS2['hist_kws'], kde_kws = plotS2['kde_kws'],
+
+                     ax=ax[0])
+
+
+        ax[0].set_xlabel(series1.name, fontdict=fontAxis)
+        ax[0].set_ylabel('Kernel Density Estimation',fontdict=fontAxis)
+
+        ax[0].tick_params(axis='both',labelsize=fontTicks['fontsize'])   
+        ax[0].legend()
+
+
+        ### ------------------  SUBPLOT 2  ------------------ ###
+
+        # Import scipy for error bars
+        from scipy.stats import sem
+
+        # Declare x y group labels(x) and bar heights(y)
+        x = [plotS1['label'], plotS2['label']]
+        y = [np.mean(plotS1['data']), np.mean(plotS2['data'])]
+
+        yerr = [sem(plotS1['data']), sem(plotS2['data'])]
+        err_kws = {'ecolor':'black','capsize':5,'capthick':1,'elinewidth':1}
+
+        # Create the bar plot
+        ax[1].bar(x,y,align='center', edgecolor='black', yerr=yerr,error_kw=err_kws,width=0.6)
+
+
+        # Customize subplot 2
+        ax[1].set_title('Average Quantities Sold',fontdict=fontTitle)
+        ax[1].set_ylabel('Mean +/- SEM ',fontdict=fontAxis)
+        ax[1].set_xlabel('')
+
+        ax[1].tick_params(axis=y,labelsize=fontTicks['fontsize'])
+        ax[1].tick_params(axis=x,labelsize=fontTicks['fontsize']) 
+
+        ax1=ax[1]
+        test = ax1.get_xticklabels()
+        labels = [x.get_text() for x in test]
+        ax1.set_xticklabels([plotS1['label'],plotS2['label']], rotation=45,ha='center')
+
+#         xlab = [x.get_text() for x in xlablist]
+#         ax[1].set_xticklabels(xlab,rotation=45)
+
+#         fig.savefig('H1_EDA_using_gridspec.png')
+#         plt.tight_layout()
+    #     print(f')
+        plt.show()
+
+        return fig,ax
+
+```
+
+
+```python
+
+```
+
+
+
+
+%package help
+Summary:	Development documents and examples for JMI-MVM
+Provides:	python3-JMI-MVM-doc
+%description help
+# JMI_MVM
+
+- A collection of tools created for botcmap. 
+- More information to be added later.
+
+
+<h1>Table of Contents<span class="tocSkip"></span></h1>
+<div class="toc"><ul class="toc-item"></ul></div>
+
+
+```python
+name = "JMI_MVM"
+help_ = " Recommended Functions to try: \n calc_roc_auc & tune_params\n plot_hist_scat_sns & multiplot\n list2df & df_drop_regex\n plot_wide_kde_thin_bar & make_violinplot\n"
+#functions.py
+
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib as mpl
+import seaborn as sns
+
+
+def calc_roc_auc(X_test,y_test,dtc,verbose=False):
+    """Tests the results of an already-fit classifer. 
+    Takes X_test, y_test, classifer, verbose (True" print result)
+    Returns the AUC for the roc_curve as a %"""
+    y_pred = dtc.predict(X_test)
+
+    FP_rate, TP_rate, thresh = roc_curve(y_test,y_pred)
+    roc_auc = auc(FP_rate,TP_rate)
+    roc_auc_perc = round(roc_auc*100,3)
+    # Your code here 
+    if verbose:
+        print(f"roc_curve's auc = {roc_auc_perc}%")
+    return roc_auc_perc
+
+def tune_params(param_name, param_values):
+    """Takes in param_name to tune with param_values, plots train vs test AUC's. 
+    Returns df_results and df_style with color coded results"""
+    res_list = [[param_name,'train_roc_auc','test_roc_auc']]
+
+    # Loop through all values in param_values
+    for value in param_values:
+        # Create Model, set params
+        dtc_temp = DecisionTreeClassifier(criterion='entropy')
+        params={param_name:value}
+        dtc_temp.set_params(**params)
+
+        # Fit model
+        dtc_temp.fit(X_train, y_train)
+
+        # Get roc_auc for training data
+        train_roc_auc = calc_roc_auc(X_train,y_train,dtc_temp)
+        # Get roc_auc for test data
+        test_res_roc_auc = calc_roc_auc(X_test,y_test,dtc_temp)
+        # Append value and results to res_list
+        res_list.append([value,train_roc_auc,test_res_roc_auc])
+
+    # Turn results into df_results (basically same as using list2df)
+    df_results = pd.DataFrame(res_list[1:],columns=res_list[0])
+    df_results.set_index(param_name,inplace=True)
+
+    # Plot df_results
+    df_results.plot()
+
+    # Color-coded dataframe s
+    import seaborn as sns
+    cm = sns.light_palette("green", as_cmap=True)
+    df_syle = df_results.style.background_gradient(cmap=cm)#,low=results.min(),high=results.max())
+
+    return df_results, df_syle
+
+
+# MULTIPLOT
+from string import ascii_letters
+import numpy as np
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+
+def multiplot(df):
+    """Plots results from df.corr() in a correlation heat map for multicollinearity.
+    Returns fig, ax objects"""
+    sns.set(style="white")
+
+    # Compute the correlation matrix
+    corr = df.corr()
+
+    # Generate a mask for the upper triangle
+    mask = np.zeros_like(corr, dtype=np.bool)
+    mask[np.triu_indices_from(mask)] = True
+
+    # Set up the matplotlib figure
+    f, ax = plt.subplots(figsize=(16, 16))
+
+    # Generate a custom diverging colormap
+    cmap = sns.diverging_palette(220, 10, as_cmap=True)
+
+    # Draw the heatmap with the mask and correct aspect ratio
+    sns.heatmap(corr, mask=mask, annot=True, cmap=cmap, center=0,
+
+    square=True, linewidths=.5, cbar_kws={"shrink": .5})
+    return f, ax
+
+
+
+# Plots histogram and scatter (vs price) side by side
+# Plots histogram and scatter (vs price) side by side
+def plot_hist_scat_sns(df, target='index'):
+    """Plots seaborne distplots and regplots for columns im datamframe vs target.
+
+    Parameters:
+    df (DataFrame): DataFrame.describe() columns will be used. 
+    target = name of column containing target variable.assume first coluumn. 
+
+    Returns:
+    Figures for each column vs target with 2 subplots.
+   """
+    import matplotlib.ticker as mtick
+    import matplotlib.pyplot as plt
+    import seaborn as sns
+
+    with plt.style.context(('dark_background')):
+        ###  DEFINE AESTHETIC CUSTOMIZATIONS  -------------------------------##
+
+
+#         plt.style.use('dark_background')
+        figsize=(9,7)
+
+        # Axis Label fonts
+        fontTitle = {'fontsize': 14,
+                   'fontweight': 'bold',
+                    'fontfamily':'serif'}
+
+        fontAxis = {'fontsize': 12,
+                   'fontweight': 'medium',
+                    'fontfamily':'serif'}
+
+        fontTicks = {'fontsize': 8,
+                   'fontweight':'medium',
+                    'fontfamily':'serif'}
+
+        # Formatting dollar sign labels
+        fmtPrice = '${x:,.0f}'
+        tickPrice = mtick.StrMethodFormatter(fmtPrice)
+
+
+        ###  PLOTTING ----------------------------- ------------------------ ##
+
+        # Loop through dataframe to plot
+        for column in df.describe():
+#             print(f'\nCurrent column: {column}')
+
+            # Create figure with subplots for current column
+            fig, ax = plt.subplots(figsize=figsize, ncols=2, nrows=2)
+
+            ##  SUBPLOT 1 --------------------------------------------------##
+            i,j = 0,0
+            ax[i,j].set_title(column.capitalize(),fontdict=fontTitle)
+
+            # Define graphing keyword dictionaries for distplot (Subplot 1)
+            hist_kws = {"linewidth": 1, "alpha": 1, "color": 'blue','edgecolor':'w'}
+            kde_kws = {"color": "white", "linewidth": 1, "label": "KDE"}
+
+            # Plot distplot on ax[i,j] using hist_kws and kde_kws
+            sns.distplot(df[column], norm_hist=True, kde=True,
+                         hist_kws = hist_kws, kde_kws = kde_kws,
+                         label=column+' histogram', ax=ax[i,j])
+
+
+            # Set x axis label
+            ax[i,j].set_xlabel(column.title(),fontdict=fontAxis)
+
+            # Get x-ticks, rotate labels, and return
+            xticklab1 = ax[i,j].get_xticklabels(which = 'both')
+            ax[i,j].set_xticklabels(labels=xticklab1, fontdict=fontTicks, rotation=0)
+            ax[i,j].xaxis.set_major_formatter(mtick.ScalarFormatter())
+
+
+            # Set y-label 
+            ax[i,j].set_ylabel('Density',fontdict=fontAxis)
+            yticklab1=ax[i,j].get_yticklabels(which='both')
+            ax[i,j].set_yticklabels(labels=yticklab1,fontdict=fontTicks)
+            ax[i,j].yaxis.set_major_formatter(mtick.ScalarFormatter())
+
+
+            # Set y-grid
+            ax[i, j].set_axisbelow(True)
+            ax[i, j].grid(axis='y',ls='--')
+
+
+
+
+            ##  SUBPLOT 2-------------------------------------------------- ##
+            i,j = 0,1
+            ax[i,j].set_title(column.capitalize(),fontdict=fontTitle)
+
+            # Define the kwd dictionaries for scatter and regression line (subplot 2)
+            line_kws={"color":"white","alpha":0.5,"lw":4,"ls":":"}
+            scatter_kws={'s': 2, 'alpha': 0.5,'marker':'.','color':'blue'}
+
+            # Plot regplot on ax[i,j] using line_kws and scatter_kws
+            sns.regplot(df[column], df[target], 
+                        line_kws = line_kws,
+                        scatter_kws = scatter_kws,
+                        ax=ax[i,j])
+
+            # Set x-axis label
+            ax[i,j].set_xlabel(column.title(),fontdict=fontAxis)
+
+             # Get x ticks, rotate labels, and return
+            xticklab2=ax[i,j].get_xticklabels(which='both')
+            ax[i,j].set_xticklabels(labels=xticklab2,fontdict=fontTicks, rotation=0)
+            ax[i,j].xaxis.set_major_formatter(mtick.ScalarFormatter())
+
+            # Set  y-axis label
+            ax[i,j].set_ylabel(target,fontdict=fontAxis)
+
+            # Get, set, and format y-axis Price labels
+            yticklab = ax[i,j].get_yticklabels()
+            ax[i,j].set_yticklabels(yticklab,fontdict=fontTicks)
+            ax[i,j].yaxis.set_major_formatter(mtick.ScalarFormatter())
+
+    #         ax[i,j].get_yaxis().set_major_formatter(tickPrice) 
+
+            # Set y-grid
+            ax[i, j].set_axisbelow(True)
+            ax[i, j].grid(axis='y',ls='--')       
+
+            ## ---------- Final layout adjustments ----------- ##
+            # Deleted unused subplots 
+            fig.delaxes(ax[1,1])
+            fig.delaxes(ax[1,0])
+
+            # Optimizing spatial layout
+            fig.tight_layout()
+            figtitle=column+'_dist_regr_plots.png'
+#             plt.savefig(figtitle)
+    return 
+
+# Tukey's method using IQR to eliminate 
+def detect_outliers(df, n, features):
+    """Uses Tukey's method to return outer of interquartile ranges to return indices if outliers in a dataframe.
+    Parameters:
+    df (DataFrame): DataFrane containing columns of features
+    n: default is 0, multiple outlier cutoff  
+
+    Returns:
+    Index of outliers for .loc
+
+    Examples:
+    Outliers_to_drop = detect_outliers(data,2,["col1","col2"]) Returning value
+    df.loc[Outliers_to_drop] # Show the outliers rows
+    data= data.drop(Outliers_to_drop, axis = 0).reset_index(drop=True)
+"""
+
+# Drop outliers    
+
+    outlier_indices = []
+    # iterate over features(columns)
+    for col in features:
+
+        # 1st quartile (25%)
+        Q1 = np.percentile(df[col], 25)
+        # 3rd quartile (75%)
+        Q3 = np.percentile(df[col],75)
+
+        # Interquartile range (IQR)
+        IQR = Q3 - Q1
+        # outlier step
+        outlier_step = 1.5 * IQR
+
+        # Determine a list of indices of outliers for feature col
+        outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step )].index
+
+        # append the found outlier indices for col to the list of outlier indices 
+        outlier_indices.extend(outlier_list_col)
+
+        # select observations containing more than 2 outliers
+        outlier_indices = Counter(outlier_indices)        
+        multiple_outliers = list( k for k, v in outlier_indices.items() if v > n )
+    return multiple_outliers 
+
+
+# describe_outliers -- calls detect_outliers
+def describe_outliers(df):
+    """ Returns a new_df of outliers, and % outliers each col using detect_outliers.
+    """
+    out_count = 0
+    new_df = pd.DataFrame(columns=['total_outliers', 'percent_total'])
+    for col in df.columns:
+        outies = detect_outliers(df[col])
+        out_count += len(outies) 
+        new_df.loc[col] = [len(outies), round((len(outies)/len(df.index))*100, 2)]
+    new_df.loc['grand_total'] = [sum(new_df['total_outliers']), sum(new_df['percent_total'])]
+    return new_df
+
+
+#### Cohen's d
+def Cohen_d(group1, group2):
+    '''Compute Cohen's d.
+    # group1: Series or NumPy array
+    # group2: Series or NumPy array
+    # returns a floating point number 
+    '''
+    diff = group1.mean() - group2.mean()
+
+    n1, n2 = len(group1), len(group2)
+    var1 = group1.var()
+    var2 = group2.var()
+
+    # Calculate the pooled threshold as shown earlier
+    pooled_var = (n1 * var1 + n2 * var2) / (n1 + n2)
+
+    # Calculate Cohen's d statistic
+    d = diff / np.sqrt(pooled_var)
+
+    return d
+
+
+def plot_pdfs(cohen_d=2):
+    """Plot PDFs for distributions that differ by some number of stds.
+
+    cohen_d: number of standard deviations between the means
+    """
+    group1 = scipy.stats.norm(0, 1)
+    group2 = scipy.stats.norm(cohen_d, 1)
+    xs, ys = evaluate_PDF(group1)
+    pyplot.fill_between(xs, ys, label='Group1', color='#ff2289', alpha=0.7)
+
+    xs, ys = evaluate_PDF(group2)
+    pyplot.fill_between(xs, ys, label='Group2', color='#376cb0', alpha=0.7)
+
+    o, s = overlap_superiority(group1, group2)
+    print('overlap', o)
+    print('superiority', s)
+
+def list2df(list):#, sort_values='index'):
+    """ Take in a list where row[0] = column_names and outputs a dataframe.
+
+    Keyword arguments:
+    set_index -- df.set_index(set_index)
+    sortby -- df.sorted()
+    """    
+
+    df_list = pd.DataFrame(list[1:],columns=list[0])
+#     df_list = df_list[1:]
+
+    return df_list
+
+def df_drop_regex(DF, regex_list):
+    '''Use a list of regex to remove columns names. Returns new df.
+
+    Parameters:
+        DF -- input dataframe to remove columns from.
+        regex_list -- list of string patterns or regexp to remove.
+
+    Returns:
+        df_cut -- input df without the dropped columns. 
+        '''
+    df_cut = DF.copy()
+
+    for r in regex_list:
+
+        df_cut = df_cut[df_cut.columns.drop(list(df_cut.filter(regex=r)))]
+        print(f'Removed {r}\n')
+
+    return df_cut
+
+
+
+####### MIKE's PLOTTING
+# plotting order totals per month in violin plots
+
+def make_violinplot(x,y, title=None, hue=None, ticklabels=None):
+
+  '''Plots a violin plot with horizontal mean line, inner stick lines'''
+
+  plt.style.use('dark_background')
+  fig,ax =plt.subplots(figsize=(12,10))
+
+
+  sns.violinplot(x, y,cut=2,split=True, scale='count', scale_hue=True,
+                 saturation=.5, alpha=.9,bw=.25, palette='Dark2',inner='stick', hue=hue).set_title(title)
+
+  ax.axhline(y.mean(),label='total mean', ls=':', alpha=.5, color='xkcd:yellow')
+  ax.set_xticklabels(ticklabels)
+
+  plt.legend()
+  plt.show()
+  x= df_year_orders['month']
+  y= df_year_orders['order_total']
+  title = 'Order totals per month with or without discounts'
+  hue=df_year_orders['Discount']>0
+
+
+### Example usage
+# #First, declare variables to be plotted
+# x = df_year_orders['month']
+# y = df_year_orders['order_total']
+# ticks = [v for v in month_dict.values()] 
+# title = 'Order totals per month with or without discounts'
+# hue = df_year_orders['Discount']>0
+
+### Then call function
+# make_violinplot(x,y,title,hue, ticks), 
+
+
+###########
+def plot_wide_kde_thin_bar(series1,sname1, series2, sname2):
+    '''Plot series1 and series 2 on wide kde plot with small mean+sem bar plot.'''
+
+    ## ADDING add_gridspec usage
+    import pandas as pd
+    import numpy as np
+    from scipy.stats import sem
+
+    import matplotlib.pyplot as plt
+    import matplotlib as mpl
+    import matplotlib.ticker as ticker
+
+    import seaborn as sns
+
+    from matplotlib import rcParams
+    from matplotlib import rc
+    rcParams['font.family'] = 'serif'
+
+
+
+
+    # Plot distributions of discounted vs full price groups
+    plt.style.use('default')
+    # with plt.style.context(('tableau-colorblind10')):
+    with plt.style.context(('seaborn-notebook')):
+
+
+
+        ## ----------- DEFINE AESTHETIC CUSTOMIZATIONS ----------- ##
+       # Axis Label fonts
+        fontSuptitle ={'fontsize': 22,
+                   'fontweight': 'bold',
+                    'fontfamily':'serif'}
+
+        fontTitle = {'fontsize': 10,
+                   'fontweight': 'medium',
+                    'fontfamily':'serif'}
+
+        fontAxis = {'fontsize': 10,
+                   'fontweight': 'medium',
+                    'fontfamily':'serif'}
+
+        fontTicks = {'fontsize': 8,
+                   'fontweight':'medium', 
+                    'fontfamily':'serif'}
+
+
+        ## --------- CREATE FIG BASED ON GRIDSPEC --------- ##
+
+        plt.suptitle('Quantity of Units Sold', fontdict = fontSuptitle)
+
+        # Create fig object and declare figsize
+        fig = plt.figure(constrained_layout=True, figsize=(8,3))
+
+
+        # Define gridspec to create grid coordinates             
+        gs = fig.add_gridspec(nrows=1,ncols=10)
+
+        # Assign grid space to ax with add_subplot
+        ax0 = fig.add_subplot(gs[0,0:7])
+        ax1 = fig.add_subplot(gs[0,7:10])
+
+        #Combine into 1 list
+        ax = [ax0,ax1]
+
+        ### ------------------  SUBPLOT 1  ------------------ ###
+
+        ## --------- Defining series1 and 2 for subplot 1------- ##
+        ax[0].set_title('Histogram + KDE',fontdict=fontTitle)
+
+        # Group 1: data, label, hist_kws and kde_kws
+        plotS1 = {'data': series1, 'label': sname1.title(),
+
+                   'hist_kws' :
+                    {'edgecolor': 'black', 'color':'darkgray','alpha': 0.8, 'lw':0.5},
+
+                   'kde_kws':
+                    {'color':'gray', 'linestyle': '--', 'linewidth':2,
+                     'label':'kde'}}
+
+        # Group 2: data, label, hist_kws and kde_kws
+        plotS2 = {'data': series2,
+                    'label': sname2.title(), 
+
+                    'hist_kws' :
+                    {'edgecolor': 'black','color':'green','alpha':0.8 ,'lw':0.5},
+
+
+                    'kde_kws':
+                    {'color':'darkgreen','linestyle':':','linewidth':3,'label':'kde'}}
+
+        # plot group 1
+        sns.distplot(plotS1['data'], label=plotS1['label'],
+
+                     hist_kws = plotS1['hist_kws'], kde_kws = plotS1['kde_kws'],
+
+                     ax=ax[0])   
+
+
+        # plot group 2
+        sns.distplot(plotS2['data'], label=plotS2['label'],
+
+                     hist_kws=plotS2['hist_kws'], kde_kws = plotS2['kde_kws'],
+
+                     ax=ax[0])
+
+
+        ax[0].set_xlabel(series1.name, fontdict=fontAxis)
+        ax[0].set_ylabel('Kernel Density Estimation',fontdict=fontAxis)
+
+        ax[0].tick_params(axis='both',labelsize=fontTicks['fontsize'])   
+        ax[0].legend()
+
+
+        ### ------------------  SUBPLOT 2  ------------------ ###
+
+        # Import scipy for error bars
+        from scipy.stats import sem
+
+        # Declare x y group labels(x) and bar heights(y)
+        x = [plotS1['label'], plotS2['label']]
+        y = [np.mean(plotS1['data']), np.mean(plotS2['data'])]
+
+        yerr = [sem(plotS1['data']), sem(plotS2['data'])]
+        err_kws = {'ecolor':'black','capsize':5,'capthick':1,'elinewidth':1}
+
+        # Create the bar plot
+        ax[1].bar(x,y,align='center', edgecolor='black', yerr=yerr,error_kw=err_kws,width=0.6)
+
+
+        # Customize subplot 2
+        ax[1].set_title('Average Quantities Sold',fontdict=fontTitle)
+        ax[1].set_ylabel('Mean +/- SEM ',fontdict=fontAxis)
+        ax[1].set_xlabel('')
+
+        ax[1].tick_params(axis=y,labelsize=fontTicks['fontsize'])
+        ax[1].tick_params(axis=x,labelsize=fontTicks['fontsize']) 
+
+        ax1=ax[1]
+        test = ax1.get_xticklabels()
+        labels = [x.get_text() for x in test]
+        ax1.set_xticklabels([plotS1['label'],plotS2['label']], rotation=45,ha='center')
+
+#         xlab = [x.get_text() for x in xlablist]
+#         ax[1].set_xticklabels(xlab,rotation=45)
+
+#         fig.savefig('H1_EDA_using_gridspec.png')
+#         plt.tight_layout()
+    #     print(f')
+        plt.show()
+
+        return fig,ax
+
+```
+
+
+```python
+
+```
+
+
+
+
+%prep
+%autosetup -n JMI-MVM-0.3.4
+
+%build
+%py3_build
+
+%install
+%py3_install
+install -d -m755 %{buildroot}/%{_pkgdocdir}
+if [ -d doc ]; then cp -arf doc %{buildroot}/%{_pkgdocdir}; fi
+if [ -d docs ]; then cp -arf docs %{buildroot}/%{_pkgdocdir}; fi
+if [ -d example ]; then cp -arf example %{buildroot}/%{_pkgdocdir}; fi
+if [ -d examples ]; then cp -arf examples %{buildroot}/%{_pkgdocdir}; fi
+pushd %{buildroot}
+if [ -d usr/lib ]; then
+	find usr/lib -type f -printf "/%h/%f\n" >> filelist.lst
+fi
+if [ -d usr/lib64 ]; then
+	find usr/lib64 -type f -printf "/%h/%f\n" >> filelist.lst
+fi
+if [ -d usr/bin ]; then
+	find usr/bin -type f -printf "/%h/%f\n" >> filelist.lst
+fi
+if [ -d usr/sbin ]; then
+	find usr/sbin -type f -printf "/%h/%f\n" >> filelist.lst
+fi
+touch doclist.lst
+if [ -d usr/share/man ]; then
+	find usr/share/man -type f -printf "/%h/%f.gz\n" >> doclist.lst
+fi
+popd
+mv %{buildroot}/filelist.lst .
+mv %{buildroot}/doclist.lst .
+
+%files -n python3-JMI-MVM -f filelist.lst
+%dir %{python3_sitelib}/*
+
+%files help -f doclist.lst
+%{_docdir}/*
+
+%changelog
+* Wed May 31 2023 Python_Bot <Python_Bot@openeuler.org> - 0.3.4-1
+- Package Spec generated
diff --git a/sources b/sources
new file mode 100644
index 0000000..9e6f7f1
--- /dev/null
+++ b/sources
@@ -0,0 +1 @@
+add6261816be7d1396c08729e51554ca  JMI_MVM-0.3.4.tar.gz
author	CoprDistGit <infra@openeuler.org>	2023-05-31 03:14:01 +0000
committer	CoprDistGit <infra@openeuler.org>	2023-05-31 03:14:01 +0000
commit	95e14620f51ced7d4d299a68b214d6a01b183f92 (patch)
tree	a6c1c0e23853f88aa12e4b9f73e335c3d3bd9f79
parent	3acf606755cc639eec50667e570f263520529812 (diff)