%global _empty_manifest_terminate_build 0
Name:		python-trr265
Version:	0.0.10
Release:	1
Summary:	The TRR 265 analysis pipeline.
License:	Apache Software License 2.0
URL:		https://github.com/hgzech/trr265/tree/master/
Source0:	https://mirrors.aliyun.com/pypi/web/packages/8a/46/2ef06d6533e1645d9360c4c2fb86b4040b238e30808da1fa57d88d820769/trr265-0.0.10.tar.gz
BuildArch:	noarch

Requires:	python3-pip
Requires:	python3-packaging
Requires:	python3-bs4
Requires:	python3-pandas
Requires:	python3-numpy
Requires:	python3-pyarrow
Requires:	python3-openpyxl
Requires:	python3-lxml
Requires:	python3-matplotlib
Requires:	python3-seaborn
Requires:	python3-fastcore

%description
# TRR 265
> This module handles analysis of the TRR265 data.

## Install

`pip install trr265`

`pip install biuR` (optional but needed for most analyses)

## How to use

```python
from pygments.formatters import HtmlFormatter
from pygments import highlight
import IPython
import inspect
from pygments.lexers import PythonLexer

def display_function(the_function):
    formatter = HtmlFormatter()
    return IPython.display.HTML('<style type="text/css">{}</style>{}'.format(
        formatter.get_style_defs('.highlight'),
        highlight(inspect.getsource(the_function), PythonLexer(), formatter)))
```

```python
display_function(dp.get_mov_data)
```
@patch
@get_efficiently
def get_mov_data(self:DataProvider):
    """
    This function gets Movisense data
    1) We create unique participant IDs (e.g. "b001"; this is necessary as sites use overlapping IDs)
    2) We merge double IDs, so participants with two IDs only have one (for this, duplicate_ids.csv has to be kept up to date)
    3) We remove pilot participants
    4) We get starting dates (from the participant overviews in movisense; downloaded as html)
    5) We calculate sampling days and end dates based on the starting dates
    """
    # Loading raw data
    mov_berlin = pd.read_csv(self.mov_berlin_path, sep = ';')
    mov_dresden = pd.read_csv(self.mov_dresden_path, sep = ';')
    mov_mannheim = pd.read_csv(self.mov_mannheim_path, sep = ';')

    # Merging (participant numbers repeat so we add the first letter of location)
    mov_berlin['location'] = 'berlin'
    mov_dresden['location'] = 'dresden'
    mov_mannheim['location'] = 'mannheim'
    df = pd.concat([mov_berlin,mov_dresden,mov_mannheim])
    df['participant'] =  df['location'].str[0] + df.Participant.apply(lambda x: '%03d'%int(x))
    df.drop(columns = 'Participant', inplace = True) # Dropping old participant column to avoid mistakes
    df['trigger_date'] = pd.to_datetime(df.Trigger_date + ' ' + df.Trigger_time)

    # Merging double IDs (for participants with several movisense IDs)
    df['participant'] = df.participant.replace(self.get_duplicate_mov_ids())

    # Removing pilot participants
    df = df[~df.participant.astype(str).str.contains('test')]
    df = df[~df.participant.isin(['m157', 'b010', 'b006', 'd001', 'd002', 'd042', 'm024', 'm028', 'm071', 'm079', 'm107'])]


    # Adding starting dates to get sampling days
    def get_starting_dates(path, pp_prefix = ''):
        soup = bs(open(path).read(), 'lxml')  # explicit parser; lxml is a declared dependency
        ids = [int(x.text) for x in soup.find_all("td", class_ = 'simpleId')]
        c_dates = [x.find_all("span")[0]['title'] for x in soup.find_all("td", class_ = 'coupleDate')]
        s_dates = [x['value'] for x in soup.find_all("input", class_ = 'dp startDate')]
        df = pd.DataFrame({'participant':ids,'coupling_date':c_dates,'starting_date':s_dates})
        df['coupling_date'] = pd.to_datetime(df.coupling_date)
        df['starting_date'] = pd.to_datetime(df.starting_date)
        df.starting_date.fillna(df.coupling_date,inplace = True)
        df['participant'] = pp_prefix + df.participant.apply(lambda x: '%03d'%int(x))
        return df

    starting_dates = pd.concat([
    get_starting_dates(self.mov_berlin_starting_dates_path, 'b'),
    get_starting_dates(self.mov_dresden_starting_dates_path, 'd'),
    get_starting_dates(self.mov_mannheim_starting_dates_path, 'm')
    ])
    # For participants with several movisense IDs we use the first coupling date
    starting_dates.participant.replace(self.get_duplicate_mov_ids(), inplace = True)
    starting_dates = starting_dates.groupby('participant')[['starting_date','coupling_date']].min().reset_index()
    df = df.merge(starting_dates, on="participant", how = 'left', indicator = True)
    # Checking if starting dates were downloaded
    if "left_only" in df._merge.unique():
        no_starting_dates = df.query('_merge == "left_only"').participant.unique()
        print("Starting dates missing for participants below.  Did you download the participant overviews as html?", no_starting_dates)
    # Calculating movisense sampling day, adding date and end_date
    df['sampling_day'] = (df['trigger_date'] - df['starting_date']).dt.days + 1
    df['date'] = df.trigger_date.dt.date
    df['end_date'] = df.date + pd.DateOffset(days = 365)
    df.index.rename('mov_index',inplace = True)
    # Adding redcap IDs
    ids_table = self.get_ba_data()[['participant_id','mov_id']].query('mov_id==mov_id').groupby('mov_id').first() # mov_id==mov_id keeps only rows with a mov_id (NaN != NaN)
    ids_table.columns = ['redcap_id']
    df = df.merge(ids_table, left_on='participant', right_index = True, how = 'left')
    # Filtering out participants with no associated redcap data
    no_redcap = df.query("redcap_id.isna()").participant.unique()
    print("Participants: %s have no associated redcap IDs and are excluded from the following analyses."%', '.join(no_redcap))
    df = df[df.redcap_id.isna()==False]
    return df
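The scraper in `get_starting_dates` above assumes each participant row of the movisense overview page provides a `td.simpleId` cell, a `td.coupleDate` cell whose first `<span>` carries the date in its `title` attribute, and a `dp startDate` input. Below is a minimal, self-contained sketch of that parse; the markup is a hypothetical stand-in for what movisense exports, not a verbatim copy:

```python
from bs4 import BeautifulSoup as bs
import pandas as pd

# Hypothetical markup mimicking the structure get_starting_dates expects;
# a real participant overview contains one such row per participant.
html = """
<table><tr>
  <td class="simpleId">1</td>
  <td class="coupleDate"><span title="2020-02-20"></span></td>
  <td><input class="dp startDate" value="2020-02-22"/></td>
</tr></table>
"""
soup = bs(html, 'html.parser')
ids = [int(x.text) for x in soup.find_all("td", class_="simpleId")]
c_dates = [x.find_all("span")[0]['title'] for x in soup.find_all("td", class_="coupleDate")]
s_dates = [x['value'] for x in soup.find_all("input", class_="dp startDate")]
print(pd.DataFrame({'participant': ids, 'coupling_date': c_dates, 'starting_date': s_dates}))
#    participant coupling_date starting_date
# 0            1    2020-02-20    2020-02-22
```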
```python
#%load_ext autoreload
#%autoreload 2
from trr265.data_provider import DataProvider
dp = DataProvider('/Users/hilmarzech/Projects/trr265/trr265/data/') # Path to data folder (containing raw, interim, external, and processed)
dp.get_two_day_data().iloc[:20][['participant','date','MDBF_zufrieden','g_alc']]
```
participant date MDBF_zufrieden g_alc
two_day_index
0 b001 2020-02-22 NaN 6.4
1 b001 2020-02-23 NaN 35.2
2 b001 2020-02-24 2.0 NaN
3 b001 2020-02-25 NaN NaN
4 b001 2020-02-26 NaN NaN
5 b001 2020-02-27 NaN NaN
6 b001 2020-02-28 NaN NaN
7 b001 2020-02-29 NaN NaN
8 b001 2020-03-01 NaN NaN
9 b001 2020-03-02 NaN NaN
10 b001 2020-03-03 NaN NaN
11 b001 2020-03-04 NaN NaN
12 b001 2020-03-05 NaN 0.0
13 b001 2020-03-06 NaN 57.6
14 b001 2020-03-07 3.0 NaN
15 b001 2020-03-08 NaN NaN
16 b001 2020-03-09 NaN NaN
17 b001 2020-03-10 NaN NaN
18 b001 2020-03-11 NaN NaN
19 b001 2020-03-12 NaN NaN
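The frame returned by `get_two_day_data` is a plain pandas `DataFrame`, so standard aggregation applies. As a hypothetical follow-up (not part of the package API), one could summarise mean reported grams of alcohol per participant, with pandas skipping the NaN days visible above:

```python
# Assumes `dp` was constructed as in the example above.
two_day = dp.get_two_day_data()
# groupby().mean() ignores NaN entries by default, so only sampled days count.
mean_alc = two_day.groupby('participant')['g_alc'].mean().rename('mean_g_alc')
print(mean_alc.head())
```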
## Required data

### Phone screening
- data/external/b7_participants.xlsx <- from Hilmar
- data/raw/phonescreening.csv <- from redcap
- data/external/phone_codebook.html <- from redcap

### Basic assessment (from redcap)
- data/raw/ba.csv <- from redcap
- data/external/ba_codebook.html <- from redcap

### Movisens
- all basic assessment data (see above)
- data/raw/mov_data_b.csv
- data/raw/mov_data_d.csv
- data/raw/mov_data_m.csv
- data/raw/starting_dates_b.csv
- data/raw/starting_dates_d.csv
- data/raw/starting_dates_m.csv
- data/external/alcohol_per_drink.csv <- from Hilmar

%package -n python3-trr265
Summary:	The TRR 265 analysis pipeline.
Provides:	python-trr265
BuildRequires:	python3-devel
BuildRequires:	python3-setuptools
BuildRequires:	python3-pip

%description -n python3-trr265
# TRR 265
> This module handles analysis of the TRR265 data.

## Install

`pip install trr265`

`pip install biuR` (optional but needed for most analyses)

## How to use

```python
from pygments.formatters import HtmlFormatter
from pygments import highlight
import IPython
import inspect
from pygments.lexers import PythonLexer

def display_function(the_function):
    formatter = HtmlFormatter()
    return IPython.display.HTML('<style type="text/css">{}</style>{}'.format(
        formatter.get_style_defs('.highlight'),
        highlight(inspect.getsource(the_function), PythonLexer(), formatter)))
```

```python
display_function(dp.get_mov_data)
```
@patch
@get_efficiently
def get_mov_data(self:DataProvider):
    """
    This function gets Movisense data
    1) We create unique participant IDs (e.g. "b001"; this is necessary as sites use overlapping IDs)
    2) We merge double IDs, so participants with two IDs only have one (for this, duplicate_ids.csv has to be kept up to date)
    3) We remove pilot participants
    4) We get starting dates (from the participant overviews in movisense; downloaded as html)
    5) We calculate sampling days and end dates based on the starting dates
    """
    # Loading raw data
    mov_berlin = pd.read_csv(self.mov_berlin_path, sep = ';')
    mov_dresden = pd.read_csv(self.mov_dresden_path, sep = ';')
    mov_mannheim = pd.read_csv(self.mov_mannheim_path, sep = ';')

    # Merging (participant numbers repeat so we add the first letter of location)
    mov_berlin['location'] = 'berlin'
    mov_dresden['location'] = 'dresden'
    mov_mannheim['location'] = 'mannheim'
    df = pd.concat([mov_berlin,mov_dresden,mov_mannheim])
    df['participant'] =  df['location'].str[0] + df.Participant.apply(lambda x: '%03d'%int(x))
    df.drop(columns = 'Participant', inplace = True) # Dropping old participant column to avoid mistakes
    df['trigger_date'] = pd.to_datetime(df.Trigger_date + ' ' + df.Trigger_time)

    # Merging double IDs (for participants with several movisense IDs)
    df['participant'] = df.participant.replace(self.get_duplicate_mov_ids())

    # Removing pilot participants
    df = df[~df.participant.astype(str).str.contains('test')]
    df = df[~df.participant.isin(['m157', 'b010', 'b006', 'd001', 'd002', 'd042', 'm024', 'm028', 'm071', 'm079', 'm107'])]


    # Adding starting dates to get sampling days
    def get_starting_dates(path, pp_prefix = ''):
        soup = bs(open(path).read(), 'lxml')  # explicit parser; lxml is a declared dependency
        ids = [int(x.text) for x in soup.find_all("td", class_ = 'simpleId')]
        c_dates = [x.find_all("span")[0]['title'] for x in soup.find_all("td", class_ = 'coupleDate')]
        s_dates = [x['value'] for x in soup.find_all("input", class_ = 'dp startDate')]
        df = pd.DataFrame({'participant':ids,'coupling_date':c_dates,'starting_date':s_dates})
        df['coupling_date'] = pd.to_datetime(df.coupling_date)
        df['starting_date'] = pd.to_datetime(df.starting_date)
        df.starting_date.fillna(df.coupling_date,inplace = True)
        df['participant'] = pp_prefix + df.participant.apply(lambda x: '%03d'%int(x))
        return df

    starting_dates = pd.concat([
    get_starting_dates(self.mov_berlin_starting_dates_path, 'b'),
    get_starting_dates(self.mov_dresden_starting_dates_path, 'd'),
    get_starting_dates(self.mov_mannheim_starting_dates_path, 'm')
    ])
    # For participants with several movisense IDs we use the first coupling date
    starting_dates.participant.replace(self.get_duplicate_mov_ids(), inplace = True)
    starting_dates = starting_dates.groupby('participant')[['starting_date','coupling_date']].min().reset_index()
    df = df.merge(starting_dates, on="participant", how = 'left', indicator = True)
    # Checking if starting dates were downloaded
    if "left_only" in df._merge.unique():
        no_starting_dates = df.query('_merge == "left_only"').participant.unique()
        print("Starting dates missing for participants below.  Did you download the participant overviews as html?", no_starting_dates)
    # Calculating movisense sampling day, adding date and end_date
    df['sampling_day'] = (df['trigger_date'] - df['starting_date']).dt.days + 1
    df['date'] = df.trigger_date.dt.date
    df['end_date'] = df.date + pd.DateOffset(days = 365)
    df.index.rename('mov_index',inplace = True)
    # Adding redcap IDs
    ids_table = self.get_ba_data()[['participant_id','mov_id']].query('mov_id==mov_id').groupby('mov_id').first() # mov_id==mov_id keeps only rows with a mov_id (NaN != NaN)
    ids_table.columns = ['redcap_id']
    df = df.merge(ids_table, left_on='participant', right_index = True, how = 'left')
    # Filtering out participants with no associated redcap data
    no_redcap = df.query("redcap_id.isna()").participant.unique()
    print("Participants: %s have no associated redcap IDs and are excluded from the following analyses."%', '.join(no_redcap))
    df = df[df.redcap_id.isna()==False]
    return df
```python
#%load_ext autoreload
#%autoreload 2
from trr265.data_provider import DataProvider
dp = DataProvider('/Users/hilmarzech/Projects/trr265/trr265/data/') # Path to data folder (containing raw, interim, external, and processed)
dp.get_two_day_data().iloc[:20][['participant','date','MDBF_zufrieden','g_alc']]
```
participant date MDBF_zufrieden g_alc
two_day_index
0 b001 2020-02-22 NaN 6.4
1 b001 2020-02-23 NaN 35.2
2 b001 2020-02-24 2.0 NaN
3 b001 2020-02-25 NaN NaN
4 b001 2020-02-26 NaN NaN
5 b001 2020-02-27 NaN NaN
6 b001 2020-02-28 NaN NaN
7 b001 2020-02-29 NaN NaN
8 b001 2020-03-01 NaN NaN
9 b001 2020-03-02 NaN NaN
10 b001 2020-03-03 NaN NaN
11 b001 2020-03-04 NaN NaN
12 b001 2020-03-05 NaN 0.0
13 b001 2020-03-06 NaN 57.6
14 b001 2020-03-07 3.0 NaN
15 b001 2020-03-08 NaN NaN
16 b001 2020-03-09 NaN NaN
17 b001 2020-03-10 NaN NaN
18 b001 2020-03-11 NaN NaN
19 b001 2020-03-12 NaN NaN
## Required data

### Phone screening
- data/external/b7_participants.xlsx <- from Hilmar
- data/raw/phonescreening.csv <- from redcap
- data/external/phone_codebook.html <- from redcap

### Basic assessment (from redcap)
- data/raw/ba.csv <- from redcap
- data/external/ba_codebook.html <- from redcap

### Movisens
- all basic assessment data (see above)
- data/raw/mov_data_b.csv
- data/raw/mov_data_d.csv
- data/raw/mov_data_m.csv
- data/raw/starting_dates_b.csv
- data/raw/starting_dates_d.csv
- data/raw/starting_dates_m.csv
- data/external/alcohol_per_drink.csv <- from Hilmar

%package help
Summary:	Development documents and examples for trr265
Provides:	python3-trr265-doc

%description help
# TRR 265
> This module handles analysis of the TRR265 data.

## Install

`pip install trr265`

`pip install biuR` (optional but needed for most analyses)

## How to use

```python
from pygments.formatters import HtmlFormatter
from pygments import highlight
import IPython
import inspect
from pygments.lexers import PythonLexer

def display_function(the_function):
    formatter = HtmlFormatter()
    return IPython.display.HTML('<style type="text/css">{}</style>{}'.format(
        formatter.get_style_defs('.highlight'),
        highlight(inspect.getsource(the_function), PythonLexer(), formatter)))
```

```python
display_function(dp.get_mov_data)
```
@patch
@get_efficiently
def get_mov_data(self:DataProvider):
    """
    This function gets Movisense data
    1) We create unique participant IDs (e.g. "b001"; this is necessary as sites use overlapping IDs)
    2) We merge double IDs, so participants with two IDs only have one (for this, duplicate_ids.csv has to be kept up to date)
    3) We remove pilot participants
    4) We get starting dates (from the participant overviews in movisense; downloaded as html)
    5) We calculate sampling days and end dates based on the starting dates
    """
    # Loading raw data
    mov_berlin = pd.read_csv(self.mov_berlin_path, sep = ';')
    mov_dresden = pd.read_csv(self.mov_dresden_path, sep = ';')
    mov_mannheim = pd.read_csv(self.mov_mannheim_path, sep = ';')

    # Merging (participant numbers repeat so we add the first letter of location)
    mov_berlin['location'] = 'berlin'
    mov_dresden['location'] = 'dresden'
    mov_mannheim['location'] = 'mannheim'
    df = pd.concat([mov_berlin,mov_dresden,mov_mannheim])
    df['participant'] =  df['location'].str[0] + df.Participant.apply(lambda x: '%03d'%int(x))
    df.drop(columns = 'Participant', inplace = True) # Dropping old participant column to avoid mistakes
    df['trigger_date'] = pd.to_datetime(df.Trigger_date + ' ' + df.Trigger_time)

    # Merging double IDs (for participants with several movisense IDs)
    df['participant'] = df.participant.replace(self.get_duplicate_mov_ids())

    # Removing pilot participants
    df = df[~df.participant.astype(str).str.contains('test')]
    df = df[~df.participant.isin(['m157', 'b010', 'b006', 'd001', 'd002', 'd042', 'm024', 'm028', 'm071', 'm079', 'm107'])]


    # Adding starting dates to get sampling days
    def get_starting_dates(path, pp_prefix = ''):
        soup = bs(open(path).read(), 'lxml')  # explicit parser; lxml is a declared dependency
        ids = [int(x.text) for x in soup.find_all("td", class_ = 'simpleId')]
        c_dates = [x.find_all("span")[0]['title'] for x in soup.find_all("td", class_ = 'coupleDate')]
        s_dates = [x['value'] for x in soup.find_all("input", class_ = 'dp startDate')]
        df = pd.DataFrame({'participant':ids,'coupling_date':c_dates,'starting_date':s_dates})
        df['coupling_date'] = pd.to_datetime(df.coupling_date)
        df['starting_date'] = pd.to_datetime(df.starting_date)
        df.starting_date.fillna(df.coupling_date,inplace = True)
        df['participant'] = pp_prefix + df.participant.apply(lambda x: '%03d'%int(x))
        return df

    starting_dates = pd.concat([
    get_starting_dates(self.mov_berlin_starting_dates_path, 'b'),
    get_starting_dates(self.mov_dresden_starting_dates_path, 'd'),
    get_starting_dates(self.mov_mannheim_starting_dates_path, 'm')
    ])
    # For participants with several movisense IDs we use the first coupling date
    starting_dates.participant.replace(self.get_duplicate_mov_ids(), inplace = True)
    starting_dates = starting_dates.groupby('participant')[['starting_date','coupling_date']].min().reset_index()
    df = df.merge(starting_dates, on="participant", how = 'left', indicator = True)
    # Checking if starting dates were downloaded
    if "left_only" in df._merge.unique():
        no_starting_dates = df.query('_merge == "left_only"').participant.unique()
        print("Starting dates missing for participants below.  Did you download the participant overviews as html?", no_starting_dates)
    # Calculating movisense sampling day, adding date and end_date
    df['sampling_day'] = (df['trigger_date'] - df['starting_date']).dt.days + 1
    df['date'] = df.trigger_date.dt.date
    df['end_date'] = df.date + pd.DateOffset(days = 365)
    df.index.rename('mov_index',inplace = True)
    # Adding redcap IDs
    ids_table = self.get_ba_data()[['participant_id','mov_id']].query('mov_id==mov_id').groupby('mov_id').first() # mov_id==mov_id keeps only rows with a mov_id (NaN != NaN)
    ids_table.columns = ['redcap_id']
    df = df.merge(ids_table, left_on='participant', right_index = True, how = 'left')
    # Filtering out participants with no associated redcap data
    no_redcap = df.query("redcap_id.isna()").participant.unique()
    print("Participants: %s have no associated redcap IDs and are excluded from the following analyses."%', '.join(no_redcap))
    df = df[df.redcap_id.isna()==False]
    return df
```python
#%load_ext autoreload
#%autoreload 2
from trr265.data_provider import DataProvider
dp = DataProvider('/Users/hilmarzech/Projects/trr265/trr265/data/') # Path to data folder (containing raw, interim, external, and processed)
dp.get_two_day_data().iloc[:20][['participant','date','MDBF_zufrieden','g_alc']]
```
participant date MDBF_zufrieden g_alc
two_day_index
0 b001 2020-02-22 NaN 6.4
1 b001 2020-02-23 NaN 35.2
2 b001 2020-02-24 2.0 NaN
3 b001 2020-02-25 NaN NaN
4 b001 2020-02-26 NaN NaN
5 b001 2020-02-27 NaN NaN
6 b001 2020-02-28 NaN NaN
7 b001 2020-02-29 NaN NaN
8 b001 2020-03-01 NaN NaN
9 b001 2020-03-02 NaN NaN
10 b001 2020-03-03 NaN NaN
11 b001 2020-03-04 NaN NaN
12 b001 2020-03-05 NaN 0.0
13 b001 2020-03-06 NaN 57.6
14 b001 2020-03-07 3.0 NaN
15 b001 2020-03-08 NaN NaN
16 b001 2020-03-09 NaN NaN
17 b001 2020-03-10 NaN NaN
18 b001 2020-03-11 NaN NaN
19 b001 2020-03-12 NaN NaN
## Required data

### Phone screening
- data/external/b7_participants.xlsx <- from Hilmar
- data/raw/phonescreening.csv <- from redcap
- data/external/phone_codebook.html <- from redcap

### Basic assessment (from redcap)
- data/raw/ba.csv <- from redcap
- data/external/ba_codebook.html <- from redcap

### Movisens
- all basic assessment data (see above)
- data/raw/mov_data_b.csv
- data/raw/mov_data_d.csv
- data/raw/mov_data_m.csv
- data/raw/starting_dates_b.csv
- data/raw/starting_dates_d.csv
- data/raw/starting_dates_m.csv
- data/external/alcohol_per_drink.csv <- from Hilmar

%prep
%autosetup -n trr265-0.0.10

%build
%py3_build

%install
%py3_install
install -d -m755 %{buildroot}/%{_pkgdocdir}
if [ -d doc ]; then cp -arf doc %{buildroot}/%{_pkgdocdir}; fi
if [ -d docs ]; then cp -arf docs %{buildroot}/%{_pkgdocdir}; fi
if [ -d example ]; then cp -arf example %{buildroot}/%{_pkgdocdir}; fi
if [ -d examples ]; then cp -arf examples %{buildroot}/%{_pkgdocdir}; fi
pushd %{buildroot}
if [ -d usr/lib ]; then
	find usr/lib -type f -printf "\"/%h/%f\"\n" >> filelist.lst
fi
if [ -d usr/lib64 ]; then
	find usr/lib64 -type f -printf "\"/%h/%f\"\n" >> filelist.lst
fi
if [ -d usr/bin ]; then
	find usr/bin -type f -printf "\"/%h/%f\"\n" >> filelist.lst
fi
if [ -d usr/sbin ]; then
	find usr/sbin -type f -printf "\"/%h/%f\"\n" >> filelist.lst
fi
touch doclist.lst
if [ -d usr/share/man ]; then
	find usr/share/man -type f -printf "\"/%h/%f.gz\"\n" >> doclist.lst
fi
popd
mv %{buildroot}/filelist.lst .
mv %{buildroot}/doclist.lst .

%files -n python3-trr265 -f filelist.lst
%dir %{python3_sitelib}/*

%files help -f doclist.lst
%{_docdir}/*

%changelog
* Fri Jun 09 2023 Python_Bot <Python_Bot@openeuler.org> - 0.0.10-1
- Package Spec generated