%global _empty_manifest_terminate_build 0
Name: python-trr265
Version: 0.0.10
Release: 1
Summary: The TRR 265 analysis pipeline.
License:	Apache-2.0
URL: https://github.com/hgzech/trr265/tree/master/
Source0: https://mirrors.aliyun.com/pypi/web/packages/8a/46/2ef06d6533e1645d9360c4c2fb86b4040b238e30808da1fa57d88d820769/trr265-0.0.10.tar.gz
BuildArch: noarch
Requires: python3-pip
Requires: python3-packaging
Requires: python3-bs4
Requires: python3-pandas
Requires: python3-numpy
Requires: python3-pyarrow
Requires: python3-openpyxl
Requires: python3-lxml
Requires: python3-matplotlib
Requires: python3-seaborn
Requires: python3-fastcore
%description
# TRR 265
> This module handles analysis of the TRR265 data.
## Install
`pip install trr265`
`pip install biuR` (optional, but required for most analyses)
## How to use
```python
from pygments.formatters import HtmlFormatter
from pygments import highlight
import IPython
import inspect
from pygments.lexers import PythonLexer

def display_function(the_function):
    # Render a function's source as syntax-highlighted HTML (for notebooks).
    formatter = HtmlFormatter()
    return IPython.display.HTML('<style>{}</style>{}'.format(
        formatter.get_style_defs('.highlight'),
        highlight(inspect.getsource(the_function), PythonLexer(), formatter)))
```
```python
display_function(dp.get_mov_data)
```
```python
@patch
@get_efficiently
def get_mov_data(self: DataProvider):
    """
    This function gets Movisense data:
    1) We create unique participant IDs (e.g. "b001"; this is necessary as sites use overlapping IDs)
    2) We merge double IDs, so participants with two IDs only have one (for this, duplicate_ids.csv has to be updated)
    3) We remove pilot participants
    4) We get starting dates (from the participant overviews in movisense; downloaded as html)
    5) We calculate sampling days and end dates based on the starting dates
    """
    # Loading raw data
    mov_berlin = pd.read_csv(self.mov_berlin_path, sep=';')
    mov_dresden = pd.read_csv(self.mov_dresden_path, sep=';')
    mov_mannheim = pd.read_csv(self.mov_mannheim_path, sep=';')
    # Merging (participant numbers repeat across sites, so we prefix the first letter of the location)
    mov_berlin['location'] = 'berlin'
    mov_dresden['location'] = 'dresden'
    mov_mannheim['location'] = 'mannheim'
    df = pd.concat([mov_berlin, mov_dresden, mov_mannheim])
    df['participant'] = df['location'].str[0] + df.Participant.apply(lambda x: '%03d' % int(x))
    df.drop(columns='Participant', inplace=True)  # Dropping old participant column to avoid mistakes
    df['trigger_date'] = pd.to_datetime(df.Trigger_date + ' ' + df.Trigger_time)
    # Merging double IDs (for participants with several movisense IDs)
    df['participant'] = df.participant.replace(self.get_duplicate_mov_ids())
    # Removing pilot participants
    df = df[~df.participant.astype(str).str.contains('test')]
    df = df[~df.participant.isin(['m157', 'b010', 'b006', 'd001', 'd002', 'd042', 'm024', 'm028', 'm071', 'm079', 'm107'])]
    # Adding starting dates to get sampling days
    def get_starting_dates(path, pp_prefix=''):
        soup = bs(open(path).read())
        ids = [int(x.text) for x in soup.find_all("td", class_='simpleId')]
        c_dates = [x.find_all("span")[0]['title'] for x in soup.find_all("td", class_='coupleDate')]
        s_dates = [x['value'] for x in soup.find_all("input", class_='dp startDate')]
        df = pd.DataFrame({'participant': ids, 'coupling_date': c_dates, 'starting_date': s_dates})
        df['coupling_date'] = pd.to_datetime(df.coupling_date)
        df['starting_date'] = pd.to_datetime(df.starting_date)
        df.starting_date.fillna(df.coupling_date, inplace=True)
        df['participant'] = pp_prefix + df.participant.apply(lambda x: '%03d' % int(x))
        return df
    starting_dates = pd.concat([
        get_starting_dates(self.mov_berlin_starting_dates_path, 'b'),
        get_starting_dates(self.mov_dresden_starting_dates_path, 'd'),
        get_starting_dates(self.mov_mannheim_starting_dates_path, 'm')
    ])
    # For participants with several movisense IDs we use the first coupling date
    starting_dates.participant.replace(self.get_duplicate_mov_ids(), inplace=True)
    starting_dates = starting_dates.groupby('participant')[['starting_date', 'coupling_date']].min().reset_index()
    df = df.merge(starting_dates, on="participant", how='left', indicator=True)
    # Checking if starting dates were downloaded
    if "left_only" in df._merge.unique():
        no_starting_dates = df.query('_merge == "left_only"').participant.unique()
        print("Starting dates missing for participants below. Did you download the participant overviews as html?", no_starting_dates)
    # Calculating movisense sampling day, adding date and end_date
    df['sampling_day'] = (df['trigger_date'] - df['starting_date']).dt.days + 1
    df['date'] = df.trigger_date.dt.date
    df['end_date'] = df.date + pd.DateOffset(days=365)
    df.index.rename('mov_index', inplace=True)
    # Adding redcap IDs
    ids_table = self.get_ba_data()[['participant_id', 'mov_id']].query('mov_id==mov_id').groupby('mov_id').first()
    ids_table.columns = ['redcap_id']
    df = df.merge(ids_table, left_on='participant', right_index=True, how='left')
    # Filtering out participants with no associated redcap data
    no_redcap = df.query("redcap_id.isna()").participant.unique()
    print("Participants: %s have no associated redcap IDs and are excluded from the following analyses." % ', '.join(no_redcap))
    df = df[df.redcap_id.notna()]
    return df
```
```python
#%load_ext autoreload
#%autoreload 2
from trr265.data_provider import DataProvider
dp = DataProvider('/Users/hilmarzech/Projects/trr265/trr265/data/') # Path to data folder (containing raw, interim, external, and processed)
dp.get_two_day_data().iloc[:20][['participant','date','MDBF_zufrieden','g_alc']]
```
| two_day_index | participant | date       | MDBF_zufrieden | g_alc |
|--------------:|-------------|------------|---------------:|------:|
| 0  | b001 | 2020-02-22 | NaN | 6.4  |
| 1  | b001 | 2020-02-23 | NaN | 35.2 |
| 2  | b001 | 2020-02-24 | 2.0 | NaN  |
| 3  | b001 | 2020-02-25 | NaN | NaN  |
| 4  | b001 | 2020-02-26 | NaN | NaN  |
| 5  | b001 | 2020-02-27 | NaN | NaN  |
| 6  | b001 | 2020-02-28 | NaN | NaN  |
| 7  | b001 | 2020-02-29 | NaN | NaN  |
| 8  | b001 | 2020-03-01 | NaN | NaN  |
| 9  | b001 | 2020-03-02 | NaN | NaN  |
| 10 | b001 | 2020-03-03 | NaN | NaN  |
| 11 | b001 | 2020-03-04 | NaN | NaN  |
| 12 | b001 | 2020-03-05 | NaN | 0.0  |
| 13 | b001 | 2020-03-06 | NaN | 57.6 |
| 14 | b001 | 2020-03-07 | 3.0 | NaN  |
| 15 | b001 | 2020-03-08 | NaN | NaN  |
| 16 | b001 | 2020-03-09 | NaN | NaN  |
| 17 | b001 | 2020-03-10 | NaN | NaN  |
| 18 | b001 | 2020-03-11 | NaN | NaN  |
| 19 | b001 | 2020-03-12 | NaN | NaN  |
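In the excerpt above, mood (`MDBF_zufrieden`) and alcohol (`g_alc`) values fall on different days, hence the interleaved NaNs. As a minimal, illustrative sketch of downstream use (it assumes only the `dp` object created above and the columns shown; the aggregation below is not part of trr265):

```python
# Illustrative only: per-participant means over the two-day data.
# NaNs are skipped by pandas' mean() by default.
two_day = dp.get_two_day_data()
summary = two_day.groupby('participant')[['MDBF_zufrieden', 'g_alc']].mean()
print(summary.head())
```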
## Required data
### Phone screening
- data/external/b7_participants.xlsx <- from Hilmar
- data/raw/phonescreening.csv <- from redcap
- data/external/phone_codebook.html <- from redcap
### Basic assessment (from redcap)
- data/raw/ba.csv <- from redcap
- data/external/ba_codebook.html <- from redcap
### Movisens
- all basic assessment data (see above)
- data/raw/mov_data_b.csv
- data/raw/mov_data_d.csv
- data/raw/mov_data_m.csv
- data/raw/starting_dates_b.csv
- data/raw/starting_dates_d.csv
- data/raw/starting_dates_m.csv
- data/external/alcohol_per_drink.csv <- from Hilmar
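Missing inputs otherwise surface as errors deep in the pipeline, so it can help to verify the layout above before building a `DataProvider`. A minimal sketch, assuming the paths listed above relative to the data folder (`check_data_folder` is a hypothetical helper, not part of trr265):

```python
from pathlib import Path

# Hypothetical helper: verify the required input files listed above.
REQUIRED = [
    'external/b7_participants.xlsx',
    'raw/phonescreening.csv',
    'external/phone_codebook.html',
    'raw/ba.csv',
    'external/ba_codebook.html',
    'raw/mov_data_b.csv',
    'raw/mov_data_d.csv',
    'raw/mov_data_m.csv',
    'raw/starting_dates_b.csv',
    'raw/starting_dates_d.csv',
    'raw/starting_dates_m.csv',
    'external/alcohol_per_drink.csv',
]

def check_data_folder(root):
    root = Path(root)
    missing = [p for p in REQUIRED if not (root / p).exists()]
    for p in missing:
        print('Missing:', root / p)
    return not missing

check_data_folder('/Users/hilmarzech/Projects/trr265/trr265/data/')
```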
%package -n python3-trr265
Summary: The TRR 265 analysis pipeline.
Provides: python-trr265
BuildRequires: python3-devel
BuildRequires: python3-setuptools
BuildRequires: python3-pip
%description -n python3-trr265
# TRR 265
> This module handles analysis of the TRR265 data.
## Install
`pip install trr265`
`pip install biuR` (optional, but required for most analyses)
## How to use
```python
from pygments.formatters import HtmlFormatter
from pygments import highlight
import IPython
import inspect
from pygments.lexers import PythonLexer

def display_function(the_function):
    # Render a function's source as syntax-highlighted HTML (for notebooks).
    formatter = HtmlFormatter()
    return IPython.display.HTML('<style>{}</style>{}'.format(
        formatter.get_style_defs('.highlight'),
        highlight(inspect.getsource(the_function), PythonLexer(), formatter)))
```
```python
display_function(dp.get_mov_data)
```
```python
@patch
@get_efficiently
def get_mov_data(self: DataProvider):
    """
    This function gets Movisense data:
    1) We create unique participant IDs (e.g. "b001"; this is necessary as sites use overlapping IDs)
    2) We merge double IDs, so participants with two IDs only have one (for this, duplicate_ids.csv has to be updated)
    3) We remove pilot participants
    4) We get starting dates (from the participant overviews in movisense; downloaded as html)
    5) We calculate sampling days and end dates based on the starting dates
    """
    # Loading raw data
    mov_berlin = pd.read_csv(self.mov_berlin_path, sep=';')
    mov_dresden = pd.read_csv(self.mov_dresden_path, sep=';')
    mov_mannheim = pd.read_csv(self.mov_mannheim_path, sep=';')
    # Merging (participant numbers repeat across sites, so we prefix the first letter of the location)
    mov_berlin['location'] = 'berlin'
    mov_dresden['location'] = 'dresden'
    mov_mannheim['location'] = 'mannheim'
    df = pd.concat([mov_berlin, mov_dresden, mov_mannheim])
    df['participant'] = df['location'].str[0] + df.Participant.apply(lambda x: '%03d' % int(x))
    df.drop(columns='Participant', inplace=True)  # Dropping old participant column to avoid mistakes
    df['trigger_date'] = pd.to_datetime(df.Trigger_date + ' ' + df.Trigger_time)
    # Merging double IDs (for participants with several movisense IDs)
    df['participant'] = df.participant.replace(self.get_duplicate_mov_ids())
    # Removing pilot participants
    df = df[~df.participant.astype(str).str.contains('test')]
    df = df[~df.participant.isin(['m157', 'b010', 'b006', 'd001', 'd002', 'd042', 'm024', 'm028', 'm071', 'm079', 'm107'])]
    # Adding starting dates to get sampling days
    def get_starting_dates(path, pp_prefix=''):
        soup = bs(open(path).read())
        ids = [int(x.text) for x in soup.find_all("td", class_='simpleId')]
        c_dates = [x.find_all("span")[0]['title'] for x in soup.find_all("td", class_='coupleDate')]
        s_dates = [x['value'] for x in soup.find_all("input", class_='dp startDate')]
        df = pd.DataFrame({'participant': ids, 'coupling_date': c_dates, 'starting_date': s_dates})
        df['coupling_date'] = pd.to_datetime(df.coupling_date)
        df['starting_date'] = pd.to_datetime(df.starting_date)
        df.starting_date.fillna(df.coupling_date, inplace=True)
        df['participant'] = pp_prefix + df.participant.apply(lambda x: '%03d' % int(x))
        return df
    starting_dates = pd.concat([
        get_starting_dates(self.mov_berlin_starting_dates_path, 'b'),
        get_starting_dates(self.mov_dresden_starting_dates_path, 'd'),
        get_starting_dates(self.mov_mannheim_starting_dates_path, 'm')
    ])
    # For participants with several movisense IDs we use the first coupling date
    starting_dates.participant.replace(self.get_duplicate_mov_ids(), inplace=True)
    starting_dates = starting_dates.groupby('participant')[['starting_date', 'coupling_date']].min().reset_index()
    df = df.merge(starting_dates, on="participant", how='left', indicator=True)
    # Checking if starting dates were downloaded
    if "left_only" in df._merge.unique():
        no_starting_dates = df.query('_merge == "left_only"').participant.unique()
        print("Starting dates missing for participants below. Did you download the participant overviews as html?", no_starting_dates)
    # Calculating movisense sampling day, adding date and end_date
    df['sampling_day'] = (df['trigger_date'] - df['starting_date']).dt.days + 1
    df['date'] = df.trigger_date.dt.date
    df['end_date'] = df.date + pd.DateOffset(days=365)
    df.index.rename('mov_index', inplace=True)
    # Adding redcap IDs
    ids_table = self.get_ba_data()[['participant_id', 'mov_id']].query('mov_id==mov_id').groupby('mov_id').first()
    ids_table.columns = ['redcap_id']
    df = df.merge(ids_table, left_on='participant', right_index=True, how='left')
    # Filtering out participants with no associated redcap data
    no_redcap = df.query("redcap_id.isna()").participant.unique()
    print("Participants: %s have no associated redcap IDs and are excluded from the following analyses." % ', '.join(no_redcap))
    df = df[df.redcap_id.notna()]
    return df
```
```python
#%load_ext autoreload
#%autoreload 2
from trr265.data_provider import DataProvider
dp = DataProvider('/Users/hilmarzech/Projects/trr265/trr265/data/') # Path to data folder (containing raw, interim, external, and processed)
dp.get_two_day_data().iloc[:20][['participant','date','MDBF_zufrieden','g_alc']]
```
| two_day_index | participant | date       | MDBF_zufrieden | g_alc |
|--------------:|-------------|------------|---------------:|------:|
| 0  | b001 | 2020-02-22 | NaN | 6.4  |
| 1  | b001 | 2020-02-23 | NaN | 35.2 |
| 2  | b001 | 2020-02-24 | 2.0 | NaN  |
| 3  | b001 | 2020-02-25 | NaN | NaN  |
| 4  | b001 | 2020-02-26 | NaN | NaN  |
| 5  | b001 | 2020-02-27 | NaN | NaN  |
| 6  | b001 | 2020-02-28 | NaN | NaN  |
| 7  | b001 | 2020-02-29 | NaN | NaN  |
| 8  | b001 | 2020-03-01 | NaN | NaN  |
| 9  | b001 | 2020-03-02 | NaN | NaN  |
| 10 | b001 | 2020-03-03 | NaN | NaN  |
| 11 | b001 | 2020-03-04 | NaN | NaN  |
| 12 | b001 | 2020-03-05 | NaN | 0.0  |
| 13 | b001 | 2020-03-06 | NaN | 57.6 |
| 14 | b001 | 2020-03-07 | 3.0 | NaN  |
| 15 | b001 | 2020-03-08 | NaN | NaN  |
| 16 | b001 | 2020-03-09 | NaN | NaN  |
| 17 | b001 | 2020-03-10 | NaN | NaN  |
| 18 | b001 | 2020-03-11 | NaN | NaN  |
| 19 | b001 | 2020-03-12 | NaN | NaN  |
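In the excerpt above, mood (`MDBF_zufrieden`) and alcohol (`g_alc`) values fall on different days, hence the interleaved NaNs. As a minimal, illustrative sketch of downstream use (it assumes only the `dp` object created above and the columns shown; the aggregation below is not part of trr265):

```python
# Illustrative only: per-participant means over the two-day data.
# NaNs are skipped by pandas' mean() by default.
two_day = dp.get_two_day_data()
summary = two_day.groupby('participant')[['MDBF_zufrieden', 'g_alc']].mean()
print(summary.head())
```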
## Required data
### Phone screening
- data/external/b7_participants.xlsx <- from Hilmar
- data/raw/phonescreening.csv <- from redcap
- data/external/phone_codebook.html <- from redcap
### Basic assessment (from redcap)
- data/raw/ba.csv <- from redcap
- data/external/ba_codebook.html <- from redcap
### Movisens
- all basic assessment data (see above)
- data/raw/mov_data_b.csv
- data/raw/mov_data_d.csv
- data/raw/mov_data_m.csv
- data/raw/starting_dates_b.csv
- data/raw/starting_dates_d.csv
- data/raw/starting_dates_m.csv
- data/external/alcohol_per_drink.csv <- from Hilmar
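Missing inputs otherwise surface as errors deep in the pipeline, so it can help to verify the layout above before building a `DataProvider`. A minimal sketch, assuming the paths listed above relative to the data folder (`check_data_folder` is a hypothetical helper, not part of trr265):

```python
from pathlib import Path

# Hypothetical helper: verify the required input files listed above.
REQUIRED = [
    'external/b7_participants.xlsx',
    'raw/phonescreening.csv',
    'external/phone_codebook.html',
    'raw/ba.csv',
    'external/ba_codebook.html',
    'raw/mov_data_b.csv',
    'raw/mov_data_d.csv',
    'raw/mov_data_m.csv',
    'raw/starting_dates_b.csv',
    'raw/starting_dates_d.csv',
    'raw/starting_dates_m.csv',
    'external/alcohol_per_drink.csv',
]

def check_data_folder(root):
    root = Path(root)
    missing = [p for p in REQUIRED if not (root / p).exists()]
    for p in missing:
        print('Missing:', root / p)
    return not missing

check_data_folder('/Users/hilmarzech/Projects/trr265/trr265/data/')
```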
%package help
Summary: Development documents and examples for trr265
Provides: python3-trr265-doc
%description help
# TRR 265
> This module handles analysis of the TRR265 data.
## Install
`pip install trr265`
`pip install biuR` (optional, but required for most analyses)
## How to use
```python
from pygments.formatters import HtmlFormatter
from pygments import highlight
import IPython
import inspect
from pygments.lexers import PythonLexer

def display_function(the_function):
    # Render a function's source as syntax-highlighted HTML (for notebooks).
    formatter = HtmlFormatter()
    return IPython.display.HTML('<style>{}</style>{}'.format(
        formatter.get_style_defs('.highlight'),
        highlight(inspect.getsource(the_function), PythonLexer(), formatter)))
```
```python
display_function(dp.get_mov_data)
```
```python
@patch
@get_efficiently
def get_mov_data(self: DataProvider):
    """
    This function gets Movisense data:
    1) We create unique participant IDs (e.g. "b001"; this is necessary as sites use overlapping IDs)
    2) We merge double IDs, so participants with two IDs only have one (for this, duplicate_ids.csv has to be updated)
    3) We remove pilot participants
    4) We get starting dates (from the participant overviews in movisense; downloaded as html)
    5) We calculate sampling days and end dates based on the starting dates
    """
    # Loading raw data
    mov_berlin = pd.read_csv(self.mov_berlin_path, sep=';')
    mov_dresden = pd.read_csv(self.mov_dresden_path, sep=';')
    mov_mannheim = pd.read_csv(self.mov_mannheim_path, sep=';')
    # Merging (participant numbers repeat across sites, so we prefix the first letter of the location)
    mov_berlin['location'] = 'berlin'
    mov_dresden['location'] = 'dresden'
    mov_mannheim['location'] = 'mannheim'
    df = pd.concat([mov_berlin, mov_dresden, mov_mannheim])
    df['participant'] = df['location'].str[0] + df.Participant.apply(lambda x: '%03d' % int(x))
    df.drop(columns='Participant', inplace=True)  # Dropping old participant column to avoid mistakes
    df['trigger_date'] = pd.to_datetime(df.Trigger_date + ' ' + df.Trigger_time)
    # Merging double IDs (for participants with several movisense IDs)
    df['participant'] = df.participant.replace(self.get_duplicate_mov_ids())
    # Removing pilot participants
    df = df[~df.participant.astype(str).str.contains('test')]
    df = df[~df.participant.isin(['m157', 'b010', 'b006', 'd001', 'd002', 'd042', 'm024', 'm028', 'm071', 'm079', 'm107'])]
    # Adding starting dates to get sampling days
    def get_starting_dates(path, pp_prefix=''):
        soup = bs(open(path).read())
        ids = [int(x.text) for x in soup.find_all("td", class_='simpleId')]
        c_dates = [x.find_all("span")[0]['title'] for x in soup.find_all("td", class_='coupleDate')]
        s_dates = [x['value'] for x in soup.find_all("input", class_='dp startDate')]
        df = pd.DataFrame({'participant': ids, 'coupling_date': c_dates, 'starting_date': s_dates})
        df['coupling_date'] = pd.to_datetime(df.coupling_date)
        df['starting_date'] = pd.to_datetime(df.starting_date)
        df.starting_date.fillna(df.coupling_date, inplace=True)
        df['participant'] = pp_prefix + df.participant.apply(lambda x: '%03d' % int(x))
        return df
    starting_dates = pd.concat([
        get_starting_dates(self.mov_berlin_starting_dates_path, 'b'),
        get_starting_dates(self.mov_dresden_starting_dates_path, 'd'),
        get_starting_dates(self.mov_mannheim_starting_dates_path, 'm')
    ])
    # For participants with several movisense IDs we use the first coupling date
    starting_dates.participant.replace(self.get_duplicate_mov_ids(), inplace=True)
    starting_dates = starting_dates.groupby('participant')[['starting_date', 'coupling_date']].min().reset_index()
    df = df.merge(starting_dates, on="participant", how='left', indicator=True)
    # Checking if starting dates were downloaded
    if "left_only" in df._merge.unique():
        no_starting_dates = df.query('_merge == "left_only"').participant.unique()
        print("Starting dates missing for participants below. Did you download the participant overviews as html?", no_starting_dates)
    # Calculating movisense sampling day, adding date and end_date
    df['sampling_day'] = (df['trigger_date'] - df['starting_date']).dt.days + 1
    df['date'] = df.trigger_date.dt.date
    df['end_date'] = df.date + pd.DateOffset(days=365)
    df.index.rename('mov_index', inplace=True)
    # Adding redcap IDs
    ids_table = self.get_ba_data()[['participant_id', 'mov_id']].query('mov_id==mov_id').groupby('mov_id').first()
    ids_table.columns = ['redcap_id']
    df = df.merge(ids_table, left_on='participant', right_index=True, how='left')
    # Filtering out participants with no associated redcap data
    no_redcap = df.query("redcap_id.isna()").participant.unique()
    print("Participants: %s have no associated redcap IDs and are excluded from the following analyses." % ', '.join(no_redcap))
    df = df[df.redcap_id.notna()]
    return df
```
```python
#%load_ext autoreload
#%autoreload 2
from trr265.data_provider import DataProvider
dp = DataProvider('/Users/hilmarzech/Projects/trr265/trr265/data/') # Path to data folder (containing raw, interim, external, and processed)
dp.get_two_day_data().iloc[:20][['participant','date','MDBF_zufrieden','g_alc']]
```
| two_day_index | participant | date       | MDBF_zufrieden | g_alc |
|--------------:|-------------|------------|---------------:|------:|
| 0  | b001 | 2020-02-22 | NaN | 6.4  |
| 1  | b001 | 2020-02-23 | NaN | 35.2 |
| 2  | b001 | 2020-02-24 | 2.0 | NaN  |
| 3  | b001 | 2020-02-25 | NaN | NaN  |
| 4  | b001 | 2020-02-26 | NaN | NaN  |
| 5  | b001 | 2020-02-27 | NaN | NaN  |
| 6  | b001 | 2020-02-28 | NaN | NaN  |
| 7  | b001 | 2020-02-29 | NaN | NaN  |
| 8  | b001 | 2020-03-01 | NaN | NaN  |
| 9  | b001 | 2020-03-02 | NaN | NaN  |
| 10 | b001 | 2020-03-03 | NaN | NaN  |
| 11 | b001 | 2020-03-04 | NaN | NaN  |
| 12 | b001 | 2020-03-05 | NaN | 0.0  |
| 13 | b001 | 2020-03-06 | NaN | 57.6 |
| 14 | b001 | 2020-03-07 | 3.0 | NaN  |
| 15 | b001 | 2020-03-08 | NaN | NaN  |
| 16 | b001 | 2020-03-09 | NaN | NaN  |
| 17 | b001 | 2020-03-10 | NaN | NaN  |
| 18 | b001 | 2020-03-11 | NaN | NaN  |
| 19 | b001 | 2020-03-12 | NaN | NaN  |
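In the excerpt above, mood (`MDBF_zufrieden`) and alcohol (`g_alc`) values fall on different days, hence the interleaved NaNs. As a minimal, illustrative sketch of downstream use (it assumes only the `dp` object created above and the columns shown; the aggregation below is not part of trr265):

```python
# Illustrative only: per-participant means over the two-day data.
# NaNs are skipped by pandas' mean() by default.
two_day = dp.get_two_day_data()
summary = two_day.groupby('participant')[['MDBF_zufrieden', 'g_alc']].mean()
print(summary.head())
```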
## Required data
### Phone screening
- data/external/b7_participants.xlsx <- from Hilmar
- data/raw/phonescreening.csv <- from redcap
- data/external/phone_codebook.html <- from redcap
### Basic assessment (from redcap)
- data/raw/ba.csv <- from redcap
- data/external/ba_codebook.html <- from redcap
### Movisens
- all basic assessment data (see above)
- data/raw/mov_data_b.csv
- data/raw/mov_data_d.csv
- data/raw/mov_data_m.csv
- data/raw/starting_dates_b.csv
- data/raw/starting_dates_d.csv
- data/raw/starting_dates_m.csv
- data/external/alcohol_per_drink.csv <- from Hilmar
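Missing inputs otherwise surface as errors deep in the pipeline, so it can help to verify the layout above before building a `DataProvider`. A minimal sketch, assuming the paths listed above relative to the data folder (`check_data_folder` is a hypothetical helper, not part of trr265):

```python
from pathlib import Path

# Hypothetical helper: verify the required input files listed above.
REQUIRED = [
    'external/b7_participants.xlsx',
    'raw/phonescreening.csv',
    'external/phone_codebook.html',
    'raw/ba.csv',
    'external/ba_codebook.html',
    'raw/mov_data_b.csv',
    'raw/mov_data_d.csv',
    'raw/mov_data_m.csv',
    'raw/starting_dates_b.csv',
    'raw/starting_dates_d.csv',
    'raw/starting_dates_m.csv',
    'external/alcohol_per_drink.csv',
]

def check_data_folder(root):
    root = Path(root)
    missing = [p for p in REQUIRED if not (root / p).exists()]
    for p in missing:
        print('Missing:', root / p)
    return not missing

check_data_folder('/Users/hilmarzech/Projects/trr265/trr265/data/')
```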
%prep
%autosetup -n trr265-0.0.10
%build
%py3_build
%install
%py3_install
install -d -m755 %{buildroot}/%{_pkgdocdir}
if [ -d doc ]; then cp -arf doc %{buildroot}/%{_pkgdocdir}; fi
if [ -d docs ]; then cp -arf docs %{buildroot}/%{_pkgdocdir}; fi
if [ -d example ]; then cp -arf example %{buildroot}/%{_pkgdocdir}; fi
if [ -d examples ]; then cp -arf examples %{buildroot}/%{_pkgdocdir}; fi
pushd %{buildroot}
if [ -d usr/lib ]; then
find usr/lib -type f -printf "\"/%h/%f\"\n" >> filelist.lst
fi
if [ -d usr/lib64 ]; then
find usr/lib64 -type f -printf "\"/%h/%f\"\n" >> filelist.lst
fi
if [ -d usr/bin ]; then
find usr/bin -type f -printf "\"/%h/%f\"\n" >> filelist.lst
fi
if [ -d usr/sbin ]; then
find usr/sbin -type f -printf "\"/%h/%f\"\n" >> filelist.lst
fi
touch doclist.lst
if [ -d usr/share/man ]; then
find usr/share/man -type f -printf "\"/%h/%f.gz\"\n" >> doclist.lst
fi
popd
mv %{buildroot}/filelist.lst .
mv %{buildroot}/doclist.lst .
%files -n python3-trr265 -f filelist.lst
%dir %{python3_sitelib}/*
%files help -f doclist.lst
%{_docdir}/*
%changelog
* Fri Jun 09 2023 Python_Bot - 0.0.10-1
- Package Spec generated