diff options
Diffstat (limited to 'python-bgen.spec')
-rw-r--r-- | python-bgen.spec | 369 |
1 files changed, 369 insertions, 0 deletions
diff --git a/python-bgen.spec b/python-bgen.spec new file mode 100644 index 0000000..10cfecd --- /dev/null +++ b/python-bgen.spec @@ -0,0 +1,369 @@ +%global _empty_manifest_terminate_build 0 +Name: python-bgen +Version: 1.5.4 +Release: 1 +Summary: Package for loading data from bgen files +License: MIT +URL: https://github.com/jeremymcrae/bgen +Source0: https://mirrors.nju.edu.cn/pypi/web/packages/af/8c/03f4e9e6372e87eaad219c190acd240f0051135b1a3263d04339eba1b2b4/bgen-1.5.4.tar.gz + +Requires: python3-numpy + +%description +### Another bgen reader + + +This is a package for reading [bgen files](https://www.well.ox.ac.uk/~gav/bgen_format). + +This package uses cython to wrap c++ code for parsing bgen files. It's fairly +quick, it can parse genotypes from 500,000 individuals at ~300 variants per +second within a single python process (~450 million probabilities per second +with a 3GHz CPU). Decompressing the genotype probabilities is the slow step, +zlib decompression takes 80% of the total time, using zstd compressed genotypes +would be much faster, maybe 2-3X faster? + +This has been optimized for UKBiobank bgen files (i.e. bgen version 1.2 with +zlib compressed 8-bit genotype probabilities, but the other bgen versions and +zstd compression have also been tested using example bgen files). 
+ +#### Install +`pip install bgen` + +#### Usage +```python +from bgen import BgenReader + +bfile = BgenReader(BGEN_PATH) +rsids = bfile.rsids() + +# select a variant by indexing +var = bfile[1000] + +# pull out genotype probabilities +probs = var.probabilities # returns 2D numpy array +dosage = var.minor_allele_dosage # returns 1D numpy array for biallelic variant + +# iterate through every variant in the file +with BgenReader(BGEN_PATH, delay_parsing=True) as bfile: + for var in bfile: + dosage = var.minor_allele_dosage + +# get all variants in a genomic region +variants = bfile.fetch('21', 10000, 5000000) + +# or for writing bgen files +import numpy as np +from bgen import BgenWriter + +geno = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]).astype(np.float64) +with BgenWriter(BGEN_PATH, n_samples=3) as bfile: + bfile.add_variant(varid='var1', rsid='rs1', chrom='chr1', pos=1, + alleles=['A', 'G'], genotypes=geno) +``` + +#### API documentation + +``` py +class BgenReader(path, sample_path='', delay_parsing=False) + # opens a bgen file. If a bgenix index exists for the file, the index file + # will be opened automatically for quicker access of specific variants. + Arguments: + path: path to bgen file + sample_path: optional path to sample file. Samples will be given integer IDs + if sample file is not given and sample IDs not found in the bgen file + delay_parsing: True/False option to allow for not loading all variants into + memory when the BgenFile is opened. This can save time when iterating + across variants in the file + + Attributes: + samples: list of sample IDs + header: BgenHeader with info about the bgen version and compression. + + Methods: + slicing: BgenVars can be accessed by slicing the BgenFile e.g. bfile[1000] + iteration: variants in a BgenFile can be looped over e.g. 
for x in bfile: print(x) + fetch(chrom, start=None, stop=None): get all variants within a genomic region + drop_variants(list[int]): drops variants by index from being used in analyses + with_rsid(rsid): returns BgenVar with given rsid + at_position(pos): returns BgenVar with given position + varids(): returns list of varids for variants in the bgen file. + rsids(): returns list of rsids for variants in the bgen file. + chroms(): returns list of chromosomes for variants in the bgen file. + positions(): returns list of positions for variants in the bgen file. + +class BgenVar(handle, offset, layout, compression, n_samples): + # Note: this isn't called directly, but instead returned from BgenFile methods + Attributes: + varid: ID for variant + rsid: reference SNP ID for variant + chrom: chromosome variant is on + pos: nucleotide position variant is at + alleles: list of alleles for variant + is_phased: True/False for whether variant has phased genotype data + ploidy: list of ploidy for each sample. Samples are ordered as per BgenFile.samples + minor_allele: the least common allele (for biallelic variants) + minor_allele_dosage: 1D numpy array of minor allele dosages for each sample + alt_dosage: 1D numpy array of alt allele dosages for each sample + probabilities: 2D numpy array of genotype probabilities, one sample per row + + BgenVars can be pickled e.g. pickle.dumps(var) +``` + + +%package -n python3-bgen +Summary: Package for loading data from bgen files +Provides: python-bgen +BuildRequires: python3-devel +BuildRequires: python3-setuptools +BuildRequires: python3-pip +BuildRequires: python3-cffi +BuildRequires: gcc +BuildRequires: gdb +%description -n python3-bgen +### Another bgen reader + + +This is a package for reading [bgen files](https://www.well.ox.ac.uk/~gav/bgen_format). + +This package uses cython to wrap c++ code for parsing bgen files. 
It's fairly +quick, it can parse genotypes from 500,000 individuals at ~300 variants per +second within a single python process (~450 million probabilities per second +with a 3GHz CPU). Decompressing the genotype probabilities is the slow step, +zlib decompression takes 80% of the total time, using zstd compressed genotypes +would be much faster, maybe 2-3X faster? + +This has been optimized for UKBiobank bgen files (i.e. bgen version 1.2 with +zlib compressed 8-bit genotype probabilities, but the other bgen versions and +zstd compression have also been tested using example bgen files). + +#### Install +`pip install bgen` + +#### Usage +```python +from bgen import BgenReader + +bfile = BgenReader(BGEN_PATH) +rsids = bfile.rsids() + +# select a variant by indexing +var = bfile[1000] + +# pull out genotype probabilities +probs = var.probabilities # returns 2D numpy array +dosage = var.minor_allele_dosage # returns 1D numpy array for biallelic variant + +# iterate through every variant in the file +with BgenReader(BGEN_PATH, delay_parsing=True) as bfile: + for var in bfile: + dosage = var.minor_allele_dosage + +# get all variants in a genomic region +variants = bfile.fetch('21', 10000, 5000000) + +# or for writing bgen files +import numpy as np +from bgen import BgenWriter + +geno = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]).astype(np.float64) +with BgenWriter(BGEN_PATH, n_samples=3) as bfile: + bfile.add_variant(varid='var1', rsid='rs1', chrom='chr1', pos=1, + alleles=['A', 'G'], genotypes=geno) +``` + +#### API documentation + +``` py +class BgenReader(path, sample_path='', delay_parsing=False) + # opens a bgen file. If a bgenix index exists for the file, the index file + # will be opened automatically for quicker access of specific variants. + Arguments: + path: path to bgen file + sample_path: optional path to sample file. 
Samples will be given integer IDs + if sample file is not given and sample IDs not found in the bgen file + delay_parsing: True/False option to allow for not loading all variants into + memory when the BgenFile is opened. This can save time when iterating + across variants in the file + + Attributes: + samples: list of sample IDs + header: BgenHeader with info about the bgen version and compression. + + Methods: + slicing: BgenVars can be accessed by slicing the BgenFile e.g. bfile[1000] + iteration: variants in a BgenFile can be looped over e.g. for x in bfile: print(x) + fetch(chrom, start=None, stop=None): get all variants within a genomic region + drop_variants(list[int]): drops variants by index from being used in analyses + with_rsid(rsid): returns BgenVar with given rsid + at_position(pos): returns BgenVar with given position + varids(): returns list of varids for variants in the bgen file. + rsids(): returns list of rsids for variants in the bgen file. + chroms(): returns list of chromosomes for variants in the bgen file. + positions(): returns list of positions for variants in the bgen file. + +class BgenVar(handle, offset, layout, compression, n_samples): + # Note: this isn't called directly, but instead returned from BgenFile methods + Attributes: + varid: ID for variant + rsid: reference SNP ID for variant + chrom: chromosome variant is on + pos: nucleotide position variant is at + alleles: list of alleles for variant + is_phased: True/False for whether variant has phased genotype data + ploidy: list of ploidy for each sample. Samples are ordered as per BgenFile.samples + minor_allele: the least common allele (for biallelic variants) + minor_allele_dosage: 1D numpy array of minor allele dosages for each sample + alt_dosage: 1D numpy array of alt allele dosages for each sample + probabilities: 2D numpy array of genotype probabilities, one sample per row + + BgenVars can be pickled e.g. 
pickle.dumps(var) +``` + + +%package help +Summary: Development documents and examples for bgen +Provides: python3-bgen-doc +%description help +### Another bgen reader + + +This is a package for reading [bgen files](https://www.well.ox.ac.uk/~gav/bgen_format). + +This package uses cython to wrap c++ code for parsing bgen files. It's fairly +quick, it can parse genotypes from 500,000 individuals at ~300 variants per +second within a single python process (~450 million probabilities per second +with a 3GHz CPU). Decompressing the genotype probabilities is the slow step, +zlib decompression takes 80% of the total time, using zstd compressed genotypes +would be much faster, maybe 2-3X faster? + +This has been optimized for UKBiobank bgen files (i.e. bgen version 1.2 with +zlib compressed 8-bit genotype probabilities, but the other bgen versions and +zstd compression have also been tested using example bgen files). + +#### Install +`pip install bgen` + +#### Usage +```python +from bgen import BgenReader + +bfile = BgenReader(BGEN_PATH) +rsids = bfile.rsids() + +# select a variant by indexing +var = bfile[1000] + +# pull out genotype probabilities +probs = var.probabilities # returns 2D numpy array +dosage = var.minor_allele_dosage # returns 1D numpy array for biallelic variant + +# iterate through every variant in the file +with BgenReader(BGEN_PATH, delay_parsing=True) as bfile: + for var in bfile: + dosage = var.minor_allele_dosage + +# get all variants in a genomic region +variants = bfile.fetch('21', 10000, 5000000) + +# or for writing bgen files +import numpy as np +from bgen import BgenWriter + +geno = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]).astype(np.float64) +with BgenWriter(BGEN_PATH, n_samples=3) as bfile: + bfile.add_variant(varid='var1', rsid='rs1', chrom='chr1', pos=1, + alleles=['A', 'G'], genotypes=geno) +``` + +#### API documentation + +``` py +class BgenReader(path, sample_path='', delay_parsing=False) + # opens a bgen file. 
If a bgenix index exists for the file, the index file + # will be opened automatically for quicker access of specific variants. + Arguments: + path: path to bgen file + sample_path: optional path to sample file. Samples will be given integer IDs + if sample file is not given and sample IDs not found in the bgen file + delay_parsing: True/False option to allow for not loading all variants into + memory when the BgenFile is opened. This can save time when iterating + across variants in the file + + Attributes: + samples: list of sample IDs + header: BgenHeader with info about the bgen version and compression. + + Methods: + slicing: BgenVars can be accessed by slicing the BgenFile e.g. bfile[1000] + iteration: variants in a BgenFile can be looped over e.g. for x in bfile: print(x) + fetch(chrom, start=None, stop=None): get all variants within a genomic region + drop_variants(list[int]): drops variants by index from being used in analyses + with_rsid(rsid): returns BgenVar with given rsid + at_position(pos): returns BgenVar with given position + varids(): returns list of varids for variants in the bgen file. + rsids(): returns list of rsids for variants in the bgen file. + chroms(): returns list of chromosomes for variants in the bgen file. + positions(): returns list of positions for variants in the bgen file. + +class BgenVar(handle, offset, layout, compression, n_samples): + # Note: this isn't called directly, but instead returned from BgenFile methods + Attributes: + varid: ID for variant + rsid: reference SNP ID for variant + chrom: chromosome variant is on + pos: nucleotide position variant is at + alleles: list of alleles for variant + is_phased: True/False for whether variant has phased genotype data + ploidy: list of ploidy for each sample. 
Samples are ordered as per BgenFile.samples
+ minor_allele: the least common allele (for biallelic variants)
+ minor_allele_dosage: 1D numpy array of minor allele dosages for each sample
+ alt_dosage: 1D numpy array of alt allele dosages for each sample
+ probabilities: 2D numpy array of genotype probabilities, one sample per row
+
+ BgenVars can be pickled e.g. pickle.dumps(var)
+```
+
+
+%prep
+# -n names the directory the source tarball is expected to unpack into.
+%autosetup -n bgen-1.5.4
+
+%build
+# Distribution-provided macro; presumably drives the setup.py build step (confirm
+# against the python3 rpm macros of the target distribution).
+%py3_build
+
+%install
+%py3_install
+# Copy whichever of the conventional documentation/example directories the
+# source tree actually contains into the package doc directory.
+install -d -m755 %{buildroot}/%{_pkgdocdir}
+for docdir in doc docs example examples; do
+    if [ -d "$docdir" ]; then cp -arf "$docdir" %{buildroot}/%{_pkgdocdir}; fi
+done
+# Generate the manifests that the two files sections below read with -f.
+pushd %{buildroot}
+# Create both manifests up front so the mv commands and the files sections
+# still work when none of the scanned trees exist.
+touch filelist.lst doclist.lst
+for tree in usr/lib usr/lib64 usr/bin usr/sbin; do
+    if [ -d "$tree" ]; then
+        # Entries are double-quoted so that paths containing whitespace are
+        # read as a single manifest entry.
+        find "$tree" -type f -printf "\"/%h/%f\"\n" >> filelist.lst
+    fi
+done
+if [ -d usr/share/man ]; then
+    # NOTE(review): the .gz suffix presumably anticipates man pages being
+    # compressed after this scriptlet runs; confirm for the target build root.
+    find usr/share/man -type f -printf "\"/%h/%f.gz\"\n" >> doclist.lst
+fi
+popd
+# The manifests must live in the build directory, not inside the buildroot,
+# otherwise they would be picked up as unpackaged files.
+mv %{buildroot}/filelist.lst .
+mv %{buildroot}/doclist.lst .
+
+%files -n python3-bgen -f filelist.lst
+%dir %{python3_sitearch}/*
+
+%files help -f doclist.lst
+%{_docdir}/*
+
+%changelog
+* Wed May 10 2023 Python_Bot <Python_Bot@openeuler.org> - 1.5.4-1
+- Package Spec generated |