summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.gitignore1
-rw-r--r--python-bioinfo-tools.spec453
-rw-r--r--sources1
3 files changed, 455 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
index e69de29..2a00462 100644
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1 @@
+/bioinfo_tools-0.3.1.tar.gz
diff --git a/python-bioinfo-tools.spec b/python-bioinfo-tools.spec
new file mode 100644
index 0000000..a816d30
--- /dev/null
+++ b/python-bioinfo-tools.spec
@@ -0,0 +1,453 @@
+%global _empty_manifest_terminate_build 0
+Name: python-bioinfo_tools
+Version: 0.3.1
+Release: 1
+Summary: Python library that parses GFF, Fasta files into python classes
+License: BSD
+URL: https://github.com/sebriois/bioinfo_tools
+Source0: https://mirrors.aliyun.com/pypi/web/packages/0b/79/69b6fa350d0f8074c64a53642ddcf6e8453f1c1ed9d06f636bd70efb5686/bioinfo_tools-0.3.1.tar.gz
+BuildArch: noarch
+
+
+%description
+# bioinfo_tools 0.3.1
+
+## Installation
+
+```bash
+pip install bioinfo_tools
+```
+
+## Parsers
+
+*HEADS UP!* These parsers are still under development and usage is not consistent from one parser to another.
+
+### Fasta parser
+
+```python
+from bioinfo_tools.parsers.fasta import FastaParser
+
+fasta_parser = FastaParser()
+
+# by default, sequence IDs are delimited by the first '|' or ':' found
+for seqid, sequence in fasta_parser.read("/path/to/file.fasta"):
+ print(seqid, sequence)
+
+# you may specify a custom separator for your sequence ID (e.g. whitespace):
+for seqid, sequence in fasta_parser.read("/path/to/file.fasta", id_separator=" "):
+ print(seqid, sequence)
+```
+
+### GFF parser
+
+```python
+from bioinfo_tools.parsers.gff import Gff3
+
+gff_parser = Gff3()
+with open("/path/to/file.gff", "r") as fh:
+ for gene in gff_parser.read(fh):
+ print(gene)
+
+import gzip
+with gzip.open("/path/to/file.gz", "rb") as fh:
+ for gene in gff_parser.read(fh):
+ print(gene)
+```
+
+### OBO parser
+
+
+```python
+from bioinfo_tools.parsers.obo import OboParser
+
+obo_parser = OboParser()
+with open("/path/to/file.obo") as fh:
+ go_terms = obo_parser.read(fh)
+
+for go_term in go_terms.values():
+ print(go_term)
+
+ # you may also get the GO term parents via the parser
+ parents = obo_parser.get_parents(go_term)
+```
+
+## Usage Examples
+
+### Extract all intron sequences by parsing GFF and FASTA files
+
+In this example, we focus on a genome assembly. We will first load a GFF file containing gene annotations for this
+assembly, then load a fastA file containing the nucleic sequences of each chromosome in the genome.
+We will then collect all transcript introns and extract their nucleic sequences.
+
+**__DISCLAIMER__**: for this example to work, your GFF file must expose at least the following feature types in column #3:
+ - `gene`
+ - one of `transcript|mRNA|RNA` (or lowercased version)
+
+
+```python
+from bioinfo_tools.genomic_features.chromosome import Chromosome
+from bioinfo_tools.parsers.gff import Gff3
+from bioinfo_tools.parsers.fasta import FastaParser
+
+chromosomes = dict() # {<chromosome_id>: <bioinfo_tools.genomic_features.Chromosome>}
+
+# start with parsing a GFF file
+gff_parser = Gff3()
+with open("/path/to/gene_models.gff", "r") as fh:
+ for gene in gff_parser.read(fh):
+ chromosome = gene['seqid']
+
+ if chromosome not in chromosomes:
+ chromosomes[chromosome] = Chromosome(chromosome) # init a new Chromosome object
+
+ chromosomes[chromosome].add_gene(gene) # add the current gene to our Chromosome object
+
+# load our chromosome sequences in memory
+fasta_parser = FastaParser()
+for chromosome, nucleic_sequence in fasta_parser.read("/path/to/genome_chromosomes.fasta"):
+ if chromosome not in chromosomes:
+ chromosomes[chromosome] = Chromosome(chromosome)
+ # attach parsed chromosome sequence to our Chromosome object
+ chromosomes[chromosome].attach_nucleic_sequence(nucleic_sequence)
+
+# now, collect introns and extract their nucleic sequences
+introns_sequences = dict() # {<intron_id>: <intron_sequence>}
+for chromosome in chromosomes.values():
+ for gene in chromosome.genes:
+ for transcript in gene.transcripts:
+ for idx, intron in enumerate(transcript.introns):
+ intron_id = "%s_intron_%s" % (transcript.transcript_id, idx)
+ intron_seq = intron.extract(chromosome.nucleic_sequence) # that we attached above
+ introns_sequences[intron_id] = intron_seq
+
+# from here, you can do what you want with the intron sequences (eg. write them to a fasta file, etc)
+# ...
+```
+
+__Note:__ when at the transcript level, you can grab its feature types as described in your GFF file by doing so:
+```python
+for feature in transcript._get_features("exon"):
+ print(feature) # I'm an exon
+```
+For convenience and clarity, the following properties are available on transcript objects:
+```python
+print(transcript.introns) # will call transcript._get_features('intron') behind the scenes
+print(transcript.exons) # will call transcript._get_features('exon') behind the scenes
+print(transcript.cds) # will call transcript._get_features('cds') behind the scenes
+print(transcript.polypeptide) # will call transcript._get_features('polypeptide') behind the scenes
+print(transcript.five_prime_utr) # will call transcript._get_features('five_prime_utr') behind the scenes
+print(transcript.three_prime_utr) # will call transcript._get_features('three_prime_utr') behind the scenes
+```
+
+%package -n python3-bioinfo_tools
+Summary: Python library that parses GFF, Fasta files into python classes
+Provides: python-bioinfo_tools
+BuildRequires: python3-devel
+BuildRequires: python3-setuptools
+BuildRequires: python3-pip
+%description -n python3-bioinfo_tools
+# bioinfo_tools 0.3.1
+
+## Installation
+
+```bash
+pip install bioinfo_tools
+```
+
+## Parsers
+
+*HEADS UP!* These parsers are still under development and usage is not consistent from one parser to another.
+
+### Fasta parser
+
+```python
+from bioinfo_tools.parsers.fasta import FastaParser
+
+fasta_parser = FastaParser()
+
+# by default, sequence IDs are delimited by the first '|' or ':' found
+for seqid, sequence in fasta_parser.read("/path/to/file.fasta"):
+ print(seqid, sequence)
+
+# you may specify a custom separator for your sequence ID (e.g. whitespace):
+for seqid, sequence in fasta_parser.read("/path/to/file.fasta", id_separator=" "):
+ print(seqid, sequence)
+```
+
+### GFF parser
+
+```python
+from bioinfo_tools.parsers.gff import Gff3
+
+gff_parser = Gff3()
+with open("/path/to/file.gff", "r") as fh:
+ for gene in gff_parser.read(fh):
+ print(gene)
+
+import gzip
+with gzip.open("/path/to/file.gz", "rb") as fh:
+ for gene in gff_parser.read(fh):
+ print(gene)
+```
+
+### OBO parser
+
+
+```python
+from bioinfo_tools.parsers.obo import OboParser
+
+obo_parser = OboParser()
+with open("/path/to/file.obo") as fh:
+ go_terms = obo_parser.read(fh)
+
+for go_term in go_terms.values():
+ print(go_term)
+
+ # you may also get the GO term parents via the parser
+ parents = obo_parser.get_parents(go_term)
+```
+
+## Usage Examples
+
+### Extract all intron sequences by parsing GFF and FASTA files
+
+In this example, we focus on a genome assembly. We will first load a GFF file containing gene annotations for this
+assembly, then load a fastA file containing the nucleic sequences of each chromosome in the genome.
+We will then collect all transcript introns and extract their nucleic sequences.
+
+**__DISCLAIMER__**: for this example to work, your GFF file must expose at least the following feature types in column #3:
+ - `gene`
+ - one of `transcript|mRNA|RNA` (or lowercased version)
+
+
+```python
+from bioinfo_tools.genomic_features.chromosome import Chromosome
+from bioinfo_tools.parsers.gff import Gff3
+from bioinfo_tools.parsers.fasta import FastaParser
+
+chromosomes = dict() # {<chromosome_id>: <bioinfo_tools.genomic_features.Chromosome>}
+
+# start with parsing a GFF file
+gff_parser = Gff3()
+with open("/path/to/gene_models.gff", "r") as fh:
+ for gene in gff_parser.read(fh):
+ chromosome = gene['seqid']
+
+ if chromosome not in chromosomes:
+ chromosomes[chromosome] = Chromosome(chromosome) # init a new Chromosome object
+
+ chromosomes[chromosome].add_gene(gene) # add the current gene to our Chromosome object
+
+# load our chromosome sequences in memory
+fasta_parser = FastaParser()
+for chromosome, nucleic_sequence in fasta_parser.read("/path/to/genome_chromosomes.fasta"):
+ if chromosome not in chromosomes:
+ chromosomes[chromosome] = Chromosome(chromosome)
+ # attach parsed chromosome sequence to our Chromosome object
+ chromosomes[chromosome].attach_nucleic_sequence(nucleic_sequence)
+
+# now, collect introns and extract their nucleic sequences
+introns_sequences = dict() # {<intron_id>: <intron_sequence>}
+for chromosome in chromosomes.values():
+ for gene in chromosome.genes:
+ for transcript in gene.transcripts:
+ for idx, intron in enumerate(transcript.introns):
+ intron_id = "%s_intron_%s" % (transcript.transcript_id, idx)
+ intron_seq = intron.extract(chromosome.nucleic_sequence) # that we attached above
+ introns_sequences[intron_id] = intron_seq
+
+# from here, you can do what you want with the intron sequences (eg. write them to a fasta file, etc)
+# ...
+```
+
+__Note:__ when at the transcript level, you can grab its feature types as described in your GFF file by doing so:
+```python
+for feature in transcript._get_features("exon"):
+ print(feature) # I'm an exon
+```
+For convenience and clarity, the following properties are available on transcript objects:
+```python
+print(transcript.introns) # will call transcript._get_features('intron') behind the scenes
+print(transcript.exons) # will call transcript._get_features('exon') behind the scenes
+print(transcript.cds) # will call transcript._get_features('cds') behind the scenes
+print(transcript.polypeptide) # will call transcript._get_features('polypeptide') behind the scenes
+print(transcript.five_prime_utr) # will call transcript._get_features('five_prime_utr') behind the scenes
+print(transcript.three_prime_utr) # will call transcript._get_features('three_prime_utr') behind the scenes
+```
+
+%package help
+Summary: Development documents and examples for bioinfo_tools
+Provides: python3-bioinfo_tools-doc
+%description help
+# bioinfo_tools 0.3.1
+
+## Installation
+
+```bash
+pip install bioinfo_tools
+```
+
+## Parsers
+
+*HEADS UP!* These parsers are still under development and usage is not consistent from one parser to another.
+
+### Fasta parser
+
+```python
+from bioinfo_tools.parsers.fasta import FastaParser
+
+fasta_parser = FastaParser()
+
+# by default, sequence IDs are delimited by the first '|' or ':' found
+for seqid, sequence in fasta_parser.read("/path/to/file.fasta"):
+ print(seqid, sequence)
+
+# you may specify a custom separator for your sequence ID (e.g. whitespace):
+for seqid, sequence in fasta_parser.read("/path/to/file.fasta", id_separator=" "):
+ print(seqid, sequence)
+```
+
+### GFF parser
+
+```python
+from bioinfo_tools.parsers.gff import Gff3
+
+gff_parser = Gff3()
+with open("/path/to/file.gff", "r") as fh:
+ for gene in gff_parser.read(fh):
+ print(gene)
+
+import gzip
+with gzip.open("/path/to/file.gz", "rb") as fh:
+ for gene in gff_parser.read(fh):
+ print(gene)
+```
+
+### OBO parser
+
+
+```python
+from bioinfo_tools.parsers.obo import OboParser
+
+obo_parser = OboParser()
+with open("/path/to/file.obo") as fh:
+ go_terms = obo_parser.read(fh)
+
+for go_term in go_terms.values():
+ print(go_term)
+
+ # you may also get the GO term parents via the parser
+ parents = obo_parser.get_parents(go_term)
+```
+
+## Usage Examples
+
+### Extract all intron sequences by parsing GFF and FASTA files
+
+In this example, we focus on a genome assembly. We will first load a GFF file containing gene annotations for this
+assembly, then load a fastA file containing the nucleic sequences of each chromosome in the genome.
+We will then collect all transcript introns and extract their nucleic sequences.
+
+**__DISCLAIMER__**: for this example to work, your GFF file must expose at least the following feature types in column #3:
+ - `gene`
+ - one of `transcript|mRNA|RNA` (or lowercased version)
+
+
+```python
+from bioinfo_tools.genomic_features.chromosome import Chromosome
+from bioinfo_tools.parsers.gff import Gff3
+from bioinfo_tools.parsers.fasta import FastaParser
+
+chromosomes = dict() # {<chromosome_id>: <bioinfo_tools.genomic_features.Chromosome>}
+
+# start with parsing a GFF file
+gff_parser = Gff3()
+with open("/path/to/gene_models.gff", "r") as fh:
+ for gene in gff_parser.read(fh):
+ chromosome = gene['seqid']
+
+ if chromosome not in chromosomes:
+ chromosomes[chromosome] = Chromosome(chromosome) # init a new Chromosome object
+
+ chromosomes[chromosome].add_gene(gene) # add the current gene to our Chromosome object
+
+# load our chromosome sequences in memory
+fasta_parser = FastaParser()
+for chromosome, nucleic_sequence in fasta_parser.read("/path/to/genome_chromosomes.fasta"):
+ if chromosome not in chromosomes:
+ chromosomes[chromosome] = Chromosome(chromosome)
+ # attach parsed chromosome sequence to our Chromosome object
+ chromosomes[chromosome].attach_nucleic_sequence(nucleic_sequence)
+
+# now, collect introns and extract their nucleic sequences
+introns_sequences = dict() # {<intron_id>: <intron_sequence>}
+for chromosome in chromosomes.values():
+ for gene in chromosome.genes:
+ for transcript in gene.transcripts:
+ for idx, intron in enumerate(transcript.introns):
+ intron_id = "%s_intron_%s" % (transcript.transcript_id, idx)
+ intron_seq = intron.extract(chromosome.nucleic_sequence) # that we attached above
+ introns_sequences[intron_id] = intron_seq
+
+# from here, you can do what you want with the intron sequences (eg. write them to a fasta file, etc)
+# ...
+```
+
+__Note:__ when at the transcript level, you can grab its feature types as described in your GFF file by doing so:
+```python
+for feature in transcript._get_features("exon"):
+ print(feature) # I'm an exon
+```
+For convenience and clarity, the following properties are available on transcript objects:
+```python
+print(transcript.introns) # will call transcript._get_features('intron') behind the scenes
+print(transcript.exons) # will call transcript._get_features('exon') behind the scenes
+print(transcript.cds) # will call transcript._get_features('cds') behind the scenes
+print(transcript.polypeptide) # will call transcript._get_features('polypeptide') behind the scenes
+print(transcript.five_prime_utr) # will call transcript._get_features('five_prime_utr') behind the scenes
+print(transcript.three_prime_utr) # will call transcript._get_features('three_prime_utr') behind the scenes
+```
+
+%prep
+%autosetup -n bioinfo_tools-0.3.1
+
+%build
+%py3_build
+
+%install
+%py3_install
+install -d -m755 %{buildroot}/%{_pkgdocdir}
+if [ -d doc ]; then cp -arf doc %{buildroot}/%{_pkgdocdir}; fi
+if [ -d docs ]; then cp -arf docs %{buildroot}/%{_pkgdocdir}; fi
+if [ -d example ]; then cp -arf example %{buildroot}/%{_pkgdocdir}; fi
+if [ -d examples ]; then cp -arf examples %{buildroot}/%{_pkgdocdir}; fi
+pushd %{buildroot}
+if [ -d usr/lib ]; then
+ find usr/lib -type f -printf "\"/%h/%f\"\n" >> filelist.lst
+fi
+if [ -d usr/lib64 ]; then
+ find usr/lib64 -type f -printf "\"/%h/%f\"\n" >> filelist.lst
+fi
+if [ -d usr/bin ]; then
+ find usr/bin -type f -printf "\"/%h/%f\"\n" >> filelist.lst
+fi
+if [ -d usr/sbin ]; then
+ find usr/sbin -type f -printf "\"/%h/%f\"\n" >> filelist.lst
+fi
+touch doclist.lst
+if [ -d usr/share/man ]; then
+ find usr/share/man -type f -printf "\"/%h/%f.gz\"\n" >> doclist.lst
+fi
+popd
+mv %{buildroot}/filelist.lst .
+mv %{buildroot}/doclist.lst .
+
+%files -n python3-bioinfo_tools -f filelist.lst
+%dir %{python3_sitelib}/*
+
+%files help -f doclist.lst
+%{_docdir}/*
+
+%changelog
+* Tue Jun 20 2023 Python_Bot <Python_Bot@openeuler.org> - 0.3.1-1
+- Package Spec generated
diff --git a/sources b/sources
new file mode 100644
index 0000000..2ebebe3
--- /dev/null
+++ b/sources
@@ -0,0 +1 @@
+79029c907c72764db0974acd8d9cf02a bioinfo_tools-0.3.1.tar.gz