From adc8c1b0614f8722308891d918b38b4803b11936 Mon Sep 17 00:00:00 2001 From: CoprDistGit Date: Wed, 17 May 2023 03:59:09 +0000 Subject: automatic import of python-table-ocr --- python-table-ocr.spec | 553 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 553 insertions(+) create mode 100644 python-table-ocr.spec (limited to 'python-table-ocr.spec') diff --git a/python-table-ocr.spec b/python-table-ocr.spec new file mode 100644 index 0000000..af372fe --- /dev/null +++ b/python-table-ocr.spec @@ -0,0 +1,553 @@ +%global _empty_manifest_terminate_build 0 +Name: python-table-ocr +Version: 0.2.5 +Release: 1 +Summary: Extract text from tables in images. +License: MIT License +URL: https://github.com/eihli/image-table-ocr +Source0: https://mirrors.nju.edu.cn/pypi/web/packages/0c/80/6825837bd2f8c4d49a19f77ed71106f8635205719b2df476dcf544c27f26/table_ocr-0.2.5.tar.gz +BuildArch: noarch + +Requires: python3-pytesseract +Requires: python3-opencv-python +Requires: python3-numpy +Requires: python3-requests + +%description +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + TABLE DETECTION IN IMAGES AND OCR TO CSV + + Eric Ihli + ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + + +Table of Contents +───────────────── + +1. Overview +2. Requirements +3. Demo +4. Modules + + + + + +1 Overview +══════════ + + This python package contains modules to help with finding and + extracting tabular data from a PDF or image into a CSV format. 
+ + Given an image that contains a table… + + + + Extract the text into a CSV format… + + ┌──── + │ PRIZE,ODDS 1 IN:,# OF WINNERS* + │ $3,9.09,"282,447" + │ $5,16.66,"154,097" + │ $7,40.01,"64,169" + │ $10,26.67,"96,283" + │ $20,100.00,"25,677" + │ $30,290.83,"8,829" + │ $50,239.66,"10,714" + │ $100,919.66,"2,792" + │ $500,"6,652.07",386 + │ "$40,000","855,899.99",3 + │ 1,i223, + │ Toa,, + │ ,, + │ ,,"* Based upon 2,567,700" + └──── + + +2 Requirements +══════════════ + + Along with the python requirements that are listed in setup.py and + that are automatically installed when installing this package through + pip, there are a few external requirements for some of the modules. + + I haven’t looked into the minimum required versions of these + dependencies, but I’ll list the versions that I’m using. + + • `pdfimages' 20.09.0 of [Poppler] + • `tesseract' 5.0.0 of [Tesseract] + • `mogrify' 7.0.10 of [ImageMagick] + + +[Poppler] + +[Tesseract] + +[ImageMagick] + + +3 Demo +══════ + + There is a demo module that will download an image given a URL and try + to extract tables from the image and process the cells into a CSV. You + can try it out with one of the images included in this repo. + + 1. `pip3 install table_ocr' + 2. `python3 -m table_ocr.demo + https://raw.githubusercontent.com/eihli/image-table-ocr/master/resources/test_data/simple.png' + + That will run against the following image: + + + + The following should be printed to your terminal after running the + above commands. + + ┌──── + │ Running `extract_tables.main([/tmp/demo_p9on6m8o/simple.png]).` + │ Extracted the following tables from the image: + │ [('/tmp/demo_p9on6m8o/simple.png', ['/tmp/demo_p9on6m8o/simple/table-000.png'])] + │ Processing tables for /tmp/demo_p9on6m8o/simple.png. + │ Processing table /tmp/demo_p9on6m8o/simple/table-000.png. 
+ │ Extracted 18 cells from /tmp/demo_p9on6m8o/simple/table-000.png + │ Cells: + │ /tmp/demo_p9on6m8o/simple/cells/000-000.png: Cell + │ /tmp/demo_p9on6m8o/simple/cells/000-001.png: Format + │ /tmp/demo_p9on6m8o/simple/cells/000-002.png: Formula + │ ... + │ + │ Here is the entire CSV output: + │ + │ Cell,Format,Formula + │ B4,Percentage,None + │ C4,General,None + │ D4,Accounting,None + │ E4,Currency,"=PMT(B4/12,C4,D4)" + │ F4,Currency,=E4*C4 + └──── + + +4 Modules +═════════ + + The package is split into modules with narrow focuses. + + • `pdf_to_images' uses Poppler and ImageMagick to extract images from + a PDF. + • `extract_tables' finds and extracts table-looking things from an + image. + • `extract_cells' extracts and orders cells from a table. + • `ocr_image' uses Tesseract to OCR the text from an image of a cell. + • `ocr_to_csv' converts into a CSV the directory structure that + `ocr_image' outputs. + + The outputs of a previous module can be used by a subsequent module so + that they can be chained together to create the entire workflow, as + demonstrated by the following shell script. + + ┌──── + │ #!/bin/sh + │ + │ PDF=$1 + │ + │ python -m table_ocr.pdf_to_images $PDF | grep .png > /tmp/pdf-images.txt + │ cat /tmp/pdf-images.txt | xargs -I{} python -m table_ocr.extract_tables {} | grep table > /tmp/extracted-tables.txt + │ cat /tmp/extracted-tables.txt | xargs -I{} python -m table_ocr.extract_cells {} | grep cells > /tmp/extracted-cells.txt + │ cat /tmp/extracted-cells.txt | xargs -I{} python -m table_ocr.ocr_image {} + │ + │ for image in $(cat /tmp/extracted-tables.txt); do + │ dir=$(dirname $image) + │ python -m table_ocr.ocr_to_csv $(find $dir/cells -name "*.txt") + │ done + └──── + + + The package was written in a [literate programming] style. The source + code at + + is meant to act as the documentation and reference material. + + +[literate programming] + + + + + +%package -n python3-table-ocr +Summary: Extract text from tables in images. 
+Provides: python-table-ocr +BuildRequires: python3-devel +BuildRequires: python3-setuptools +BuildRequires: python3-pip +%description -n python3-table-ocr +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + TABLE DETECTION IN IMAGES AND OCR TO CSV + + Eric Ihli + ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + + +Table of Contents +───────────────── + +1. Overview +2. Requirements +3. Demo +4. Modules + + + + + +1 Overview +══════════ + + This python package contains modules to help with finding and + extracting tabular data from a PDF or image into a CSV format. + + Given an image that contains a table… + + + + Extract the text into a CSV format… + + ┌──── + │ PRIZE,ODDS 1 IN:,# OF WINNERS* + │ $3,9.09,"282,447" + │ $5,16.66,"154,097" + │ $7,40.01,"64,169" + │ $10,26.67,"96,283" + │ $20,100.00,"25,677" + │ $30,290.83,"8,829" + │ $50,239.66,"10,714" + │ $100,919.66,"2,792" + │ $500,"6,652.07",386 + │ "$40,000","855,899.99",3 + │ 1,i223, + │ Toa,, + │ ,, + │ ,,"* Based upon 2,567,700" + └──── + + +2 Requirements +══════════════ + + Along with the python requirements that are listed in setup.py and + that are automatically installed when installing this package through + pip, there are a few external requirements for some of the modules. + + I haven’t looked into the minimum required versions of these + dependencies, but I’ll list the versions that I’m using. + + • `pdfimages' 20.09.0 of [Poppler] + • `tesseract' 5.0.0 of [Tesseract] + • `mogrify' 7.0.10 of [ImageMagick] + + +[Poppler] + +[Tesseract] + +[ImageMagick] + + +3 Demo +══════ + + There is a demo module that will download an image given a URL and try + to extract tables from the image and process the cells into a CSV. You + can try it out with one of the images included in this repo. + + 1. `pip3 install table_ocr' + 2. 
`python3 -m table_ocr.demo + https://raw.githubusercontent.com/eihli/image-table-ocr/master/resources/test_data/simple.png' + + That will run against the following image: + + + + The following should be printed to your terminal after running the + above commands. + + ┌──── + │ Running `extract_tables.main([/tmp/demo_p9on6m8o/simple.png]).` + │ Extracted the following tables from the image: + │ [('/tmp/demo_p9on6m8o/simple.png', ['/tmp/demo_p9on6m8o/simple/table-000.png'])] + │ Processing tables for /tmp/demo_p9on6m8o/simple.png. + │ Processing table /tmp/demo_p9on6m8o/simple/table-000.png. + │ Extracted 18 cells from /tmp/demo_p9on6m8o/simple/table-000.png + │ Cells: + │ /tmp/demo_p9on6m8o/simple/cells/000-000.png: Cell + │ /tmp/demo_p9on6m8o/simple/cells/000-001.png: Format + │ /tmp/demo_p9on6m8o/simple/cells/000-002.png: Formula + │ ... + │ + │ Here is the entire CSV output: + │ + │ Cell,Format,Formula + │ B4,Percentage,None + │ C4,General,None + │ D4,Accounting,None + │ E4,Currency,"=PMT(B4/12,C4,D4)" + │ F4,Currency,=E4*C4 + └──── + + +4 Modules +═════════ + + The package is split into modules with narrow focuses. + + • `pdf_to_images' uses Poppler and ImageMagick to extract images from + a PDF. + • `extract_tables' finds and extracts table-looking things from an + image. + • `extract_cells' extracts and orders cells from a table. + • `ocr_image' uses Tesseract to OCR the text from an image of a cell. + • `ocr_to_csv' converts into a CSV the directory structure that + `ocr_image' outputs. + + The outputs of a previous module can be used by a subsequent module so + that they can be chained together to create the entire workflow, as + demonstrated by the following shell script. 
+ + ┌──── + │ #!/bin/sh + │ + │ PDF=$1 + │ + │ python -m table_ocr.pdf_to_images $PDF | grep .png > /tmp/pdf-images.txt + │ cat /tmp/pdf-images.txt | xargs -I{} python -m table_ocr.extract_tables {} | grep table > /tmp/extracted-tables.txt + │ cat /tmp/extracted-tables.txt | xargs -I{} python -m table_ocr.extract_cells {} | grep cells > /tmp/extracted-cells.txt + │ cat /tmp/extracted-cells.txt | xargs -I{} python -m table_ocr.ocr_image {} + │ + │ for image in $(cat /tmp/extracted-tables.txt); do + │ dir=$(dirname $image) + │ python -m table_ocr.ocr_to_csv $(find $dir/cells -name "*.txt") + │ done + └──── + + + The package was written in a [literate programming] style. The source + code at + + is meant to act as the documentation and reference material. + + +[literate programming] + + + + + +%package help +Summary: Development documents and examples for table-ocr +Provides: python3-table-ocr-doc +%description help +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + TABLE DETECTION IN IMAGES AND OCR TO CSV + + Eric Ihli + ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + + +Table of Contents +───────────────── + +1. Overview +2. Requirements +3. Demo +4. Modules + + + + + +1 Overview +══════════ + + This python package contains modules to help with finding and + extracting tabular data from a PDF or image into a CSV format. 
+ + Given an image that contains a table… + + + + Extract the text into a CSV format… + + ┌──── + │ PRIZE,ODDS 1 IN:,# OF WINNERS* + │ $3,9.09,"282,447" + │ $5,16.66,"154,097" + │ $7,40.01,"64,169" + │ $10,26.67,"96,283" + │ $20,100.00,"25,677" + │ $30,290.83,"8,829" + │ $50,239.66,"10,714" + │ $100,919.66,"2,792" + │ $500,"6,652.07",386 + │ "$40,000","855,899.99",3 + │ 1,i223, + │ Toa,, + │ ,, + │ ,,"* Based upon 2,567,700" + └──── + + +2 Requirements +══════════════ + + Along with the python requirements that are listed in setup.py and + that are automatically installed when installing this package through + pip, there are a few external requirements for some of the modules. + + I haven’t looked into the minimum required versions of these + dependencies, but I’ll list the versions that I’m using. + + • `pdfimages' 20.09.0 of [Poppler] + • `tesseract' 5.0.0 of [Tesseract] + • `mogrify' 7.0.10 of [ImageMagick] + + +[Poppler] + +[Tesseract] + +[ImageMagick] + + +3 Demo +══════ + + There is a demo module that will download an image given a URL and try + to extract tables from the image and process the cells into a CSV. You + can try it out with one of the images included in this repo. + + 1. `pip3 install table_ocr' + 2. `python3 -m table_ocr.demo + https://raw.githubusercontent.com/eihli/image-table-ocr/master/resources/test_data/simple.png' + + That will run against the following image: + + + + The following should be printed to your terminal after running the + above commands. + + ┌──── + │ Running `extract_tables.main([/tmp/demo_p9on6m8o/simple.png]).` + │ Extracted the following tables from the image: + │ [('/tmp/demo_p9on6m8o/simple.png', ['/tmp/demo_p9on6m8o/simple/table-000.png'])] + │ Processing tables for /tmp/demo_p9on6m8o/simple.png. + │ Processing table /tmp/demo_p9on6m8o/simple/table-000.png. 
+ │ Extracted 18 cells from /tmp/demo_p9on6m8o/simple/table-000.png + │ Cells: + │ /tmp/demo_p9on6m8o/simple/cells/000-000.png: Cell + │ /tmp/demo_p9on6m8o/simple/cells/000-001.png: Format + │ /tmp/demo_p9on6m8o/simple/cells/000-002.png: Formula + │ ... + │ + │ Here is the entire CSV output: + │ + │ Cell,Format,Formula + │ B4,Percentage,None + │ C4,General,None + │ D4,Accounting,None + │ E4,Currency,"=PMT(B4/12,C4,D4)" + │ F4,Currency,=E4*C4 + └──── + + +4 Modules +═════════ + + The package is split into modules with narrow focuses. + + • `pdf_to_images' uses Poppler and ImageMagick to extract images from + a PDF. + • `extract_tables' finds and extracts table-looking things from an + image. + • `extract_cells' extracts and orders cells from a table. + • `ocr_image' uses Tesseract to OCR the text from an image of a cell. + • `ocr_to_csv' converts into a CSV the directory structure that + `ocr_image' outputs. + + The outputs of a previous module can be used by a subsequent module so + that they can be chained together to create the entire workflow, as + demonstrated by the following shell script. + + ┌──── + │ #!/bin/sh + │ + │ PDF=$1 + │ + │ python -m table_ocr.pdf_to_images $PDF | grep .png > /tmp/pdf-images.txt + │ cat /tmp/pdf-images.txt | xargs -I{} python -m table_ocr.extract_tables {} | grep table > /tmp/extracted-tables.txt + │ cat /tmp/extracted-tables.txt | xargs -I{} python -m table_ocr.extract_cells {} | grep cells > /tmp/extracted-cells.txt + │ cat /tmp/extracted-cells.txt | xargs -I{} python -m table_ocr.ocr_image {} + │ + │ for image in $(cat /tmp/extracted-tables.txt); do + │ dir=$(dirname $image) + │ python -m table_ocr.ocr_to_csv $(find $dir/cells -name "*.txt") + │ done + └──── + + + The package was written in a [literate programming] style. The source + code at + + is meant to act as the documentation and reference material. 
+
+
+[literate programming]
+
+
+
+
+
+%prep
+# Source0 is the sdist table_ocr-0.2.5.tar.gz; an sdist unpacks into a
+# directory named after the archive, i.e. with an underscore, not a hyphen.
+%autosetup -n table_ocr-0.2.5
+
+%build
+%py3_build
+
+%install
+%py3_install
+# Copy whichever upstream documentation/example directories exist into the
+# package documentation directory.
+install -d -m755 %{buildroot}/%{_pkgdocdir}
+if [ -d doc ]; then cp -arf doc %{buildroot}/%{_pkgdocdir}; fi
+if [ -d docs ]; then cp -arf docs %{buildroot}/%{_pkgdocdir}; fi
+if [ -d example ]; then cp -arf example %{buildroot}/%{_pkgdocdir}; fi
+if [ -d examples ]; then cp -arf examples %{buildroot}/%{_pkgdocdir}; fi
+# Generate the file manifests from what was actually installed. Every path is
+# written inside double quotes so that a name containing whitespace is still
+# read back as a single manifest entry.
+pushd %{buildroot}
+# Create the manifest up front so the mv below succeeds even when none of the
+# probed directories exist (doclist.lst is handled the same way).
+touch filelist.lst
+if [ -d usr/lib ]; then
+	find usr/lib -type f -printf "\"/%h/%f\"\n" >> filelist.lst
+fi
+if [ -d usr/lib64 ]; then
+	find usr/lib64 -type f -printf "\"/%h/%f\"\n" >> filelist.lst
+fi
+if [ -d usr/bin ]; then
+	find usr/bin -type f -printf "\"/%h/%f\"\n" >> filelist.lst
+fi
+if [ -d usr/sbin ]; then
+	find usr/sbin -type f -printf "\"/%h/%f\"\n" >> filelist.lst
+fi
+touch doclist.lst
+# Man pages are listed with a .gz suffix, presumably because the build
+# compresses them after this section runs; confirm against the build policy.
+if [ -d usr/share/man ]; then
+	find usr/share/man -type f -printf "\"/%h/%f.gz\"\n" >> doclist.lst
+fi
+popd
+# The manifests are referenced relative to the build directory, so move them
+# out of the buildroot (which also keeps them out of the package payload).
+mv %{buildroot}/filelist.lst .
+mv %{buildroot}/doclist.lst .
+
+%files -n python3-table-ocr -f filelist.lst
+%dir %{python3_sitelib}/*
+
+%files help -f doclist.lst
+%{_docdir}/*
+
+%changelog
+* Wed May 17 2023 Python_Bot - 0.2.5-1
+- Package Spec generated
-- 
cgit v1.2.3