author    CoprDistGit <infra@openeuler.org>  2023-05-10 04:10:29 +0000
committer CoprDistGit <infra@openeuler.org>  2023-05-10 04:10:29 +0000
commit    51608f9b8a241734c9b2a070b0f1c067ea194bc6 (patch)
tree      d0b90fafd67577b11879c2b29204263720208175
parent    71a93c4ada6b0ab27561fcff4440e734be1cd08f (diff)
automatic import of python-clarku-youtube-crawler (openeuler20.03)
-rw-r--r--  .gitignore                            1
-rw-r--r--  python-clarku-youtube-crawler.spec  575
-rw-r--r--  sources                               1
3 files changed, 577 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
index e69de29..9e36b60 100644
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1 @@
+/clarku_youtube_crawler-2.1.3.tar.gz
diff --git a/python-clarku-youtube-crawler.spec b/python-clarku-youtube-crawler.spec
new file mode 100644
index 0000000..d9ef6c8
--- /dev/null
+++ b/python-clarku-youtube-crawler.spec
@@ -0,0 +1,575 @@
+%global _empty_manifest_terminate_build 0
+Name: python-clarku-youtube-crawler
+Version: 2.1.3
+Release: 1
+Summary:	Clark University package for crawling YouTube and cleaning data
+License: MIT License
+URL: https://github.com/ClarkUniversity-NiuLab/clarku-youtube-crawler
+Source0: https://mirrors.nju.edu.cn/pypi/web/packages/b9/62/9017def4482727c467e8502ac3c0b70d3dded5bd32e1d619a63366df0d8c/clarku_youtube_crawler-2.1.3.tar.gz
+BuildArch: noarch
+
+Requires: python3-configparser
+Requires: python3-datetime
+Requires: python3-pytz
+Requires: python3-pandas
+Requires: python3-isodate
+Requires: python3-xlrd
+Requires: python3-youtube-transcript-api
+Requires: python3-google-api-python-client
+
+%description
+# clarku-youtube-crawler
+
+Clark University YouTube crawler and JSON decoder for YouTube JSON. Please read the documentation in ``DOCS``.
+
+PyPI page: https://pypi.org/project/clarku-youtube-crawler/
+
+## Installing
+To install:
+
+``pip install clarku-youtube-crawler``
+
+The crawler needs multiple other packages to function.
+If any requirements are missing (all dependencies are already included, so this shouldn't happen), download <a href="https://github.com/ClarkUniversity-NiuLab/clarku-youtube-crawler/blob/master/requirements.txt">``requirements.txt``</a>.
+Navigate to the folder that contains requirements.txt and run
+
+``pip install -r requirements.txt``
+
+
+## Upgrading
+To upgrade:
+
+``pip install clarku-youtube-crawler --upgrade``
+
+Go to the project folder and delete config.ini if it is already there.
+
+## YouTube API Key
+- Go to https://cloud.google.com/, open the console, and create a project. Under Credentials, copy the API key.
+- In your project folder, create a "DEVELOPER_KEY.txt" file (the file name must be exactly this) and paste your API key into it.
+- You can use multiple API keys by putting them on separate lines in DEVELOPER_KEY.txt.
+- The crawler will use up the quota of one key and then try the next, until every key's quota is exhausted (see the sketch below).
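+
+For illustration, a minimal sketch of the key-rotation behaviour described above, using a hypothetical helper ``search_with_rotation``. This is not part of the clarku_youtube_crawler API; it only assumes the DEVELOPER_KEY.txt file and the google-api-python-client package the crawler already depends on:
+
+```
+from pathlib import Path
+
+from googleapiclient.discovery import build
+from googleapiclient.errors import HttpError
+
+# One API key per line, exactly as the crawler expects.
+keys = Path("DEVELOPER_KEY.txt").read_text().split()
+
+def search_with_rotation(query):
+    """Illustrative only: try each key in turn, moving on when its quota is exhausted."""
+    for key in keys:
+        youtube = build("youtube", "v3", developerKey=key)
+        try:
+            return youtube.search().list(q=query, part="id", maxResults=5).execute()
+        except HttpError as err:
+            if err.resp.status == 403:  # quota likely exceeded, try the next key
+                continue
+            raise
+    raise RuntimeError("All API keys in DEVELOPER_KEY.txt have used up their quota")
+```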
+
+
+
+## Example usage
+Case 1: crawl videos by keywords.
+```
+import clarku_youtube_crawler as cu
+
+# Crawl all JSONs
+crawler = cu.RawCrawler()
+crawler.build("low visibility")
+crawler.crawl("low visibility", start_date=14, start_month=12, start_year=2020, day_count=5)
+crawler.crawl("blind", start_date=14, start_month=12, start_year=2020, day_count=5)
+crawler.merge_to_workfile()
+crawler.crawl_videos_in_list(comment_page_count=1)
+crawler.merge_all(save_to='low visibility/all_videos.json')
+
+# Convert JSON to CSV
+decoder = cu.JSONDecoder()
+decoder.json_to_csv(data_file='low visibility/all_videos.json')
+
+# Crawl subtitles from CSV
+# If you don't need subtitles, delete the following lines
+subtitleCrawler = cu.SubtitleCrawler()
+subtitleCrawler.build("low visibility")
+subtitleCrawler.crawl_csv(
+ videos_to_collect="low visibility/videos_to_collect.csv",
+ video_id="videoId",
+ sub_title_dir="low visibility/subtitles/"
+)
+
+```
+
+Case 2: crawl videos from a list of ids given in the videoId column of an input CSV.
+```
+import clarku_youtube_crawler as cu
+
+crawler = cu.RawCrawler()
+work_dir = "blind"
+crawler.build(work_dir)
+
+# Update videos_to_collect.csv to your CSV file. Specify the video id column via the video_id argument.
+# Video ids must be ":" + the YouTube video id, e.g. ":wl4m1Rqmq-Y" (see the snippet after this example).
+
+crawler.crawl_videos_in_list(video_list_workfile="videos_to_collect.csv",
+ comment_page_count=1,
+ search_key="blind",
+ video_id="videoId"
+ )
+crawler.merge_all(save_to='all_raw_data.json')
+decoder = cu.JSONDecoder()
+decoder.json_to_csv(data_file='all_raw_data.json')
+
+# Crawl subtitles from CSV
+# If you don't need subtitles, delete the following lines
+subtitleCrawler = cu.SubtitleCrawler()
+subtitleCrawler.build(work_dir)
+subtitleCrawler.crawl_csv(
+ videos_to_collect="videos_to_collect.csv",
+ video_id="videoId",
+    sub_title_dir="YouTube_CSV/subtitles/"
+)
+
+```
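+
+Since the ":" prefix is easy to miss, here is a small illustrative pandas snippet (not part of the package) showing one way to prepare such a CSV; the "videoId" column name matches the video_id argument used above:
+
+```
+import pandas as pd
+
+# Raw YouTube video ids, e.g. copied from video URLs.
+raw_ids = ["wl4m1Rqmq-Y"]
+
+# The crawler expects each id to be prefixed with ":".
+pd.DataFrame({"videoId": [":" + vid for vid in raw_ids]}).to_csv(
+    "videos_to_collect.csv", index=False
+)
+```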
+
+Case 3: Search a list of channels by search keys, then crawl all videos belonging to those channels.
+```
+import clarku_youtube_crawler as cu
+
+chCrawler = cu.ChannelCrawler()
+work_dir = "low visibility"
+chCrawler.build(work_dir)
+# You can search for channels with different keys; all results will be merged.
+chCrawler.search_channel("low visibility")
+chCrawler.search_channel("blind")
+chCrawler.merge_to_workfile()
+chCrawler.crawl()
+
+# Crawl videos posted by the selected channels. The channels_to_collect.csv file records which search keys found each channel.
+crawler = cu.RawCrawler()
+crawler.build(work_dir)
+crawler.merge_to_workfile(file_dir=work_dir + "/video_search_list/")
+crawler.crawl_videos_in_list(comment_page_count=1)
+crawler.merge_all()
+
+# Convert JSON to CSV
+decoder = cu.JSONDecoder()
+decoder.json_to_csv(data_file=work_dir + '/all_videos_visibility.json')
+
+# Crawl subtitles from CSV
+# If you don't need subtitles, delete the following lines
+subtitleCrawler = cu.SubtitleCrawler()
+subtitleCrawler.build(work_dir)
+subtitleCrawler.crawl_csv(
+ videos_to_collect=work_dir+"/videos_to_collect.csv",
+ video_id="videoId",
+ sub_title_dir=work_dir+"/subtitles/"
+)
+```
+
+Case 4: you already have a list of channels and want to crawl all videos of those channels:
+```
+import clarku_youtube_crawler as cu
+
+work_dir = 'disability'
+chCrawler = cu.ChannelCrawler()
+chCrawler.build(work_dir)
+
+chCrawler.crawl(filename='mturk_test.csv', channel_header="Input.channelId")
+
+# Crawl videos posted by selected channels
+crawler = cu.RawCrawler()
+crawler.build(work_dir)
+crawler.merge_to_workfile(file_dir=work_dir + "/video_search_list/")
+crawler.crawl_videos_in_list(comment_page_count=10) # 100 comments per page; 10 pages will crawl 1,000 comments
+
+crawler.merge_all()
+#
+# Convert JSON to CSV
+decoder = cu.JSONDecoder()
+decoder.json_to_csv(data_file=work_dir + '/all_videos.json')
+
+# Crawl subtitles from CSV
+subtitleCrawler = cu.SubtitleCrawler()
+subtitleCrawler.build(work_dir)
+subtitleCrawler.crawl_csv(
+ videos_to_collect=work_dir + "/videos_to_collect.csv",
+ video_id="videoId",
+ sub_title_dir=work_dir + "/subtitles/"
+)
+```
+
+
+
+
+
+
+%package -n python3-clarku-youtube-crawler
+Summary:	Clark University package for crawling YouTube and cleaning data
+Provides: python-clarku-youtube-crawler
+BuildRequires: python3-devel
+BuildRequires: python3-setuptools
+BuildRequires: python3-pip
+%description -n python3-clarku-youtube-crawler
+# clarku-youtube-crawler
+
+Clark University YouTube crawler and JSON decoder for YouTube JSON. Please read the documentation in ``DOCS``.
+
+PyPI page: https://pypi.org/project/clarku-youtube-crawler/
+
+## Installing
+To install:
+
+``pip install clarku-youtube-crawler``
+
+The crawler needs multiple other packages to function.
+If any requirements are missing (all dependencies are already included, so this shouldn't happen), download <a href="https://github.com/ClarkUniversity-NiuLab/clarku-youtube-crawler/blob/master/requirements.txt">``requirements.txt``</a>.
+Navigate to the folder that contains requirements.txt and run
+
+``pip install -r requirements.txt``
+
+
+## Upgrading
+To upgrade:
+
+``pip install clarku-youtube-crawler --upgrade``
+
+Go to the project folder and delete config.ini if it is already there.
+
+## YouTube API Key
+- Go to https://cloud.google.com/, open the console, and create a project. Under Credentials, copy the API key.
+- In your project folder, create a "DEVELOPER_KEY.txt" file (the file name must be exactly this) and paste your API key into it.
+- You can use multiple API keys by putting them on separate lines in DEVELOPER_KEY.txt.
+- The crawler will use up the quota of one key and then try the next, until every key's quota is exhausted.
+
+
+
+## Example usage
+Case 1: crawl videos by keywords.
+```
+import clarku_youtube_crawler as cu
+
+# Crawl all JSONs
+crawler = cu.RawCrawler()
+crawler.build("low visibility")
+crawler.crawl("low visibility", start_date=14, start_month=12, start_year=2020, day_count=5)
+crawler.crawl("blind", start_date=14, start_month=12, start_year=2020, day_count=5)
+crawler.merge_to_workfile()
+crawler.crawl_videos_in_list(comment_page_count=1)
+crawler.merge_all(save_to='low visibility/all_videos.json')
+
+# Convert JSON to CSV
+decoder = cu.JSONDecoder()
+decoder.json_to_csv(data_file='low visibility/all_videos.json')
+
+# Crawl subtitles from CSV
+# If you don't need subtitles, delete the following lines
+subtitleCrawler = cu.SubtitleCrawler()
+subtitleCrawler.build("low visibility")
+subtitleCrawler.crawl_csv(
+ videos_to_collect="low visibility/videos_to_collect.csv",
+ video_id="videoId",
+ sub_title_dir="low visibility/subtitles/"
+)
+
+```
+
+Case 2: crawl videos from a list of ids given in the videoId column of an input CSV.
+```
+import clarku_youtube_crawler as cu
+
+crawler = cu.RawCrawler()
+work_dir = "blind"
+crawler.build(work_dir)
+
+# Update videos_to_collect.csv to your CSV file. Specify the video id column via the video_id argument.
+# Video ids must be ":" + the YouTube video id, e.g. ":wl4m1Rqmq-Y".
+
+crawler.crawl_videos_in_list(video_list_workfile="videos_to_collect.csv",
+ comment_page_count=1,
+ search_key="blind",
+ video_id="videoId"
+ )
+crawler.merge_all(save_to='all_raw_data.json')
+decoder = cu.JSONDecoder()
+decoder.json_to_csv(data_file='all_raw_data.json')
+
+# Crawl subtitles from CSV
+# If you don't need subtitles, delete the following lines
+subtitleCrawler = cu.SubtitleCrawler()
+subtitleCrawler.build(work_dir)
+subtitleCrawler.crawl_csv(
+ videos_to_collect="videos_to_collect.csv",
+ video_id="videoId",
+    sub_title_dir="YouTube_CSV/subtitles/"
+)
+
+```
+
+Case 3: Search a list of channels by search keys, then crawl all videos belonging to those channels.
+```
+import clarku_youtube_crawler as cu
+
+chCrawler = cu.ChannelCrawler()
+work_dir = "low visibility"
+chCrawler.build(work_dir)
+# You can search for channels with different keys; all results will be merged.
+chCrawler.search_channel("low visibility")
+chCrawler.search_channel("blind")
+chCrawler.merge_to_workfile()
+chCrawler.crawl()
+
+# Crawl videos posted by the selected channels. The channels_to_collect.csv file records which search keys found each channel.
+crawler = cu.RawCrawler()
+crawler.build(work_dir)
+crawler.merge_to_workfile(file_dir=work_dir + "/video_search_list/")
+crawler.crawl_videos_in_list(comment_page_count=1)
+crawler.merge_all()
+
+# Convert JSON to CSV
+decoder = cu.JSONDecoder()
+decoder.json_to_csv(data_file=work_dir + '/all_videos_visibility.json')
+
+# Crawl subtitles from CSV
+# If you don't need subtitles, delete the following lines
+subtitleCrawler = cu.SubtitleCrawler()
+subtitleCrawler.build(work_dir)
+subtitleCrawler.crawl_csv(
+ videos_to_collect=work_dir+"/videos_to_collect.csv",
+ video_id="videoId",
+ sub_title_dir=work_dir+"/subtitles/"
+)
+```
+
+Case 4: you already have a list of channels and want to crawl all videos of those channels:
+```
+import clarku_youtube_crawler as cu
+
+work_dir = 'disability'
+chCrawler = cu.ChannelCrawler()
+chCrawler.build(work_dir)
+
+chCrawler.crawl(filename='mturk_test.csv', channel_header="Input.channelId")
+
+# Crawl videos posted by selected channels
+crawler = cu.RawCrawler()
+crawler.build(work_dir)
+crawler.merge_to_workfile(file_dir=work_dir + "/video_search_list/")
+crawler.crawl_videos_in_list(comment_page_count=10) # 100 comments per page; 10 pages will crawl 1,000 comments
+
+crawler.merge_all()
+#
+# Convert JSON to CSV
+decoder = cu.JSONDecoder()
+decoder.json_to_csv(data_file=work_dir + '/all_videos.json')
+
+# Crawl subtitles from CSV
+subtitleCrawler = cu.SubtitleCrawler()
+subtitleCrawler.build(work_dir)
+subtitleCrawler.crawl_csv(
+ videos_to_collect=work_dir + "/videos_to_collect.csv",
+ video_id="videoId",
+ sub_title_dir=work_dir + "/subtitles/"
+)
+```
+
+
+
+
+
+
+%package help
+Summary: Development documents and examples for clarku-youtube-crawler
+Provides: python3-clarku-youtube-crawler-doc
+%description help
+# clarku-youtube-crawler
+
+Clark University YouTube crawler and JSON decoder for YouTube JSON. Please read the documentation in ``DOCS``.
+
+PyPI page: https://pypi.org/project/clarku-youtube-crawler/
+
+## Installing
+To install:
+
+``pip install clarku-youtube-crawler``
+
+The crawler needs multiple other packages to function.
+If any requirements are missing (all dependencies are already included, so this shouldn't happen), download <a href="https://github.com/ClarkUniversity-NiuLab/clarku-youtube-crawler/blob/master/requirements.txt">``requirements.txt``</a>.
+Navigate to the folder that contains requirements.txt and run
+
+``pip install -r requirements.txt``
+
+
+## Upgrading
+To upgrade:
+
+``pip install clarku-youtube-crawler --upgrade``
+
+Go to the project folder and delete config.ini if it is already there.
+
+## YouTube API Key
+- Go to https://cloud.google.com/, open the console, and create a project. Under Credentials, copy the API key.
+- In your project folder, create a "DEVELOPER_KEY.txt" file (the file name must be exactly this) and paste your API key into it.
+- You can use multiple API keys by putting them on separate lines in DEVELOPER_KEY.txt.
+- The crawler will use up the quota of one key and then try the next, until every key's quota is exhausted.
+
+
+
+## Example usage
+Case 1: crawl videos by keywords.
+```
+import clarku_youtube_crawler as cu
+
+# Crawl all JSONs
+crawler = cu.RawCrawler()
+crawler.build("low visibility")
+crawler.crawl("low visibility", start_date=14, start_month=12, start_year=2020, day_count=5)
+crawler.crawl("blind", start_date=14, start_month=12, start_year=2020, day_count=5)
+crawler.merge_to_workfile()
+crawler.crawl_videos_in_list(comment_page_count=1)
+crawler.merge_all(save_to='low visibility/all_videos.json')
+
+# Convert JSON to CSV
+decoder = cu.JSONDecoder()
+decoder.json_to_csv(data_file='low visibility/all_videos.json')
+
+# Crawl subtitles from CSV
+# If you don't need subtitles, delete the following lines
+subtitleCrawler = cu.SubtitleCrawler()
+subtitleCrawler.build("low visibility")
+subtitleCrawler.crawl_csv(
+ videos_to_collect="low visibility/videos_to_collect.csv",
+ video_id="videoId",
+ sub_title_dir="low visibility/subtitles/"
+)
+
+```
+
+Case 2: crawl videos from a list of ids given in the videoId column of an input CSV.
+```
+import clarku_youtube_crawler as cu
+
+crawler = cu.RawCrawler()
+work_dir = "blind"
+crawler.build(work_dir)
+
+# Update videos_to_collect.csv to your CSV file. Specify the video id column via the video_id argument.
+# Video ids must be ":" + the YouTube video id, e.g. ":wl4m1Rqmq-Y".
+
+crawler.crawl_videos_in_list(video_list_workfile="videos_to_collect.csv",
+ comment_page_count=1,
+ search_key="blind",
+ video_id="videoId"
+ )
+crawler.merge_all(save_to='all_raw_data.json')
+decoder = cu.JSONDecoder()
+decoder.json_to_csv(data_file='all_raw_data.json')
+
+# Crawl subtitles from CSV
+# If you don't need subtitles, delete the following lines
+subtitleCrawler = cu.SubtitleCrawler()
+subtitleCrawler.build(work_dir)
+subtitleCrawler.crawl_csv(
+ videos_to_collect="videos_to_collect.csv",
+ video_id="videoId",
+    sub_title_dir="YouTube_CSV/subtitles/"
+)
+
+```
+
+Case 3: Search a list of channels by search keys, then crawl all videos belonging to those channels.
+```
+import clarku_youtube_crawler as cu
+
+chCrawler = cu.ChannelCrawler()
+work_dir = "low visibility"
+chCrawler.build(work_dir)
+# You can search for channels with different keys; all results will be merged.
+chCrawler.search_channel("low visibility")
+chCrawler.search_channel("blind")
+chCrawler.merge_to_workfile()
+chCrawler.crawl()
+
+# Crawl videos posted by the selected channels. The channels_to_collect.csv file records which search keys found each channel.
+crawler = cu.RawCrawler()
+crawler.build(work_dir)
+crawler.merge_to_workfile(file_dir=work_dir + "/video_search_list/")
+crawler.crawl_videos_in_list(comment_page_count=1)
+crawler.merge_all()
+
+# Convert JSON to CSV
+decoder = cu.JSONDecoder()
+decoder.json_to_csv(data_file=work_dir + '/all_videos_visibility.json')
+
+# Crawl subtitles from CSV
+# If you don't need subtitles, delete the following lines
+subtitleCrawler = cu.SubtitleCrawler()
+subtitleCrawler.build(work_dir)
+subtitleCrawler.crawl_csv(
+ videos_to_collect=work_dir+"/videos_to_collect.csv",
+ video_id="videoId",
+ sub_title_dir=work_dir+"/subtitles/"
+)
+```
+
+Case 4: you already have a list of channels and want to crawl all videos of those channels:
+```
+import clarku_youtube_crawler as cu
+
+work_dir = 'disability'
+chCrawler = cu.ChannelCrawler()
+chCrawler.build(work_dir)
+
+chCrawler.crawl(filename='mturk_test.csv', channel_header="Input.channelId")
+
+# Crawl videos posted by selected channels
+crawler = cu.RawCrawler()
+crawler.build(work_dir)
+crawler.merge_to_workfile(file_dir=work_dir + "/video_search_list/")
+crawler.crawl_videos_in_list(comment_page_count=10) # 100 comments per page; 10 pages will crawl 1,000 comments
+
+crawler.merge_all()
+#
+# Convert JSON to CSV
+decoder = cu.JSONDecoder()
+decoder.json_to_csv(data_file=work_dir + '/all_videos.json')
+
+# Crawl subtitles from CSV
+subtitleCrawler = cu.SubtitleCrawler()
+subtitleCrawler.build(work_dir)
+subtitleCrawler.crawl_csv(
+ videos_to_collect=work_dir + "/videos_to_collect.csv",
+ video_id="videoId",
+ sub_title_dir=work_dir + "/subtitles/"
+)
+```
+
+
+
+
+
+
+%prep
+%autosetup -n clarku-youtube-crawler-2.1.3
+
+%build
+%py3_build
+
+%install
+%py3_install
+install -d -m755 %{buildroot}/%{_pkgdocdir}
+if [ -d doc ]; then cp -arf doc %{buildroot}/%{_pkgdocdir}; fi
+if [ -d docs ]; then cp -arf docs %{buildroot}/%{_pkgdocdir}; fi
+if [ -d example ]; then cp -arf example %{buildroot}/%{_pkgdocdir}; fi
+if [ -d examples ]; then cp -arf examples %{buildroot}/%{_pkgdocdir}; fi
+pushd %{buildroot}
+if [ -d usr/lib ]; then
+ find usr/lib -type f -printf "/%h/%f\n" >> filelist.lst
+fi
+if [ -d usr/lib64 ]; then
+ find usr/lib64 -type f -printf "/%h/%f\n" >> filelist.lst
+fi
+if [ -d usr/bin ]; then
+ find usr/bin -type f -printf "/%h/%f\n" >> filelist.lst
+fi
+if [ -d usr/sbin ]; then
+ find usr/sbin -type f -printf "/%h/%f\n" >> filelist.lst
+fi
+touch doclist.lst
+if [ -d usr/share/man ]; then
+ find usr/share/man -type f -printf "/%h/%f.gz\n" >> doclist.lst
+fi
+popd
+mv %{buildroot}/filelist.lst .
+mv %{buildroot}/doclist.lst .
+
+%files -n python3-clarku-youtube-crawler -f filelist.lst
+%dir %{python3_sitelib}/*
+
+%files help -f doclist.lst
+%{_docdir}/*
+
+%changelog
+* Wed May 10 2023 Python_Bot <Python_Bot@openeuler.org> - 2.1.3-1
+- Package Spec generated
diff --git a/sources b/sources
new file mode 100644
index 0000000..655d1c0
--- /dev/null
+++ b/sources
@@ -0,0 +1 @@
+c7393dd6945a830534fb5c72675c2af5 clarku_youtube_crawler-2.1.3.tar.gz