author     | CoprDistGit <infra@openeuler.org>        | 2023-05-10 04:10:29 +0000
committer  | CoprDistGit <infra@openeuler.org>        | 2023-05-10 04:10:29 +0000
commit     | 51608f9b8a241734c9b2a070b0f1c067ea194bc6 (patch)
tree       | d0b90fafd67577b11879c2b29204263720208175
parent     | 71a93c4ada6b0ab27561fcff4440e734be1cd08f (diff)
automatic import of python-clarku-youtube-crawler (openeuler20.03)
-rw-r--r-- | .gitignore                         |   1
-rw-r--r-- | python-clarku-youtube-crawler.spec | 575
-rw-r--r-- | sources                            |   1

3 files changed, 577 insertions, 0 deletions
@@ -0,0 +1 @@
+/clarku_youtube_crawler-2.1.3.tar.gz
diff --git a/python-clarku-youtube-crawler.spec b/python-clarku-youtube-crawler.spec
new file
index 0000000..d9ef6c8
--- /dev/null
+++ b/python-clarku-youtube-crawler.spec
@@ -0,0 +1,575 @@
+%global _empty_manifest_terminate_build 0
+Name:           python-clarku-youtube-crawler
+Version:        2.1.3
+Release:        1
+Summary:        Clark University, Package for YouTube crawler and cleaning data
+License:        MIT License
+URL:            https://github.com/ClarkUniversity-NiuLab/clarku-youtube-crawler
+Source0:        https://mirrors.nju.edu.cn/pypi/web/packages/b9/62/9017def4482727c467e8502ac3c0b70d3dded5bd32e1d619a63366df0d8c/clarku_youtube_crawler-2.1.3.tar.gz
+BuildArch:      noarch
+
+Requires:       python3-configparser
+Requires:       python3-datetime
+Requires:       python3-pytz
+Requires:       python3-pandas
+Requires:       python3-isodate
+Requires:       python3-xlrd
+Requires:       python3-youtube-transcript-api
+Requires:       python3-google-api-python-client
+
+%description
+# clarku-youtube-crawler
+
+Clark University YouTube crawler and JSON decoder for YouTube JSON. Please read the documentation in ``DOCS``.
+
+PyPI page: https://pypi.org/project/clarku-youtube-crawler/
+
+## Installing
+To install:
+
+``pip install clarku-youtube-crawler``
+
+The crawler needs multiple other packages to function.
+If any requirements are missing (all dependencies are already included, so this should not happen), download <a href="https://github.com/ClarkUniversity-NiuLab/clarku-youtube-crawler/blob/master/requirements.txt">``requirements.txt``</a>.
+Navigate to the folder that contains requirements.txt and run
+
+``pip install -r requirements.txt``
+
+
+## Upgrading
+To upgrade:
+
+``pip install clarku-youtube-crawler --upgrade``
+
+Go to the project folder and delete config.ini if it is already there.
+
+## YouTube API Key
+- Go to https://cloud.google.com/, open the console, and create a project. Under Credentials, copy the API key.
+- In your project folder, create a "DEVELOPER_KEY.txt" file (it must have this exact file name) and paste your API key into it.
+- You can use multiple API keys by putting them on separate lines in DEVELOPER_KEY.txt.
+- The crawler will use up the quota of one key and then try the next one, until all quotas are exhausted.
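Before a long crawl it can be worth checking that every key listed in DEVELOPER_KEY.txt is actually usable. The following is only a minimal sketch of that idea, assuming the one-key-per-line format described above and using google-api-python-client (already one of the declared dependencies); the probe request is illustrative and is not part of the crawler itself:

```
# Illustrative only: probe each key in DEVELOPER_KEY.txt (one key per line).
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

with open("DEVELOPER_KEY.txt") as f:
    keys = [line.strip() for line in f if line.strip()]

for key in keys:
    try:
        youtube = build("youtube", "v3", developerKey=key)
        # Smallest useful request: ask for a single search result.
        youtube.search().list(part="id", q="test", maxResults=1).execute()
        print(f"key ending in {key[-4:]}: usable")
    except HttpError as err:
        # HTTP 403 usually means an invalid key or an exhausted quota.
        print(f"key ending in {key[-4:]}: rejected (HTTP {err.resp.status})")
```

Even this one-result request consumes a little API quota, so it is best run sparingly.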
E.g., ":wl4m1Rqmq-Y" + +crawler.crawl_videos_in_list(video_list_workfile="videos_to_collect.csv", + comment_page_count=1, + search_key="blind", + video_id="videoId" + ) +crawler.merge_all(save_to='all_raw_data.json') +decoder = cu.JSONDecoder() +decoder.json_to_csv(data_file='all_raw_data.json') + +# Crawl subtitles from CSV +# If you don't need subtitles, delete the following lines +subtitleCrawler = cu.SubtitleCrawler() +subtitleCrawler.build(work_dir) +subtitleCrawler.crawl_csv( + videos_to_collect="videos_to_collect.csv", + video_id="videoId", + sub_title_dir=f"YouTube_CSV/subtitles/" +) + +``` + +Case 3: Search a list of channels by search keys, then crawl all videos belonging to those channels. +``` +import clarku_youtube_crawler as cu + +chCrawler = cu.ChannelCrawler() +work_dir = "low visibility" +chCrawler.build(work_dir) +# You can search different channels. All results will be merged +chCrawler.search_channel("low visibility") +chCrawler.search_channel("blind") +chCrawler.merge_to_workfile() +chCrawler.crawl() + +# Crawl videos posted by selected channels. channels_to_collect.csv file has which search keys find each channel +crawler = cu.RawCrawler() +crawler.build(work_dir) +crawler.merge_to_workfile(file_dir=work_dir + "/video_search_list/") +crawler.crawl_videos_in_list(comment_page_count=1) +crawler.merge_all() + +# Convert JSON to CSV +decoder = cu.JSONDecoder() +decoder.json_to_csv(data_file=work_dir + '/all_videos_visibility.json') + +# Crawl subtitles from CSV +# If you don't need subtitles, delete the following lines +subtitleCrawler = cu.SubtitleCrawler() +subtitleCrawler.build(work_dir) +subtitleCrawler.crawl_csv( + videos_to_collect=work_dir+"/videos_to_collect.csv", + video_id="videoId", + sub_title_dir=work_dir+"/subtitles/" +) +``` + +Case 4: You already have a list of channels. You want to crawl all videos of the channels in the list: +``` +import clarku_youtube_crawler as cu + +work_dir = 'disability' +chCrawler = cu.ChannelCrawler() +chCrawler.build(work_dir) + +chCrawler.crawl(filename='mturk_test.csv', channel_header="Input.channelId") + +# Crawl videos posted by selected channels +crawler = cu.RawCrawler() +crawler.build(work_dir) +crawler.merge_to_workfile(file_dir=work_dir + "/video_search_list/") +crawler.crawl_videos_in_list(comment_page_count=10) # 100 comments per page, 10 page will crawl 1000 comments + +crawler.merge_all() +# +# Convert JSON to CSV +decoder = cu.JSONDecoder() +decoder.json_to_csv(data_file=work_dir + '/all_videos.json') + +# Crawl subtitles from CSV +subtitleCrawler = cu.SubtitleCrawler() +subtitleCrawler.build(work_dir) +subtitleCrawler.crawl_csv( + videos_to_collect=work_dir + "/videos_to_collect.csv", + video_id="videoId", + sub_title_dir=work_dir + "/subtitles/" +) +``` + + + + + + +%package -n python3-clarku-youtube-crawler +Summary: Clark University, Package for YouTube crawler and cleaning data +Provides: python-clarku-youtube-crawler +BuildRequires: python3-devel +BuildRequires: python3-setuptools +BuildRequires: python3-pip +%description -n python3-clarku-youtube-crawler +# clarku-youtube-crawler + +Clark University YouTube crawler and JSON decoder for YouTube json. Please read documentation in ``DOCS`` + +Pypi page: "https://pypi.org/project/clarku-youtube-crawler/" + +## Installing +To install, + +``pip install clarku-youtube-crawler`` + +The crawler needs multiple other packages to function. 
+
+Case 3: search a list of channels by search keys, then crawl all videos belonging to those channels:
+```
+import clarku_youtube_crawler as cu
+
+chCrawler = cu.ChannelCrawler()
+work_dir = "low visibility"
+chCrawler.build(work_dir)
+# You can search for different channels; all results will be merged
+chCrawler.search_channel("low visibility")
+chCrawler.search_channel("blind")
+chCrawler.merge_to_workfile()
+chCrawler.crawl()
+
+# Crawl videos posted by the selected channels.
+# channels_to_collect.csv records which search keys found each channel.
+crawler = cu.RawCrawler()
+crawler.build(work_dir)
+crawler.merge_to_workfile(file_dir=work_dir + "/video_search_list/")
+crawler.crawl_videos_in_list(comment_page_count=1)
+crawler.merge_all()
+
+# Convert JSON to CSV
+decoder = cu.JSONDecoder()
+decoder.json_to_csv(data_file=work_dir + '/all_videos_visibility.json')
+
+# Crawl subtitles from CSV
+# If you don't need subtitles, delete the following lines
+subtitleCrawler = cu.SubtitleCrawler()
+subtitleCrawler.build(work_dir)
+subtitleCrawler.crawl_csv(
+    videos_to_collect=work_dir + "/videos_to_collect.csv",
+    video_id="videoId",
+    sub_title_dir=work_dir + "/subtitles/"
+)
+```
+
+Case 4: you already have a list of channels and want to crawl all videos of the channels in that list:
+```
+import clarku_youtube_crawler as cu
+
+work_dir = 'disability'
+chCrawler = cu.ChannelCrawler()
+chCrawler.build(work_dir)
+
+chCrawler.crawl(filename='mturk_test.csv', channel_header="Input.channelId")
+
+# Crawl videos posted by the selected channels
+crawler = cu.RawCrawler()
+crawler.build(work_dir)
+crawler.merge_to_workfile(file_dir=work_dir + "/video_search_list/")
+crawler.crawl_videos_in_list(comment_page_count=10)  # 100 comments per page; 10 pages collect 1,000 comments
+crawler.merge_all()
+
+# Convert JSON to CSV
+decoder = cu.JSONDecoder()
+decoder.json_to_csv(data_file=work_dir + '/all_videos.json')
+
+# Crawl subtitles from CSV
+subtitleCrawler = cu.SubtitleCrawler()
+subtitleCrawler.build(work_dir)
+subtitleCrawler.crawl_csv(
+    videos_to_collect=work_dir + "/videos_to_collect.csv",
+    video_id="videoId",
+    sub_title_dir=work_dir + "/subtitles/"
+)
+```
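Similarly, for this case the channel list is read from the column passed as channel_header. A minimal, purely illustrative mturk_test.csv might look like the following (the channel ids are placeholders, and a real MTurk export would carry many more columns):

```
Input.channelId
UCxxxxxxxxxxxxxxxxxxxxxx
UCyyyyyyyyyyyyyyyyyyyyyy
```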
+
+
+
+
+
+%package -n python3-clarku-youtube-crawler
+Summary:        Clark University, Package for YouTube crawler and cleaning data
+Provides:       python-clarku-youtube-crawler
+BuildRequires:  python3-devel
+BuildRequires:  python3-setuptools
+BuildRequires:  python3-pip
+%description -n python3-clarku-youtube-crawler
+# clarku-youtube-crawler
+
+Clark University YouTube crawler and JSON decoder for YouTube JSON. Please read the documentation in ``DOCS``.
+
+PyPI page: https://pypi.org/project/clarku-youtube-crawler/
+
+## Installing
+To install:
+
+``pip install clarku-youtube-crawler``
+
+The crawler needs multiple other packages to function.
+If any requirements are missing (all dependencies are already included, so this should not happen), download <a href="https://github.com/ClarkUniversity-NiuLab/clarku-youtube-crawler/blob/master/requirements.txt">``requirements.txt``</a>.
+Navigate to the folder that contains requirements.txt and run
+
+``pip install -r requirements.txt``
+
+
+## Upgrading
+To upgrade:
+
+``pip install clarku-youtube-crawler --upgrade``
+
+Go to the project folder and delete config.ini if it is already there.
+
+## YouTube API Key
+- Go to https://cloud.google.com/, open the console, and create a project. Under Credentials, copy the API key.
+- In your project folder, create a "DEVELOPER_KEY.txt" file (it must have this exact file name) and paste your API key into it.
+- You can use multiple API keys by putting them on separate lines in DEVELOPER_KEY.txt.
+- The crawler will use up the quota of one key and then try the next one, until all quotas are exhausted.
+
+
+
+## Example usage
+Case 1: crawl videos by keywords:
+```
+import clarku_youtube_crawler as cu
+
+# Crawl all JSONs
+crawler = cu.RawCrawler()
+crawler.build("low visibility")
+crawler.crawl("low visibility", start_date=14, start_month=12, start_year=2020, day_count=5)
+crawler.crawl("blind", start_date=14, start_month=12, start_year=2020, day_count=5)
+crawler.merge_to_workfile()
+crawler.crawl_videos_in_list(comment_page_count=1)
+crawler.merge_all(save_to='low visibility/all_videos.json')
+
+# Convert JSON to CSV
+decoder = cu.JSONDecoder()
+decoder.json_to_csv(data_file='low visibility/all_videos.json')
+
+# Crawl subtitles from CSV
+# If you don't need subtitles, delete the following lines
+subtitleCrawler = cu.SubtitleCrawler()
+subtitleCrawler.build("low visibility")
+subtitleCrawler.crawl_csv(
+    videos_to_collect="low visibility/videos_to_collect.csv",
+    video_id="videoId",
+    sub_title_dir="low visibility/subtitles/"
+)
+```
+
+Case 2: crawl videos from a list of ids given in the videoId column of an input CSV:
+```
+import clarku_youtube_crawler as cu
+
+crawler = cu.RawCrawler()
+work_dir = "blind"
+crawler.build(work_dir)
+
+# Change videos_to_collect.csv to your CSV file; name the video-id column with video_id.
+# Video ids must be ":" + the YouTube video id, e.g., ":wl4m1Rqmq-Y".
+crawler.crawl_videos_in_list(video_list_workfile="videos_to_collect.csv",
+                             comment_page_count=1,
+                             search_key="blind",
+                             video_id="videoId"
+                             )
+crawler.merge_all(save_to='all_raw_data.json')
+decoder = cu.JSONDecoder()
+decoder.json_to_csv(data_file='all_raw_data.json')
+
+# Crawl subtitles from CSV
+# If you don't need subtitles, delete the following lines
+subtitleCrawler = cu.SubtitleCrawler()
+subtitleCrawler.build(work_dir)
+subtitleCrawler.crawl_csv(
+    videos_to_collect="videos_to_collect.csv",
+    video_id="videoId",
+    sub_title_dir="YouTube_CSV/subtitles/"
+)
+```
+
+Case 3: search a list of channels by search keys, then crawl all videos belonging to those channels:
+```
+import clarku_youtube_crawler as cu
+
+chCrawler = cu.ChannelCrawler()
+work_dir = "low visibility"
+chCrawler.build(work_dir)
+# You can search for different channels; all results will be merged
+chCrawler.search_channel("low visibility")
+chCrawler.search_channel("blind")
+chCrawler.merge_to_workfile()
+chCrawler.crawl()
+
+# Crawl videos posted by the selected channels.
+# channels_to_collect.csv records which search keys found each channel.
+crawler = cu.RawCrawler()
+crawler.build(work_dir)
+crawler.merge_to_workfile(file_dir=work_dir + "/video_search_list/")
+crawler.crawl_videos_in_list(comment_page_count=1)
+crawler.merge_all()
+
+# Convert JSON to CSV
+decoder = cu.JSONDecoder()
+decoder.json_to_csv(data_file=work_dir + '/all_videos_visibility.json')
+
+# Crawl subtitles from CSV
+# If you don't need subtitles, delete the following lines
+subtitleCrawler = cu.SubtitleCrawler()
+subtitleCrawler.build(work_dir)
+subtitleCrawler.crawl_csv(
+    videos_to_collect=work_dir + "/videos_to_collect.csv",
+    video_id="videoId",
+    sub_title_dir=work_dir + "/subtitles/"
+)
+```
+
+Case 4: you already have a list of channels and want to crawl all videos of the channels in that list:
+```
+import clarku_youtube_crawler as cu
+
+work_dir = 'disability'
+chCrawler = cu.ChannelCrawler()
+chCrawler.build(work_dir)
+
+chCrawler.crawl(filename='mturk_test.csv', channel_header="Input.channelId")
+
+# Crawl videos posted by the selected channels
+crawler = cu.RawCrawler()
+crawler.build(work_dir)
+crawler.merge_to_workfile(file_dir=work_dir + "/video_search_list/")
+crawler.crawl_videos_in_list(comment_page_count=10)  # 100 comments per page; 10 pages collect 1,000 comments
+crawler.merge_all()
+
+# Convert JSON to CSV
+decoder = cu.JSONDecoder()
+decoder.json_to_csv(data_file=work_dir + '/all_videos.json')
+
+# Crawl subtitles from CSV
+subtitleCrawler = cu.SubtitleCrawler()
+subtitleCrawler.build(work_dir)
+subtitleCrawler.crawl_csv(
+    videos_to_collect=work_dir + "/videos_to_collect.csv",
+    video_id="videoId",
+    sub_title_dir=work_dir + "/subtitles/"
+)
+```
+
+
+
+
+
+%package help
+Summary:        Development documents and examples for clarku-youtube-crawler
+Provides:       python3-clarku-youtube-crawler-doc
+%description help
+# clarku-youtube-crawler
+
+Clark University YouTube crawler and JSON decoder for YouTube JSON. Please read the documentation in ``DOCS``.
+
+PyPI page: https://pypi.org/project/clarku-youtube-crawler/
+
+## Installing
+To install:
+
+``pip install clarku-youtube-crawler``
+
+The crawler needs multiple other packages to function.
+If any requirements are missing (all dependencies are already included, so this should not happen), download <a href="https://github.com/ClarkUniversity-NiuLab/clarku-youtube-crawler/blob/master/requirements.txt">``requirements.txt``</a>.
+Navigate to the folder that contains requirements.txt and run
+
+``pip install -r requirements.txt``
+
+
+## Upgrading
+To upgrade:
+
+``pip install clarku-youtube-crawler --upgrade``
+
+Go to the project folder and delete config.ini if it is already there.
+
+## YouTube API Key
+- Go to https://cloud.google.com/, open the console, and create a project. Under Credentials, copy the API key.
+- In your project folder, create a "DEVELOPER_KEY.txt" file (it must have this exact file name) and paste your API key into it.
+- You can use multiple API keys by putting them on separate lines in DEVELOPER_KEY.txt.
+- The crawler will use up the quota of one key and then try the next one, until all quotas are exhausted.
+
+
+
+## Example usage
+Case 1: crawl videos by keywords:
+```
+import clarku_youtube_crawler as cu
+
+# Crawl all JSONs
+crawler = cu.RawCrawler()
+crawler.build("low visibility")
+crawler.crawl("low visibility", start_date=14, start_month=12, start_year=2020, day_count=5)
+crawler.crawl("blind", start_date=14, start_month=12, start_year=2020, day_count=5)
+crawler.merge_to_workfile()
+crawler.crawl_videos_in_list(comment_page_count=1)
+crawler.merge_all(save_to='low visibility/all_videos.json')
+
+# Convert JSON to CSV
+decoder = cu.JSONDecoder()
+decoder.json_to_csv(data_file='low visibility/all_videos.json')
+
+# Crawl subtitles from CSV
+# If you don't need subtitles, delete the following lines
+subtitleCrawler = cu.SubtitleCrawler()
+subtitleCrawler.build("low visibility")
+subtitleCrawler.crawl_csv(
+    videos_to_collect="low visibility/videos_to_collect.csv",
+    video_id="videoId",
+    sub_title_dir="low visibility/subtitles/"
+)
+```
+
+Case 2: crawl videos from a list of ids given in the videoId column of an input CSV:
+```
+import clarku_youtube_crawler as cu
+
+crawler = cu.RawCrawler()
+work_dir = "blind"
+crawler.build(work_dir)
+
+# Change videos_to_collect.csv to your CSV file; name the video-id column with video_id.
+# Video ids must be ":" + the YouTube video id, e.g., ":wl4m1Rqmq-Y".
+crawler.crawl_videos_in_list(video_list_workfile="videos_to_collect.csv",
+                             comment_page_count=1,
+                             search_key="blind",
+                             video_id="videoId"
+                             )
+crawler.merge_all(save_to='all_raw_data.json')
+decoder = cu.JSONDecoder()
+decoder.json_to_csv(data_file='all_raw_data.json')
+
+# Crawl subtitles from CSV
+# If you don't need subtitles, delete the following lines
+subtitleCrawler = cu.SubtitleCrawler()
+subtitleCrawler.build(work_dir)
+subtitleCrawler.crawl_csv(
+    videos_to_collect="videos_to_collect.csv",
+    video_id="videoId",
+    sub_title_dir="YouTube_CSV/subtitles/"
+)
+```
+
+Case 3: search a list of channels by search keys, then crawl all videos belonging to those channels:
+```
+import clarku_youtube_crawler as cu
+
+chCrawler = cu.ChannelCrawler()
+work_dir = "low visibility"
+chCrawler.build(work_dir)
+# You can search for different channels; all results will be merged
+chCrawler.search_channel("low visibility")
+chCrawler.search_channel("blind")
+chCrawler.merge_to_workfile()
+chCrawler.crawl()
+
+# Crawl videos posted by the selected channels.
+# channels_to_collect.csv records which search keys found each channel.
+crawler = cu.RawCrawler()
+crawler.build(work_dir)
+crawler.merge_to_workfile(file_dir=work_dir + "/video_search_list/")
+crawler.crawl_videos_in_list(comment_page_count=1)
+crawler.merge_all()
+
+# Convert JSON to CSV
+decoder = cu.JSONDecoder()
+decoder.json_to_csv(data_file=work_dir + '/all_videos_visibility.json')
+
+# Crawl subtitles from CSV
+# If you don't need subtitles, delete the following lines
+subtitleCrawler = cu.SubtitleCrawler()
+subtitleCrawler.build(work_dir)
+subtitleCrawler.crawl_csv(
+    videos_to_collect=work_dir + "/videos_to_collect.csv",
+    video_id="videoId",
+    sub_title_dir=work_dir + "/subtitles/"
+)
+```
+
+Case 4: you already have a list of channels and want to crawl all videos of the channels in that list:
+```
+import clarku_youtube_crawler as cu
+
+work_dir = 'disability'
+chCrawler = cu.ChannelCrawler()
+chCrawler.build(work_dir)
+
+chCrawler.crawl(filename='mturk_test.csv', channel_header="Input.channelId")
+
+# Crawl videos posted by the selected channels
+crawler = cu.RawCrawler()
+crawler.build(work_dir)
+crawler.merge_to_workfile(file_dir=work_dir + "/video_search_list/")
+crawler.crawl_videos_in_list(comment_page_count=10)  # 100 comments per page; 10 pages collect 1,000 comments
+crawler.merge_all()
+
+# Convert JSON to CSV
+decoder = cu.JSONDecoder()
+decoder.json_to_csv(data_file=work_dir + '/all_videos.json')
+
+# Crawl subtitles from CSV
+subtitleCrawler = cu.SubtitleCrawler()
+subtitleCrawler.build(work_dir)
+subtitleCrawler.crawl_csv(
+    videos_to_collect=work_dir + "/videos_to_collect.csv",
+    video_id="videoId",
+    sub_title_dir=work_dir + "/subtitles/"
+)
+```
+
+
+
+
+%prep
+%autosetup -n clarku-youtube-crawler-2.1.3
+
+%build
+%py3_build
+
+%install
+%py3_install
+install -d -m755 %{buildroot}/%{_pkgdocdir}
+if [ -d doc ]; then cp -arf doc %{buildroot}/%{_pkgdocdir}; fi
+if [ -d docs ]; then cp -arf docs %{buildroot}/%{_pkgdocdir}; fi
+if [ -d example ]; then cp -arf example %{buildroot}/%{_pkgdocdir}; fi
+if [ -d examples ]; then cp -arf examples %{buildroot}/%{_pkgdocdir}; fi
+pushd %{buildroot}
+if [ -d usr/lib ]; then
+    find usr/lib -type f -printf "/%h/%f\n" >> filelist.lst
+fi
+if [ -d usr/lib64 ]; then
+    find usr/lib64 -type f -printf "/%h/%f\n" >> filelist.lst
+fi
+if [ -d usr/bin ]; then
+    find usr/bin -type f -printf "/%h/%f\n" >> filelist.lst
+fi
+if [ -d usr/sbin ]; then
+    find usr/sbin -type f -printf "/%h/%f\n" >> filelist.lst
+fi
+touch doclist.lst
+if [ -d usr/share/man ]; then
+    find usr/share/man -type f -printf "/%h/%f.gz\n" >> doclist.lst
+fi
+popd
+mv %{buildroot}/filelist.lst .
+mv %{buildroot}/doclist.lst .
+
+%files -n python3-clarku-youtube-crawler -f filelist.lst
+%dir %{python3_sitelib}/*
+
+%files help -f doclist.lst
+%{_docdir}/*
+
+%changelog
+* Wed May 10 2023 Python_Bot <Python_Bot@openeuler.org> - 2.1.3-1
+- Package Spec generated
@@ -0,0 +1 @@
+c7393dd6945a830534fb5c72675c2af5 clarku_youtube_crawler-2.1.3.tar.gz