mirror of
https://github.com/bellingcat/tiktok-hashtag-analysis.git
synced 2026-06-08 03:18:31 +03:00
Update run_downloader.py
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
import os, sys
|
||||
import time
|
||||
import json
|
||||
import argparse, importlib
|
||||
import argparse
|
||||
|
||||
import global_data
|
||||
import file_methods
|
||||
@@ -13,11 +13,11 @@ The run_downloader.py dowloads data using the tiktok-scraper (https://github.com
|
||||
1. "-p" option is used by the user to download posts only
|
||||
2. "-v" option is use to download videos only
|
||||
3. "-p -v" is used to download posts and videos
|
||||
4. "--h" is used to specify a list of hashtags as arguments
|
||||
4. "-t" is used to specify a list of hashtags as arguments
|
||||
5. "-f" option is used to read the list of hashtags from the user specified file
|
||||
|
||||
Example:
|
||||
1. The command "python3 run_downloader.py --h london paris newyork -p" will download posts for hashtags london, paris and newyork.
|
||||
Example:
|
||||
1. The command "python3 run_downloader.py --h london paris newyork -p" will download posts for hashtags london, paris and newyork.
|
||||
2. The command "python3 run_downloader.py -f hashtag_list -p -v" will download posts and videos for hashtags in the file hashtag_list.
|
||||
|
||||
|
||||
@@ -42,17 +42,15 @@ hashtag_list - this file contains the list of hashtags that the user wants to do
|
||||
"""
|
||||
|
||||
|
||||
command = "python3 post_downloader.py "
|
||||
|
||||
def get_hashtag_list(file_name):
|
||||
try:
|
||||
f = importlib.import_module(file_name) # exec(f"from {file_name} import hashtag_list")
|
||||
print(f.hashtag_list)
|
||||
return f.hashtag_list
|
||||
except ImportError as error:
|
||||
print("ImportError: " + str(error))
|
||||
print(f"Please provide at least one hashtag either by entering as an argument or by adding hashtags to the variable hashtag_list in the file {file_name}")
|
||||
sys.exit()
|
||||
with open(file_name) as f:
|
||||
gn = (line.strip() for line in f if not line.startswith("#"))
|
||||
tags = list(line for line in gn if line)
|
||||
return tags
|
||||
except IOError as error:
|
||||
print(error)
|
||||
|
||||
|
||||
def create_parser():
|
||||
@@ -60,8 +58,7 @@ def create_parser():
|
||||
parser = argparse.ArgumentParser(description="Download the tiktoks for the requested hashtags")
|
||||
|
||||
# Adding the arguments
|
||||
#parser.add_argument("--h", type=str, nargs="*", required=True, help="List of hashtags")
|
||||
parser.add_argument("--h", type=str, nargs="*", help="List of hashtags")
|
||||
parser.add_argument("-t", type=str, nargs="*", help="List of hashtags")
|
||||
parser.add_argument("-f", type=str, help="File name with the list of hashtags")
|
||||
parser.add_argument("-p", action="store_true", help="Download posts")
|
||||
parser.add_argument("-v", action="store_true", help="Download videos")
|
||||
@@ -71,7 +68,7 @@ def create_parser():
|
||||
|
||||
def set_download_settings(download_data_type):
|
||||
"""
|
||||
Loads the constants from global_data into the dict called settings and returns it.
|
||||
Loads the constants from global_data into the dict called settings and returns it.
|
||||
Purpose - easy access to global constants by various functions.
|
||||
"""
|
||||
settings = {}
|
||||
@@ -83,25 +80,16 @@ def set_download_settings(download_data_type):
|
||||
settings["scraper"] = global_data.PARAMETERS["scraper_attempts"]
|
||||
file_methods.check_file(f"{settings['data']}/{settings['ids']}", "dir")
|
||||
file_methods.check_file(f"{settings['data']}/{settings['log']}", "dir")
|
||||
if download_data_type == "posts":
|
||||
if download_data_type["posts"]:
|
||||
settings["posts"] = global_data.FILES["posts"]
|
||||
settings["post_ids"] = global_data.FILES["post_ids"]
|
||||
settings["data_file"] = global_data.FILES["data_file"]
|
||||
return settings
|
||||
elif download_data_type == "videos":
|
||||
|
||||
if download_data_type == "videos":
|
||||
settings["videos"] = global_data.FILES["videos"]
|
||||
settings["video_ids"] = global_data.FILES["video_ids"]
|
||||
return settings
|
||||
elif download_data_type == "posts-videos":
|
||||
settings["posts"] = global_data.FILES["posts"]
|
||||
settings["post_ids"] = global_data.FILES["post_ids"]
|
||||
settings["data_file"] = global_data.FILES["data_file"]
|
||||
settings["videos"] = global_data.FILES["videos"]
|
||||
settings["video_ids"] = global_data.FILES["video_ids"]
|
||||
return settings
|
||||
else:
|
||||
print(f"ERROR: The download_data_type must be either posts, videos or posts-videos.")
|
||||
sys.exit()
|
||||
|
||||
return settings
|
||||
|
||||
|
||||
|
||||
@@ -120,17 +108,17 @@ def get_posts(settings, tag):
|
||||
data_methods.update_posts(data_file, "file", new_data[1])
|
||||
log = data_methods.update_posts(settings["post_ids"], "file", new_data[0], tag)
|
||||
file_methods.delete_file(file_path, "file")
|
||||
|
||||
|
||||
return log
|
||||
|
||||
|
||||
|
||||
def get_videos(settings, tag):
|
||||
def get_videos(settings, tag):
|
||||
"""
|
||||
1. calls download_videos in file_methods.py to get the videos for a given hashtag
|
||||
2. calls extract_videos from data_methods.py to extract new videos if any
|
||||
3. calls update_videos from data_methods.py to update the id-list with the ids of newly downloaded videos.
|
||||
4. the clean_video_files function deletes the residual video folder after the data processing
|
||||
4. the clean_video_files function deletes the residual video folder after the data processing
|
||||
"""
|
||||
log = ()
|
||||
download_list = file_methods.download_videos(settings, tag)
|
||||
@@ -148,14 +136,14 @@ def get_videos(settings, tag):
|
||||
def get_data(hashtags, download_data_type):
|
||||
"""
|
||||
The function checks for the user option "-p", "-v" or both and then
|
||||
triggers the functions get_posts, get_videos or both, respectively.
|
||||
triggers the functions get_posts, get_videos or both, respectively.
|
||||
"""
|
||||
counter = 0
|
||||
total_hashtags = len(hashtags)
|
||||
total_hashtags_offset = total_hashtags - 1
|
||||
log_data = []
|
||||
|
||||
if download_data_type == "posts":
|
||||
|
||||
if download_data_type["posts"]:
|
||||
settings = set_download_settings(download_data_type)
|
||||
while counter < total_hashtags:
|
||||
tag = hashtags[counter]
|
||||
@@ -165,12 +153,13 @@ def get_data(hashtags, download_data_type):
|
||||
if res:
|
||||
log = ( res[0], ( "posts", res[1] ) )
|
||||
log_data.append(log)
|
||||
data_methods.print_total(settings["post_ids"], tag, download_data_type)
|
||||
|
||||
data_methods.print_total(settings["post_ids"], tag, "posts")
|
||||
|
||||
counter += 1
|
||||
if counter < total_hashtags_offset:
|
||||
time.sleep(settings["sleep"])
|
||||
elif download_data_type == "videos":
|
||||
|
||||
if download_data_type == "videos":
|
||||
settings = set_download_settings(download_data_type)
|
||||
while counter < total_hashtags:
|
||||
tag = hashtags[counter]
|
||||
@@ -181,43 +170,16 @@ def get_data(hashtags, download_data_type):
|
||||
if res:
|
||||
res = ( res[0], ( "videos", res[1]))
|
||||
log_data.append(res)
|
||||
data_methods.print_total(settings["video_ids"], tag, download_data_type)
|
||||
|
||||
data_methods.print_total(settings["video_ids"], tag, "videos")
|
||||
|
||||
counter += 1
|
||||
if counter < total_hashtags_offset:
|
||||
time.sleep(settings["sleep"])
|
||||
elif download_data_type == "posts-videos":
|
||||
settings = set_download_settings(download_data_type)
|
||||
while counter < total_hashtags:
|
||||
tag = hashtags[counter]
|
||||
file_methods.check_file(os.path.join(settings["data"], tag, settings["posts"]), "dir")
|
||||
file_methods.check_file(os.path.join(settings["data"], tag, settings["posts"], settings["data_file"]), "file")
|
||||
file_methods.check_file(os.path.join(settings["data"], tag, settings["videos"]), "dir")
|
||||
settings["videos_delete"] = settings['data'] + f"/{tag}/videos/#{tag}"
|
||||
settings["videos_to"] = settings['data'] + f"/{tag}/videos"
|
||||
requests = [("posts", "post_ids", get_posts), ("videos", "video_ids", get_videos)]
|
||||
total_reqs_offset = len(requests) - 1
|
||||
req_counter = 0
|
||||
for req in requests:
|
||||
res = req[2](settings, tag)
|
||||
if res:
|
||||
res = ( res[0], (req[0], res[1]) )
|
||||
log_data.append(res)
|
||||
data_methods.print_total(settings[req[1]], tag, req[0])
|
||||
|
||||
if req_counter < total_reqs_offset:
|
||||
time.sleep(settings["sleep"])
|
||||
req_counter += 1
|
||||
|
||||
counter += 1
|
||||
if counter < total_hashtags_offset:
|
||||
time.sleep(settings["sleep"])
|
||||
else:
|
||||
print(f"ERROR: The download_data_type must be either posts, videos or posts-videos.")
|
||||
sys.exit()
|
||||
return log_data
|
||||
|
||||
|
||||
|
||||
def get_hashtags(file_name, hashtag_list):
|
||||
"""
|
||||
Loads and returns the list of hashtags from user specified file.
|
||||
@@ -225,42 +187,47 @@ def get_hashtags(file_name, hashtag_list):
|
||||
try:
|
||||
from hashtag_list import hashtag_list
|
||||
return hashtag_list
|
||||
except:
|
||||
print(f"ERROR: something went wrong while reading the file {file_name}!")
|
||||
raise
|
||||
except ImportError:
|
||||
raise ImportError(f"ERROR: something went wrong while reading the file {file_name}!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = create_parser()
|
||||
args = parser.parse_args()
|
||||
|
||||
if not (args.h or args.f):
|
||||
if not (args.t or args.f):
|
||||
parser.error("No hashtags were given, please use either --h option or -f to provide hashtags.")
|
||||
sys.exit()
|
||||
|
||||
|
||||
if not (args.p or args.v):
|
||||
parser.error("No argument given, please specify either -p for posts or -v videos or both.")
|
||||
sys.exit()
|
||||
|
||||
if args.h:
|
||||
hashtags = args.h
|
||||
if args.t:
|
||||
hashtags = args.t
|
||||
elif args.f:
|
||||
file_name = args.f
|
||||
hashtags = get_hashtag_list(file_name)
|
||||
|
||||
print(hashtags)
|
||||
if not hashtags:
|
||||
print("No hashtags were given, please use either --h option or -f to provide hashtags.")
|
||||
sys.exit(0)
|
||||
raise Exception("No hashtags were given, please use either --h option or -f to provide hashtags.")
|
||||
|
||||
if (args.p and args.v):
|
||||
download_data_type = "posts-videos"
|
||||
download_data_type = {
|
||||
"posts": True,
|
||||
"videos": True
|
||||
}
|
||||
elif args.p:
|
||||
download_data_type = "posts"
|
||||
download_data_type = {
|
||||
"posts": True,
|
||||
"videos": False
|
||||
}
|
||||
else:
|
||||
download_data_type = "videos"
|
||||
|
||||
try:
|
||||
download_data_type = {
|
||||
"posts": False,
|
||||
"videos": True
|
||||
}
|
||||
|
||||
try:
|
||||
log_data = get_data(hashtags, download_data_type)
|
||||
if log_data:
|
||||
file_methods.log_writer(log_data)
|
||||
|
||||
Reference in New Issue
Block a user