add comments

This commit is contained in:
X
2022-02-25 16:55:02 +01:00
parent d3edf604a9
commit b958ee52fe
5 changed files with 180 additions and 27 deletions

View File

@@ -4,6 +4,11 @@ import argparse
import matplotlib.pyplot as plt
from datetime import datetime
"""
Plots the frequency of hashtags appearing in the set of given posts.
"""
sys.path.insert(0, '../tiktok_downloader')
import file_methods, global_data
@@ -26,6 +31,12 @@ def get_hashtags(obj):
def get_occurrences(filename, n=1 , sort=True):
"""
Takes the json file containing posts and returns the triplet:
l : total posts in the file
k : list of top n hashtags
v_total : frequency of top n hashtags in l
"""
with open(filename) as f:
obj = json.load(f)
l = len(obj)
@@ -34,7 +45,7 @@ def get_occurrences(filename, n=1 , sort=True):
if not sort:
k = list(tags.keys())
v = list(tags.values())
return obj, k, v
return obj, k, v
else:
sorted_tags = {k: v for k,v in sorted(tags.items(), key=lambda item: item[1], reverse=True)}
k = list(sorted_tags.keys())
@@ -59,12 +70,15 @@ def plot(n, length, k, v, img_folder):
def print_occurrences(l, k, v):
"""
Prints the top n hashtags with their occurrences and the ratio of occurrences to total posts, all to the shell.
"""
row_number = 0
total_posts = l
print ("{:<8} {:<15} {:<15} {:<15}".format("Rank", 'Hashtag','Occurrences',f'Frequency (Occurrences/Total-Posts({l}))'))
#print(f'Hashtag Occurrences Frequency(Occurances/Total-Posts)')
for key,value in zip(k, v):
ratio = value/total_posts
ratio = value/total_posts
print ("{:<8} {:<15} {:<15} {:<15}".format(row_number, key, value, ratio))
#print(f'{row_number}\t{key}\t\t{value}\t\t{ratio:.3f}')
row_number += 1
@@ -72,6 +86,9 @@ def print_occurrences(l, k, v):
def save_plot(plt, img_folder):
"""
Saves the plot to a png file in the folder /data/imgs/
"""
try:
now = datetime.now()
current_time = now.strftime("%Y_%m_%d_%H_%M_%S")
@@ -83,6 +100,13 @@ def save_plot(plt, img_folder):
if __name__ == "__main__":
"""
Option "n" specifies how many hashtags the user wants to plot.
"-d" option prints the hashtag frequencies on the shell
"-p" option plots the hashtag frequencies and saves as a png file in the folder /data/imgs/
The function get_occurrences is called to compute and return the top n hashtags and their occurrences.
"""
img_folder = global_data.IMAGES
file_methods.check_file(img_folder, "dir")
parser = argparse.ArgumentParser()

View File

@@ -4,12 +4,20 @@ from datetime import datetime
import global_data
import file_methods
"""
The file contains several functions that perform data processing related tasks.
"""
Difference = namedtuple("Difference", "new_ids size")
Total = namedtuple("Total", "total unique")
def get_difference(tag, file, ids):
"""
Compares two sets of ids and returns the difference of the two sets.
Purpose - used to filter out the new ids by comparing the stored id list (ids/post_ids.json or ids/video_ids.json) with the list of newly downloaded ids.
"""
maiden_entry = False
current_id_data = file_methods.get_data(file)
if tag in current_id_data:
@@ -30,6 +38,9 @@ def get_difference(tag, file, ids):
def extract_posts(settings, file_name, tag):
"""
Takes the file downloaded by the tiktok-scraper that contains the posts, and returns the new posts after comparing it with the list of posts already downloaded (from the file ids/post_ids.json).
"""
ids = []
posts = []
new_posts = []
@@ -40,7 +51,7 @@ def extract_posts(settings, file_name, tag):
if not ids:
print(f"WARNING: no posts were found for {tag} in the file - {file_name}")
return
status = file_methods.check_existence(settings["post_ids"], "file")
if not status:
new_data = (ids, posts)
@@ -64,6 +75,9 @@ def extract_posts(settings, file_name, tag):
def extract_videos(settings, tag, download_list):
"""
Tiktok-scraper downloads the videos and puts them in a folder - the list of ids of the downloaded videos is fed to this function as download_list. The function returns the set of new videos after comparing it with the list of videos already downloaded (from the file ids/video_ids.json).
"""
status = file_methods.check_existence(settings["video_ids"], "file")
if not status:
new_data = download_list
@@ -82,6 +96,9 @@ def extract_videos(settings, tag, download_list):
def update_posts(file_path, file_type, new_data, tag=None):
"""
Updates the list of post ids (in the file ids/post_ids.json) with the ids of the new posts.
"""
try:
status = file_methods.check_existence(file_path, file_type)
if not tag:
@@ -94,6 +111,9 @@ def update_posts(file_path, file_type, new_data, tag=None):
def update_videos(settings, new_data, tag):
"""
Updates the list of video ids (in the file ids/video_ids.json) with the ids of the new videos.
"""
file_path = settings["video_ids"]
file_methods.check_file(file_path, "file")
log = file_methods.id_writer(file_path, new_data, tag, True)
@@ -102,6 +122,9 @@ def update_videos(settings, new_data, tag):
def get_total_posts(file_path, tag):
"""
Returns the total count of ids in an id list along with the number of unique ids among them.
"""
status = file_methods.check_existence(file_path, "file")
if not status:
raise OSError("{file_path} not found!")
@@ -114,6 +137,9 @@ def get_total_posts(file_path, tag):
def print_total(file_path, tag, data_type):
"""
Prints the total count of posts or videos for a hashtag. Calls the function get_total_posts as a sanity check that there are no repeated ids in the id lists.
"""
total = get_total_posts(file_path, tag)
if (total.total == total.unique):
print(f"Total {data_type} for the hashtag {tag} are: {total.total}")
@@ -121,5 +147,3 @@ def print_total(file_path, tag, data_type):
else:
print(f"WARNING: out of total {data_type} for the hashtag {tag} {total.total}, only {total.unique} are unique. Something is going wrong...")
return

View File

@@ -4,7 +4,15 @@ import global_data
import data_methods
"""
The file contains the functions that operate on files, such as writing or reading from files etc.
"""
def create_file(name, file_type):
"""
Creates a file or directory.
"""
if (file_type == "dir"):
os.makedirs(name, mode=0o777)
elif (file_type == "file"):
@@ -15,6 +23,9 @@ def create_file(name, file_type):
def check_existence(file_path, file_type):
"""
Checks the existence of a file or a directory. Returns False if it is not found, otherwise returns True.
"""
if (file_type == "file"):
if os.path.isfile(file_path):
return True
@@ -30,24 +41,32 @@ def check_existence(file_path, file_type):
def check_file(file_path, file_type):
"""
Creates the file or directory if it is not found. Otherwise, does nothing.
"""
status = check_existence(file_path, file_type)
if not status:
create_file(file_path, file_type)
create_file(file_path, file_type)
return
def download_posts(settings, tag):
"""
Runs the tiktok-scraper command to download posts for a given hashtag.
Returns the path to the downloaded file of posts. If no file was downloaded, prints the error and returns nothing in order to move on.
os.chdir is used to execute shell commands in the right folders and then reused to come back to the original folder of execution of run_downloader script.
"""
path = os.path.join(settings["data"], tag, settings["posts"])
os.chdir(path)
try:
tiktok_command = f"tiktok-scraper hashtag {tag} -t 'json'"
tiktok_command = f"tiktok-scraper hashtag {tag} -t 'json'"
result = subprocess.run([tiktok_command], capture_output=True, shell=True)
if result.stdout:
new_file = result.stdout.decode('utf-8').split()[-1]
if ("json" in new_file):
os.chdir("../../../tiktok_downloader")
return new_file
return new_file
else:
print(f"ERROR: Something's wrong with what is returned by tiktok-scraper for the hashtag {tag} - *{new_file}* is not a json file!!!!")
os.chdir("../../../tiktok_downloader")
@@ -61,11 +80,16 @@ def download_posts(settings, tag):
def download_videos(settings, tag):
"""
Runs the tiktok-scraper command to download videos for a given hashtag. Note that all the videos returned by the tiktok api are downloaded and, as a result, it's a time- and data-consuming process.
The list of downloaded video ids is constructed and returned if the downloaded folder contains at least 1 video.
os.chdir is used to execute shell commands in the right folders and then reused to come back to the original folder of execution of run_downloader script.
"""
path = os.path.join(settings["data"], tag, settings["videos"])
os.chdir(path)
try:
# tiktok_command = f"tiktok-scraper hashtag {tag} -n {settings['number_of_videos']} -d"
tiktok_command = f"tiktok-scraper hashtag {tag} -d"
# tiktok_command = f"tiktok-scraper hashtag {tag} -n {settings['number_of_videos']} -d"
tiktok_command = f"tiktok-scraper hashtag {tag} -d"
result = subprocess.run([tiktok_command], capture_output=True, shell=True)
if result.stdout:
downloaded_list_tmp = os.listdir(f"./#{tag}")
@@ -74,7 +98,7 @@ def download_videos(settings, tag):
for file in downloaded_list_tmp:
file = file[0:-4]
downloaded_list.append(file)
os.chdir("../../../tiktok_downloader")
return downloaded_list
else:
@@ -85,22 +109,33 @@ def download_videos(settings, tag):
os.chdir("../../../tiktok_downloader")
print(f"WARNING: Something went wrong with the tiktok-scraper video download for the {tag} !!!!")
return
except: raise
def get_data(file_path):
"""
Reads the json file and returns the read data.
"""
with open(file_path, "r") as f:
data = json.load(f)
return data
def dump_data(file_path, data):
"""
Writes the data to the json file.
"""
with open(file_path, "w") as f:
json.dump(data, f)
return
return
def log_writer(log_data):
"""
Creates the dictionary of total downloads (posts and videos) per hashtag.
Example : {timestamp : {hashtag : { videos : number_of_new_videos , posts : number_of_new_posts } } }
Writes the dictionary to the log file (logs/log.json).
"""
total = 0
try:
log_dict = {}
@@ -132,6 +167,9 @@ def log_writer(log_data):
def id_writer(file_path, new_data, tag, status):
"""
Writes the list of new ids to the post_ids or video_ids files.
"""
try:
total = len(new_data)
if status:
@@ -140,7 +178,7 @@ def id_writer(file_path, new_data, tag, status):
if tag in data:
data[tag] += new_data
else:
data[tag]= new_data
data[tag]= new_data
dump_data(file_path, data)
except json.decoder.JSONDecodeError:
data = { tag : new_data }
@@ -155,6 +193,9 @@ def id_writer(file_path, new_data, tag, status):
def post_writer(file_path, new_data, status):
"""
Writes the new posts in the post file of the given hashtag (/data/{hashtag}/posts/data.json)
"""
try:
total = len(new_data)
if status:
@@ -174,6 +215,9 @@ def post_writer(file_path, new_data, status):
def delete_file(file_path, file_type):
"""
Deletes the directory or the file.
"""
if not check_existence(file_path, file_type):
print(f"ERROR: Attempt to delete failed. {file_path} does not exist!!!")
elif (file_type == "file"):
@@ -190,12 +234,16 @@ def delete_file(file_path, file_type):
def clean_video_files(settings, tag, new_data=None):
"""
Moves the new videos from the tiktok-scraper video folder to /data/{hashtag}/videos/
Deletes the residual tiktok-scraper video folder.
"""
try:
if new_data:
for file in new_data:
settings["videos_from"] = settings['data'] + f"/{tag}/videos/#{tag}/{file}.mp4"
subprocess.call(f"mv {settings['videos_from']} {settings['videos_to']}", shell=True)
subprocess.call(f"rm -rf {settings['videos_delete']}", shell=True)
print(f"Successfully deleted the folder {settings['videos_delete']} folder of videos.")
except:

View File

@@ -1,3 +1,8 @@
"""
Contains global constants relating to paths and operational parameters such as sleep time between consecutive tiktok-scraper calls.
"""
# Directories
DATA = "../data"
IDS = "ids"
@@ -37,4 +42,3 @@ PARAMETERS = {
# "number_of_videos" : 3, # Number of videos to be downloaded by tiktok-scraper.
"sleep" : 8
}

View File

@@ -8,6 +8,39 @@ import file_methods
import data_methods
"""
The run_downloader.py downloads data using the tiktok-scraper (https://github.com/drawrowfly/tiktok-scraper).
1. "-p" option is used by the user to download posts only
2. "-v" option is used to download videos only
3. "-p -v" is used to download posts and videos
4. "--h" is used to specify a list of hashtags as arguments
5. "-f" option is used to read the list of hashtags from the user specified file
Example:
1. The command "python3 run_downloader.py --h london paris newyork -p" will download posts for hashtags london, paris and newyork.
2. The command "python3 run_downloader.py -f hashtag_list -p -v" will download posts and videos for hashtags in the file hashtag_list.
The downloaded data is stored in the data folder. The data folder is organized as follows:
1. the log subfolder contains the log.json that records total downloads (posts and videos) for each hashtag with a timestamp of when the script was run.
2. the ids subfolder contains post_ids.json and video_ids.json that keep the record of post and video ids that are currently in the data set. This helps to filter out only new posts every time tiktok-scraper is run and only those new posts (or videos) are then stored in the data folder.
3. Each hashtag has a subfolder by its name containing two subfolders, one each for posts and videos.
This script runs the function get_data in main which in turn triggers the following sequence:
1. get_posts function is triggered if the user wants to download posts
2. get_videos function is triggered if the user wants to download videos
3. both functions above are sequentially triggered if the user wants to download both posts and videos.
4. After the data is downloaded the log_writer is triggered to log the total number of posts and videos downloaded.
------------Files--------------
global_data - contains global constants relating to paths etc.
data_methods - this file contains data processing methods
file_methods - this file contains methods to write and update data in files
hashtag_list - this file contains the list of hashtags that the user wants to download data for.
"""
command = "python3 post_downloader.py "
@@ -37,6 +70,10 @@ def create_parser():
def set_download_settings(download_data_type):
"""
Loads the constants from global_data into the dict called settings and returns it.
Purpose - easy access to global constants by various functions.
"""
settings = {}
settings["data"] = global_data.FILES["data"]
settings["ids"] = global_data.FILES["ids"]
@@ -54,7 +91,6 @@ def set_download_settings(download_data_type):
elif download_data_type == "videos":
settings["videos"] = global_data.FILES["videos"]
settings["video_ids"] = global_data.FILES["video_ids"]
settings["number_of_videos"] = global_data.PARAMETERS["number_of_videos"]
return settings
elif download_data_type == "posts-videos":
settings["posts"] = global_data.FILES["posts"]
@@ -62,7 +98,6 @@ def set_download_settings(download_data_type):
settings["data_file"] = global_data.FILES["data_file"]
settings["videos"] = global_data.FILES["videos"]
settings["video_ids"] = global_data.FILES["video_ids"]
settings["number_of_videos"] = global_data.PARAMETERS["number_of_videos"]
return settings
else:
print(f"ERROR: The download_data_type must be either posts, videos or posts-videos.")
@@ -71,6 +106,11 @@ def set_download_settings(download_data_type):
def get_posts(settings, tag):
"""
1. calls download_posts in file_methods.py to get the posts for a given hashtag
2. calls extract_posts from data_methods.py to extract new posts if any
3. calls update_posts from data_methods.py to update the id-list with the ids of newly downloaded posts.
"""
file_path = file_methods.download_posts(settings, tag)
log = ()
if file_path:
@@ -80,12 +120,18 @@ def get_posts(settings, tag):
data_methods.update_posts(data_file, "file", new_data[1])
log = data_methods.update_posts(settings["post_ids"], "file", new_data[0], tag)
file_methods.delete_file(file_path, "file")
return log
def get_videos(settings, tag):
def get_videos(settings, tag):
"""
1. calls download_videos in file_methods.py to get the videos for a given hashtag
2. calls extract_videos from data_methods.py to extract new videos if any
3. calls update_videos from data_methods.py to update the id-list with the ids of newly downloaded videos.
4. the clean_video_files function deletes the residual video folder after the data processing
"""
log = ()
download_list = file_methods.download_videos(settings, tag)
if download_list:
@@ -100,11 +146,15 @@ def get_videos(settings, tag):
def get_data(hashtags, download_data_type):
"""
The function checks for the user option "-p", "-v" or both and then
triggers the functions get_posts, get_videos or both, respectively.
"""
counter = 0
total_hashtags = len(hashtags)
total_hashtags_offset = total_hashtags - 1
log_data = []
if download_data_type == "posts":
settings = set_download_settings(download_data_type)
while counter < total_hashtags:
@@ -116,7 +166,7 @@ def get_data(hashtags, download_data_type):
log = ( res[0], ( "posts", res[1] ) )
log_data.append(log)
data_methods.print_total(settings["post_ids"], tag, download_data_type)
counter += 1
if counter < total_hashtags_offset:
time.sleep(settings["sleep"])
@@ -132,7 +182,7 @@ def get_data(hashtags, download_data_type):
res = ( res[0], ( "videos", res[1]))
log_data.append(res)
data_methods.print_total(settings["video_ids"], tag, download_data_type)
counter += 1
if counter < total_hashtags_offset:
time.sleep(settings["sleep"])
@@ -154,7 +204,7 @@ def get_data(hashtags, download_data_type):
res = ( res[0], (req[0], res[1]) )
log_data.append(res)
data_methods.print_total(settings[req[1]], tag, req[0])
if req_counter < total_reqs_offset:
time.sleep(settings["sleep"])
req_counter += 1
@@ -169,6 +219,9 @@ def get_data(hashtags, download_data_type):
def get_hashtags(file_name, hashtag_list):
"""
Loads and returns the list of hashtags from user specified file.
"""
try:
from hashtag_list import hashtag_list
return hashtag_list
@@ -184,7 +237,7 @@ if __name__ == "__main__":
if not (args.h or args.f):
parser.error("No hashtags were given, please use either --h option or -f to provide hashtags.")
sys.exit()
if not (args.p or args.v):
parser.error("No argument given, please specify either -p for posts or -v videos or both.")
sys.exit()
@@ -206,8 +259,8 @@ if __name__ == "__main__":
download_data_type = "posts"
else:
download_data_type = "videos"
try:
try:
log_data = get_data(hashtags, download_data_type)
if log_data:
file_methods.log_writer(log_data)