diff --git a/analytics/hashtag_frequencies.py b/analytics/hashtag_frequencies.py
new file mode 100644
index 0000000..3afd405
--- /dev/null
+++ b/analytics/hashtag_frequencies.py
@@ -0,0 +1,108 @@
+import os, sys
+import csv, json
+import argparse
+import matplotlib.pyplot as plt
+
+
+def get_hashtags(obj):
+    """Map each hashtag name to the set of post indices that use it.
+
+    Args:
+        obj: Sequence of post dicts; each post's ``'hashtags'`` entry is a
+            list of dicts carrying a ``'name'`` key.
+
+    Returns:
+        Dict of hashtag name -> set of indices into ``obj``. An empty dict
+        is returned (after printing an error) when ``obj`` is empty, so
+        callers can iterate the result unconditionally.
+    """
+    if not obj:
+        print(f'ERROR: Empty item, no hashtags to be extracted.')
+        return {}
+
+    hashtags = {}
+    for i, post in enumerate(obj):
+        for hashtag in post['hashtags']:
+            hashtags.setdefault(hashtag['name'], set()).add(i)
+    return hashtags
+
+
+def get_occurrences(filename, n=1, sort=True):
+    """Load posts from ``filename`` and count the posts using each hashtag.
+
+    Args:
+        filename: Path of a JSON file holding a list of post dicts.
+        n: Number of top hashtags to keep when ``sort`` is true.
+        sort: When true, rank hashtags by descending post count.
+
+    Returns:
+        With ``sort`` true: ``(post_count, names, counts)`` for the top ``n``
+        hashtags. With ``sort`` false: ``(posts, names, values)`` where each
+        value is ``(count, post_indices)`` and nothing is truncated.
+    """
+    with open(filename) as f:
+        obj = json.load(f)
+    tags = {key: (len(value), value) for key, value in get_hashtags(obj).items()}
+    if not sort:
+        return obj, list(tags.keys()), list(tags.values())
+
+    # Rank on the count alone: comparing the (count, set) tuples would fall
+    # back to subset comparison of the index sets whenever counts tie.
+    ranked = sorted(tags.items(), key=lambda item: item[1][0], reverse=True)[:n]
+    names = [name for name, _ in ranked]
+    counts = [value[0] for _, value in ranked]
+    return len(obj), names, counts
+
+
+def plot(n, length, k, v):
+    """Show a scatter plot of the counts ``v`` against the hashtag names ``k``.
+
+    ``n`` and ``length`` are only used in the x-axis label.
+    """
+    plt.scatter(k, v)
+    plt.title(f'Hashtag Distribution')
+    plt.xlabel(f'Top {n} hashtags from {length} posts.')
+    plt.ylabel(f'Number of occurrences')
+    # Fit the layout once the title and labels exist so they are included.
+    plt.tight_layout()
+    plt.show()
+
+
+def print_occurrences(l, k, v):
+    """Print a ranked table of hashtags with their count and per-post frequency.
+
+    Args:
+        l: Total number of posts, used as the frequency denominator.
+        k: Hashtag names, best first.
+        v: Occurrence counts aligned with ``k``.
+    """
+    row_format = "{:<8} {:<15} {:<15} {:<15}"
+    print(row_format.format("Rank", 'Hashtag', 'Occurrences', f'Frequency (Occurrences/Total-Posts({l}))'))
+    # Ranks are 1-based so the most frequent hashtag is reported as rank 1.
+    for rank, (key, value) in enumerate(zip(k, v), start=1):
+        print(row_format.format(rank, key, value, value / l))
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("input_file", help="The json hashtag file name")
+    parser.add_argument("n", help="The number of top n occurrences", type=int)
+    parser.add_argument("-p", "--plot", help="Plot the occurrences", action="store_true")
+    parser.add_argument("-d", "--print", help="List top n hashtags", action="store_true")
+    args = parser.parse_args()
+
+    # Validate n on its own: a truthiness test treats n == 0 as "missing"
+    # and never reaches the positivity check.
+    if args.n < 1:
+        print(f"Please make sure the number of top occurrences is a positive integer.")
+        sys.exit(1)
+    if not args.input_file:
+        print(f'ERROR: either {args.input_file} or {args.n} or both contains error.')
+        sys.exit(1)
+
+    # Printing is the default; -p switches to the plot instead.
+    length, keys, values = get_occurrences(args.input_file, args.n)
+    if args.plot:
+        plot(args.n, length, keys, values)
+    else:
+        print_occurrences(length, keys, values)
diff --git a/analytics/logging_analytics.py b/analytics/logging_analytics.py
new file mode 100644
index 0000000..cba8ca5
--- /dev/null
+++ b/analytics/logging_analytics.py
@@ -0,0 +1,4 @@
+"""
+Yet to be written ...
+"""
+
diff --git a/tiktok_downloader/data_methods.py b/tiktok_downloader/data_methods.py
new file mode 100644
index 0000000..c35e2a4
--- /dev/null
+++ b/tiktok_downloader/data_methods.py
@@ -0,0 +1,140 @@
+import os
+from collections import namedtuple
+from datetime import datetime
+import global_data
+import file_methods
+
+
+Difference = namedtuple("Difference", "new_ids size")
+Total = namedtuple("Total", "total unique")
+
+
+def get_difference(tag, file, ids):
+    """Find which of ``ids`` are not yet recorded for ``tag`` in ``file``.
+
+    Args:
+        tag: Hashtag whose recorded ids are compared.
+        file: Path of the JSON id store (looked up as ``store[tag]``).
+        ids: Freshly downloaded ids.
+
+    Returns:
+        ``(result, maiden_entry)``. When ``tag`` is absent from the store,
+        ``result`` is ``ids`` itself and ``maiden_entry`` is True. Otherwise
+        ``maiden_entry`` is False and ``result`` is a ``Difference`` holding
+        the unseen ids, or an empty list when there are none.
+    """
+    current_id_data = file_methods.get_data(file)
+    if tag not in current_id_data:
+        return (ids, True)
+
+    new_ids = list(set(ids).difference(current_id_data[tag]))
+    if not new_ids:
+        return ([], False)
+    return (Difference(new_ids, len(new_ids)), False)
+
+
+def extract_posts(settings, file_name, tag):
+    """Return the ids and posts in ``file_name`` that are new for ``tag``.
+
+    Args:
+        settings: Download settings; ``settings["post_ids"]`` is the id store.
+        file_name: JSON file holding a list of post dicts with an "id" key.
+        tag: Hashtag the posts were downloaded for.
+
+    Returns:
+        ``(ids, posts)`` limited to unseen posts, or None (after printing a
+        warning) when the file has no posts or nothing in it is new.
+    """
+    posts = file_methods.get_data(file_name)
+    ids = [post["id"] for post in posts]
+    if not ids:
+        print(f"WARNING: no posts were found for {tag} in the file - {file_name}")
+        return
+
+    # No id store yet: everything that was downloaded is new.
+    if not file_methods.check_existence(settings["post_ids"], "file"):
+        return (ids, posts)
+
+    diff, maiden_entry = get_difference(tag, settings["post_ids"], ids)
+    if maiden_entry:
+        return (ids, posts)
+    if not diff:
+        print(f"WARNING: No new posts were found in the downloaded file - {file_name}")
+        return
+
+    # Set membership keeps the filter linear instead of ids x posts.
+    wanted = set(diff.new_ids)
+    new_posts = [post for post in posts if post["id"] in wanted]
+    return (diff.new_ids, new_posts)
+
+
+def extract_videos(settings, tag, download_list):
+    """Return the entries of ``download_list`` not yet recorded for ``tag``.
+
+    The whole list is returned when the id store ``settings["video_ids"]``
+    does not exist or has no entry for ``tag``; None is returned (after
+    printing a warning) when every downloaded video is already recorded.
+    """
+    if not file_methods.check_existence(settings["video_ids"], "file"):
+        return download_list
+
+    diff, maiden_entry = get_difference(tag, settings["video_ids"], download_list)
+    if maiden_entry:
+        return download_list
+    if diff:
+        return diff.new_ids
+
+    print(f"WARNING: No new videos were found for the {tag} in the downloaded folder.")
+    return
+
+
+def update_posts(file_path, file_type, new_data, tag=None):
+    """Append ``new_data`` to ``file_path``.
+
+    Without ``tag`` the data is appended as posts and None is returned; with
+    ``tag`` it is appended as ids under that tag and the log entry returned
+    by ``file_methods.id_writer`` is passed back.
+    """
+    status = file_methods.check_existence(file_path, file_type)
+    if not tag:
+        file_methods.post_writer(file_path, new_data, status)
+        return None
+    return file_methods.id_writer(file_path, new_data, tag, status)
+
+
+def update_videos(settings, new_data, tag):
+    """Record ``new_data`` as video ids of ``tag`` and tidy the download folder.
+
+    Returns the log entry produced by ``file_methods.id_writer``.
+    """
+    file_path = settings["video_ids"]
+    file_methods.check_file(file_path, "file")
+    log = file_methods.id_writer(file_path, new_data, tag, True)
+    file_methods.clean_video_files(settings, tag, new_data)
+    return log
+
+
+def get_total_posts(file_path, tag):
+    """Count the ids stored for ``tag`` in the JSON id store ``file_path``.
+
+    Returns:
+        ``Total(total, unique)``.
+
+    Raises:
+        OSError: If ``file_path`` does not exist.
+    """
+    if not file_methods.check_existence(file_path, "file"):
+        # The f-prefix matters: without it the literal text "{file_path}"
+        # is reported instead of the actual path.
+        raise OSError(f"{file_path} not found!")
+    ids = file_methods.get_data(file_path)[tag]
+    return Total(len(ids), len(set(ids)))
+
+
+def print_total(file_path, tag, data_type):
+    """Print how many ids are stored for ``tag``, warning about duplicates."""
+    total = get_total_posts(file_path, tag)
+    if total.total == total.unique:
+        print(f"Total {data_type} for the hashtag {tag} are: {total.total}")
+    else:
+        print(f"WARNING: out of total {data_type} for the hashtag {tag} {total.total}, only {total.unique} are unique. Something is going wrong...")
diff --git a/tiktok_downloader/file_methods.py b/tiktok_downloader/file_methods.py
new file mode 100644
index 0000000..8842f07
--- /dev/null
+++ b/tiktok_downloader/file_methods.py
@@ -0,0 +1,223 @@
+import os, json, subprocess
+import shutil
+from datetime import datetime
+import global_data
+import data_methods
+
+
+def create_file(name, file_type):
+    """Create ``name`` as a directory tree ("dir") or an empty file ("file")."""
+    if file_type == "dir":
+        os.makedirs(name, mode=0o777)
+    elif file_type == "file":
+        with open(name, "w"):
+            pass
+    else:
+        print(f"ERROR: {file_type} is not well defined, it has to be 'dir' or 'file'.")
+
+
+def check_existence(file_path, file_type):
+    """Tell whether ``file_path`` exists as a "file" or a "dir".
+
+    Raises:
+        OSError: If ``file_type`` is neither "file" nor "dir".
+    """
+    if file_type == "file":
+        return os.path.isfile(file_path)
+    if file_type == "dir":
+        return os.path.isdir(file_path)
+    raise OSError(f"{file_type} has to be a 'dir' or a 'file'!!!")
+
+
+def check_file(file_path, file_type):
+    """Create ``file_path`` as ``file_type`` unless it already exists."""
+    if not check_existence(file_path, file_type):
+        create_file(file_path, file_type)
+
+
+def download_posts(settings, tag):
+    """Run tiktok-scraper for ``tag`` and return the JSON file it reports.
+
+    The scraper runs with the tag's posts folder as working directory; the
+    last whitespace-separated token of its stdout is taken as the name of
+    the file it wrote.
+
+    Returns:
+        That token, or None (after printing an error) when the scraper
+        printed nothing or the token does not contain "json".
+    """
+    path = os.path.join(settings["data"], tag, settings["posts"])
+    # Argument list + cwd instead of a shell string + os.chdir: the tag is
+    # never parsed by a shell, and the working directory of this process
+    # stays untouched even when the scraper fails.
+    result = subprocess.run(
+        ["tiktok-scraper", "hashtag", tag, "-t", "json"],
+        capture_output=True,
+        cwd=path,
+    )
+    tokens = result.stdout.decode('utf-8').split()
+    if not tokens:
+        print(f"ERROR: No file was downloaded by the tiktok-scraper for the {tag} !!!!")
+        return
+
+    new_file = tokens[-1]
+    if "json" in new_file:
+        return new_file
+    print(f"ERROR: Something's wrong with what is returned by tiktok-scraper for the hashtag {tag} - *{new_file}* is not a json file!!!!")
+    return
+
+
+def download_videos(settings, tag):
+    """Download videos for ``tag`` and return the downloaded file stems.
+
+    tiktok-scraper runs in the tag's videos folder; the downloaded files
+    are looked up in the ``#<tag>`` sub-folder of that folder.
+
+    Returns:
+        List of file names without their extension, or None (after printing
+        a warning) when the scraper printed nothing or no file was found.
+    """
+    path = os.path.join(settings["data"], tag, settings["videos"])
+    result = subprocess.run(
+        ["tiktok-scraper", "hashtag", tag, "-n", str(settings['number_of_videos']), "-d"],
+        capture_output=True,
+        cwd=path,
+    )
+    if not result.stdout:
+        print(f"WARNING: Something went wrong with the tiktok-scraper video download for the {tag} !!!!")
+        return
+
+    try:
+        downloaded = os.listdir(os.path.join(path, f"#{tag}"))
+    except FileNotFoundError:
+        # Output without a download folder is treated like an empty folder.
+        downloaded = []
+    if downloaded:
+        return [os.path.splitext(name)[0] for name in downloaded]
+
+    print(f"WARNING: No video files were downloaded for the hashtag {tag}.")
+    shutil.rmtree(settings['videos_delete'], ignore_errors=True)
+    return
+
+
+def get_data(file_path):
+    """Return the parsed content of the JSON file ``file_path``."""
+    with open(file_path, "r") as f:
+        return json.load(f)
+
+
+def dump_data(file_path, data):
+    """Overwrite ``file_path`` with ``data`` serialised as JSON."""
+    with open(file_path, "w") as f:
+        json.dump(data, f)
+
+
+def log_writer(log_data):
+    """Store a timestamped summary of ``log_data`` in the log file.
+
+    Args:
+        log_data: Iterable of ``(tag, (data_type, count))`` entries.
+
+    The counts are summed per tag and data type and saved under the current
+    time ("%d-%m-%Y %H:%M:%S") in ``global_data.FILES["logger"]``.
+    """
+    total = 0
+    log_dict = {}
+    for tag, (data_type, count) in log_data:
+        per_tag = log_dict.setdefault(tag, {})
+        per_tag[data_type] = per_tag.get(data_type, 0) + count
+        # Every entry counts towards the reported total, merged or not.
+        total += count
+
+    logger = global_data.FILES["logger"]
+    now_str = datetime.now().strftime("%d-%m-%Y %H:%M:%S")
+    data = get_data(logger) if check_existence(logger, "file") else {}
+    data[now_str] = log_dict
+    dump_data(logger, data)
+    print(f"Successfully logged {total} entries!!!!")
+
+
+def id_writer(file_path, new_data, tag, status):
+    """Add the ids in ``new_data`` under ``tag`` in the JSON id store.
+
+    Args:
+        file_path: Path of the id store.
+        new_data: List of ids to append.
+        tag: Hashtag the ids belong to.
+        status: True when ``file_path`` exists and should be merged into.
+
+    Returns:
+        ``(tag, count)`` log entry, ``count`` being ``len(new_data)``.
+    """
+    total = len(new_data)
+    data = {}
+    if status:
+        try:
+            data = get_data(file_path)
+        except json.decoder.JSONDecodeError:
+            # An existing file without valid JSON is started afresh.
+            data = {}
+    if tag in data:
+        data[tag] += new_data
+    else:
+        data[tag] = new_data
+    dump_data(file_path, data)
+    print(f"SUCCESS - {total} entries added to {file_path}!!!")
+    return (tag, total)
+
+
+def post_writer(file_path, new_data, status):
+    """Append the posts in ``new_data`` to the JSON list in ``file_path``.
+
+    ``status`` is True when the file already exists; an existing file that
+    does not hold valid JSON is overwritten with ``new_data``.
+    """
+    total = len(new_data)
+    data = new_data
+    if status:
+        try:
+            data = get_data(file_path) + new_data
+        except json.decoder.JSONDecodeError:
+            data = new_data
+    dump_data(file_path, data)
+    print(f"SUCCESS - {total} entries added to {file_path}!!!")
+
+
+def delete_file(file_path, file_type):
+    """Delete ``file_path``, a "file" or an empty "dir".
+
+    A missing path is only reported. Any other ``file_type`` makes
+    ``check_existence`` raise OSError.
+    """
+    if not check_existence(file_path, file_type):
+        print(f"ERROR: Attempt to delete failed. {file_path} does not exist!!!")
+    elif file_type == "file":
+        os.remove(file_path)
+        print(f"Successfully deleted {file_path}!!!")
+    else:
+        os.rmdir(file_path)
+        print(f"Successfully deleted {file_path}!!!")
+
+
+def clean_video_files(settings, tag, new_data=None):
+    """Move the new videos of ``tag`` to "videos_to" and drop the scraper folder.
+
+    Args:
+        settings: Provides "data", "videos_to" (destination folder) and
+            "videos_delete" (folder removed afterwards).
+        tag: Hashtag the videos belong to.
+        new_data: File stems (without ".mp4") of the videos to keep; nothing
+            is moved when it is empty or None.
+    """
+    for file in new_data or []:
+        settings["videos_from"] = settings['data'] + f"/{tag}/videos/#{tag}/{file}.mp4"
+        destination = os.path.join(settings['videos_to'], f"{file}.mp4")
+        try:
+            # os.replace overwrites an existing destination like ``mv`` did,
+            # without passing the path through a shell.
+            os.replace(settings["videos_from"], destination)
+        except FileNotFoundError:
+            print(f"WARNING: {settings['videos_from']} was not found and could not be moved.")
+
+    shutil.rmtree(settings['videos_delete'], ignore_errors=True)
+    print(f"Successfully deleted the folder {settings['videos_delete']} folder of videos.")
diff --git a/tiktok_downloader/global_data.py b/tiktok_downloader/global_data.py
new file mode 100644
index 0000000..b83df45
--- /dev/null
+++ b/tiktok_downloader/global_data.py
@@ -0,0 +1,40 @@
+# Directories
+DATA = "../data"
+IDS = "ids"
+LOG = "log"
+POSTS = "posts"
+VIDEOS = "videos"
+
+# Files
+POST_IDS = "post_ids.json"
+VIDEO_IDS = "video_ids.json"
+DATA_FILE = "data.json"
+LOG_FILE = "log.json"
+
+
+FILES = {
+    "data" : DATA,
+    "ids" : IDS,
+    "log" : LOG,
+    "posts" : POSTS,
+    "videos" : VIDEOS,
+    "post_ids" : f"{DATA}/{IDS}/{POST_IDS}",
+    "video_ids" : f"{DATA}/{IDS}/{VIDEO_IDS}",
+    "data_file" : f"{DATA_FILE}",
+    "downloads" : [],
+    "logger" : f"{DATA}/{LOG}/{LOG_FILE}",
+    }
+
+
+
+# Commands
+# NOTE: the f-strings below are evaluated once, at import time, while tag is
+# still "", so the stored command strings do not contain any hashtag.
+tag = ""
+
+COMMANDS = {
+    "number_of_videos" : 3, # Number of videos to be downloaded by tiktok-scraper.
+    "post_download" : f"tiktok-scraper hashtag {tag} -t 'json'",
+    "video_download" : f"tiktok-scraper hashtag {tag} -d",
+    "sleep" : 8
+    }
diff --git a/tiktok_downloader/hashtag_list.py b/tiktok_downloader/hashtag_list.py
new file mode 100644
index 0000000..e595523
--- /dev/null
+++ b/tiktok_downloader/hashtag_list.py
@@ -0,0 +1,37 @@
+hashtag_list = [
+# This is a sample hashtag list. Please enter your hashtag list (without the comment).
+#    "london",
+#    "paris",
+#    "newyork",
+#    "tokyo"
+    "uyghur",
+    "uyghur2021",
+    "uyghur2022",
+    "uyghurmuslims",
+    "xinjiang",
+    "xinjiangchina",
+    "xinjiangcotton",
+    "xinjiangtravel",
+    "uyghurlivesmatter",
+    "uighur",
+    "Uighurs",
+    "Uyghurs",
+    "uighuren",
+    "saveuyghur",
+    "uighurmuslims",
+    "chinesemuslim",
+    "uyghurpeople",
+    "urumqi",
+    "chinaxinjiang",
+    "xinjianguyghurs",
+    "eastturkestan",
+    "chinaconcentrationcamp",
+    "xinjianguyghur🇨🇳",
+    "kashgar",
+    "xinjiangreeducationcamps",
+    "uyghur_tiktok",
+    "uyghurreality",
+    "xinjiangdance",
+    "westernmedia",
+    "uyghurgenocide"
+    ]
diff --git a/tiktok_downloader/hashtag_list_sample.py b/tiktok_downloader/hashtag_list_sample.py
new file mode 100644
index 0000000..4ddff1a
--- /dev/null
+++ b/tiktok_downloader/hashtag_list_sample.py
@@ -0,0 +1,8 @@
+hashtag_list = [
+# This is a sample hashtag list. Please enter your hashtag list (without the comment).
+    "london",
+    "paris",
+    "newyork",
+    "tokyo"
+
+    ]
diff --git a/tiktok_downloader/run_downloader.py b/tiktok_downloader/run_downloader.py
new file mode 100644
index 0000000..d4ccffe
--- /dev/null
+++ b/tiktok_downloader/run_downloader.py
@@ -0,0 +1,192 @@
+import os, sys
+import time
+import json
+import argparse
+
+import global_data
+import file_methods
+import data_methods
+
+
+command = "python3 post_downloader.py "
+
+
+def get_hashtag_list():
+    """Return ``hashtag_list`` from hashtag_list.py, exiting if the import fails."""
+    try:
+        from hashtag_list import hashtag_list
+        return hashtag_list
+    except ImportError as error:
+        print("ImportError: " + str(error))
+        print(f"Please provide at least one hashtag either by entering as an argument or by adding hashtags to the list hashtag_list in the file hashtag_list.py")
+        sys.exit(1)
+
+
+def create_parser():
+    """Build the command line parser (--h hashtags, -p posts, -v videos)."""
+    parser = argparse.ArgumentParser(description="Download the tiktoks for the requested hashtags")
+    parser.add_argument("--h", type=str, nargs="*", help="List of hashtags")
+    parser.add_argument("-p", action="store_true", help="Download posts")
+    parser.add_argument("-v", action="store_true", help="Download videos")
+    return parser
+
+
+def set_download_settings(download_data_type):
+    """Build the settings dict for "posts", "videos" or "posts-videos".
+
+    The ids and log folders below the data folder are created when missing.
+    Any other ``download_data_type`` prints an error and exits.
+    """
+    if download_data_type not in ("posts", "videos", "posts-videos"):
+        print(f"ERROR: The download_data_type must be either posts, videos or posts-videos.")
+        sys.exit(1)
+
+    files, commands = global_data.FILES, global_data.COMMANDS
+    settings = {key: files[key] for key in ("data", "ids", "log", "logger")}
+    settings["sleep"] = commands["sleep"]
+    file_methods.check_file(f"{settings['data']}/{settings['ids']}", "dir")
+    file_methods.check_file(f"{settings['data']}/{settings['log']}", "dir")
+
+    if download_data_type in ("posts", "posts-videos"):
+        settings["posts"] = files["posts"]
+        settings["post_ids"] = files["post_ids"]
+        settings["data_file"] = files["data_file"]
+        settings["post_download"] = commands["post_download"]
+    if download_data_type in ("videos", "posts-videos"):
+        settings["videos"] = files["videos"]
+        settings["video_ids"] = files["video_ids"]
+        settings["video_download"] = commands["video_download"]
+        settings["number_of_videos"] = commands["number_of_videos"]
+    return settings
+
+
+def get_posts(settings, tag):
+    """Download the posts of ``tag`` and store the ones not seen before.
+
+    Returns the log entry of the stored ids, or an empty tuple when nothing
+    was downloaded or nothing was new.
+    """
+    log = ()
+    file_path = file_methods.download_posts(settings, tag)
+    if not file_path:
+        return log
+
+    new_data = data_methods.extract_posts(settings, file_path, tag)
+    if new_data:
+        data_file = os.path.join(settings["data"], tag, settings["posts"], settings["data_file"])
+        data_methods.update_posts(data_file, "file", new_data[1])
+        log = data_methods.update_posts(settings["post_ids"], "file", new_data[0], tag)
+    # The scraper output is only a staging file; remove it either way.
+    file_methods.delete_file(file_path, "file")
+    return log
+
+
+def get_videos(settings, tag):
+    """Download the videos of ``tag`` and record the ones not seen before.
+
+    Returns the log entry of the recorded ids, or an empty tuple when nothing
+    was downloaded or nothing was new.
+    """
+    log = ()
+    download_list = file_methods.download_videos(settings, tag)
+    if download_list:
+        new_data = data_methods.extract_videos(settings, tag, download_list)
+        if new_data:
+            log = data_methods.update_videos(settings, new_data, tag)
+        else:
+            # Nothing new: just drop what the scraper downloaded.
+            file_methods.clean_video_files(settings, tag)
+    return log
+
+
+def get_data(hashtags, download_data_type):
+    """Download the requested data for every hashtag and collect log entries.
+
+    Args:
+        hashtags: Hashtag names to process, in order.
+        download_data_type: "posts", "videos" or "posts-videos".
+
+    Returns:
+        List of ``(tag, (data_type, count))`` entries, one per request that
+        stored new data.
+    """
+    with_posts = download_data_type in ("posts", "posts-videos")
+    with_videos = download_data_type in ("videos", "posts-videos")
+    if not (with_posts or with_videos):
+        print(f"ERROR: The download_data_type must be either posts, videos or posts-videos.")
+        sys.exit(1)
+
+    # (data type, settings key of its id store, download function)
+    requests = []
+    if with_posts:
+        requests.append(("posts", "post_ids", get_posts))
+    if with_videos:
+        requests.append(("videos", "video_ids", get_videos))
+
+    settings = set_download_settings(download_data_type)
+    log_data = []
+    last_tag = len(hashtags) - 1
+    last_request = len(requests) - 1
+    for tag_index, tag in enumerate(hashtags):
+        if with_posts:
+            posts_dir = os.path.join(settings["data"], tag, settings["posts"])
+            file_methods.check_file(posts_dir, "dir")
+            file_methods.check_file(os.path.join(posts_dir, settings["data_file"]), "file")
+        if with_videos:
+            file_methods.check_file(os.path.join(settings["data"], tag, settings["videos"]), "dir")
+            settings["videos_delete"] = settings['data'] + f"/{tag}/videos/#{tag}"
+            settings["videos_to"] = settings['data'] + f"/{tag}/videos"
+
+        for request_index, (data_type, ids_key, download) in enumerate(requests):
+            res = download(settings, tag)
+            if res:
+                log_data.append((res[0], (data_type, res[1])))
+                data_methods.print_total(settings[ids_key], tag, data_type)
+            if request_index < last_request:
+                time.sleep(settings["sleep"])
+
+        # Pause before every following hashtag, but not after the last one.
+        if tag_index < last_tag:
+            time.sleep(settings["sleep"])
+    return log_data
+
+
+def get_hashtags(file_name, hashtag_list):
+    """Return ``hashtag_list`` imported from hashtag_list.py.
+
+    ``file_name`` is only used in the error message; the ``hashtag_list``
+    argument is ignored because the import rebinds that name.
+    """
+    try:
+        from hashtag_list import hashtag_list
+        return hashtag_list
+    except Exception:
+        print(f"ERROR: something went wrong while reading the file {file_name}!")
+        raise
+
+
+if __name__ == "__main__":
+    parser = create_parser()
+    args = parser.parse_args()
+
+    if not (args.p or args.v):
+        # parser.error() prints the message and exits by itself.
+        parser.error("No argument given, please specify either -p for posts or -v videos or both.")
+
+    # Hashtags from the command line win; otherwise use hashtag_list.py.
+    hashtags = args.h or get_hashtag_list()
+    print(hashtags)
+    if not hashtags:
+        print(f"ERROR: No hashtags found. Please re-run the script with at least one hashtag!!!")
+        sys.exit(1)
+
+    if args.p and args.v:
+        download_data_type = "posts-videos"
+    elif args.p:
+        download_data_type = "posts"
+    else:
+        download_data_type = "videos"
+
+    log_data = get_data(hashtags, download_data_type)
+    if log_data:
+        file_methods.log_writer(log_data)