diff --git a/README.md b/README.md new file mode 100644 index 0000000..fce7f73 --- /dev/null +++ b/README.md @@ -0,0 +1,19 @@ +# TikTok hashtag analysis toolset +This project provides tools to analyze hashtags based on data downloaded using tiktok-scraper (https://github.com/drawrowfly/tiktok-scraper). + +## Prerequisites +1. Download and install tiktok-scraper: https://github.com/drawrowfly/tiktok-scraper +2. Download the posts relating to a hashtag in **json** format. Example: tiktok-scraper hashtag tokyo2021 -t 'json' + +## extract_date.py +1. Use the following command: python3 extract_date.py target_file.json hashtag_name +2. The command above uses the extract_date.py script to extract each date, and the number of posts for the hashtag on that date, from the '.json' file retrieved by tiktok-scraper. + +## extract_hashtag.py +1. Use the following command to print the result on the screen: python3 extract_hashtag.py target_file.json n -d +2. Use the following command to plot: python3 extract_hashtag.py target_file.json n -p +3. The command above plots the top **n** hashtag frequencies, based on the json file downloaded using tiktok-scraper for a given hashtag. Recommendation: keep n <= 10 so the plot is easy to read and analyze. + +## extract_posts.py +1. Use the following command: python3 extract_posts.py target_file.json hashtag_names +2. The command above extracts all the posts for the hashtags given in hashtag_names (separate multiple names with spaces) from the downloaded tiktok-scraper data. 
diff --git a/analytics/hashtag_frequencies.py b/analytics/hashtag_frequencies.py index 79350db..55475eb 100644 --- a/analytics/hashtag_frequencies.py +++ b/analytics/hashtag_frequencies.py @@ -2,10 +2,13 @@ import os, sys import csv, json import argparse import matplotlib.pyplot as plt +<<<<<<< HEAD from datetime import datetime sys.path.insert(0, '../tiktok_downloader') import file_methods, global_data +======= +>>>>>>> bfa90676f121dd88e070dc134791a596a104e784 @@ -46,6 +49,7 @@ def get_occurrences(filename, n=1 , sort=True): +<<<<<<< HEAD def plot(n, length, k, v, img_folder): plt.scatter(k, v) plt.tight_layout() @@ -55,6 +59,15 @@ def plot(n, length, k, v, img_folder): plt.ylabel(f'Number of occurrences') save_plot(plt, img_folder) plt.show(block=None) +======= +def plot(n, length, k, v): + plt.scatter(k, v) + plt.tight_layout() + plt.title(f'Hashtag Distribution') + plt.xlabel(f'Top {n} hashtags from {length} posts.') + plt.ylabel(f'Number of occurrences') + plt.show() +>>>>>>> bfa90676f121dd88e070dc134791a596a104e784 return @@ -71,6 +84,7 @@ def print_occurrences(l, k, v): return +<<<<<<< HEAD def save_plot(plt, img_folder): try: now = datetime.now() @@ -85,6 +99,10 @@ def save_plot(plt, img_folder): if __name__ == "__main__": img_folder = global_data.IMAGES file_methods.check_file(img_folder, "dir") +======= + +if __name__ == "__main__": +>>>>>>> bfa90676f121dd88e070dc134791a596a104e784 parser = argparse.ArgumentParser() parser.add_argument("input_file", help="The json hashtag file name") parser.add_argument("n", help="The number of top n occurrences", type=int) @@ -100,7 +118,11 @@ if __name__ == "__main__": path = f"./{base}_sorted_hashtags.csv" if args.plot: length, keys, values = get_occurrences(args.input_file, args.n) +<<<<<<< HEAD plot(args.n, length, keys, values, img_folder) +======= + plot(args.n, length, keys, values) +>>>>>>> bfa90676f121dd88e070dc134791a596a104e784 else: length, keys, values = get_occurrences(args.input_file, args.n) 
print_occurrences(length, keys, values) diff --git a/tiktok_downloader/file_methods.py b/tiktok_downloader/file_methods.py index 915fb31..58d49c1 100644 --- a/tiktok_downloader/file_methods.py +++ b/tiktok_downloader/file_methods.py @@ -64,8 +64,12 @@ def download_videos(settings, tag): path = os.path.join(settings["data"], tag, settings["videos"]) os.chdir(path) try: +<<<<<<< HEAD # tiktok_command = f"tiktok-scraper hashtag {tag} -n {settings['number_of_videos']} -d" tiktok_command = f"tiktok-scraper hashtag {tag} -d" +======= + tiktok_command = f"tiktok-scraper hashtag {tag} -n {settings['number_of_videos']} -d" +>>>>>>> bfa90676f121dd88e070dc134791a596a104e784 result = subprocess.run([tiktok_command], capture_output=True, shell=True) if result.stdout: downloaded_list_tmp = os.listdir(f"./#{tag}") diff --git a/tiktok_downloader/global_data.py b/tiktok_downloader/global_data.py index 767e322..409d0ee 100644 --- a/tiktok_downloader/global_data.py +++ b/tiktok_downloader/global_data.py @@ -4,7 +4,10 @@ IDS = "ids" LOG = "log" POSTS = "posts" VIDEOS = "videos" +<<<<<<< HEAD IMAGES = f"{DATA}/img" +======= +>>>>>>> bfa90676f121dd88e070dc134791a596a104e784 # Files POST_IDS = "post_ids.json" @@ -19,7 +22,10 @@ FILES = { "log" : LOG, "posts" : POSTS, "videos" : VIDEOS, +<<<<<<< HEAD "images" : IMAGES, +======= +>>>>>>> bfa90676f121dd88e070dc134791a596a104e784 "post_ids" : f"{DATA}/{IDS}/{POST_IDS}", "video_ids" : f"{DATA}/{IDS}/{VIDEO_IDS}", "data_file" : f"{DATA_FILE}", @@ -32,8 +38,15 @@ FILES = { # Commands tag = "" +<<<<<<< HEAD PARAMETERS = { "scraper_attempts" : 3, # "number_of_videos" : 3, # Number of videos to be downloaded by tiktok-scraper. +======= +COMMANDS = { + "number_of_videos" : 3, # Number of videos to be downloaded by tiktok-scraper. 
+ "post_download" : f"tiktok-scraper hashtag {tag} -t 'json'", + "video_download" : f"tiktok-scraper hashtag {tag} -d", +>>>>>>> bfa90676f121dd88e070dc134791a596a104e784 "sleep" : 8 } diff --git a/tiktok_downloader/hashtag_list.py b/tiktok_downloader/hashtag_list.py index 83f38c1..bd44d1c 100644 --- a/tiktok_downloader/hashtag_list.py +++ b/tiktok_downloader/hashtag_list.py @@ -1,7 +1,44 @@ hashtag_list = [ # This is a sample hashtag list. Please enter your hashtag list (without the comment). +<<<<<<< HEAD "london", "paris", "newyork", "tokyo" +======= +# "london", +# "paris", +# "newyork", +# "tokyo" + "uyghur", + "uyghur2021", + "uyghur2022", + "uyghurmuslims", + "xinjiang", + "xinjiangchina", + "xinjiangcotton", + "xinjiangtravel", + "uyghurlivesmatter", + "uighur", + "Uighurs", + "Uyghurs", + "uighuren", + "saveuyghur", + "uighurmuslims", + "chinesemuslim", + "uyghurpeople", + "urumqi", + "chinaxinjiang", + "xinjianguyghurs", + "eastturkestan", + "chinaconcentrationcamp", + "xinjianguyghur🇨🇳", + "kashgar", + "xinjiangreeducationcamps", + "uyghur_tiktok", + "uyghurreality", + "xinjiangdance", + "westernmedia", + "uyghurgenocide" +>>>>>>> bfa90676f121dd88e070dc134791a596a104e784 ] diff --git a/tiktok_downloader/hashtag_list_sample.py b/tiktok_downloader/hashtag_list_sample.py new file mode 100644 index 0000000..4ddff1a --- /dev/null +++ b/tiktok_downloader/hashtag_list_sample.py @@ -0,0 +1,8 @@ +hashtag_list = [ +# This is a sample hashtag list. Please enter your hashtag list (without the comment). 
+ "london", + "paris", + "newyork", + "tokyo" + + ] diff --git a/tiktok_downloader/run_downloader.py b/tiktok_downloader/run_downloader.py index 67e58bd..d53698f 100644 --- a/tiktok_downloader/run_downloader.py +++ b/tiktok_downloader/run_downloader.py @@ -1,7 +1,11 @@ import os, sys import time import json +<<<<<<< HEAD import argparse, importlib +======= +import argparse +>>>>>>> bfa90676f121dd88e070dc134791a596a104e784 import global_data import file_methods @@ -11,6 +15,7 @@ import data_methods command = "python3 post_downloader.py " +<<<<<<< HEAD def get_hashtag_list(file_name): try: f = importlib.import_module(file_name) # exec(f"from {file_name} import hashtag_list") @@ -19,6 +24,15 @@ def get_hashtag_list(file_name): except ImportError as error: print("ImportError: " + str(error)) print(f"Please provide at least one hashtag either by entering as an argument or by adding hashtags to the variable hashtag_list in the file {file_name}") +======= +def get_hashtag_list(): + try: + from hashtag_list import hashtag_list + return hashtag_list + except ImportError as error: + print("ImportError: " + str(error)) + print(f"Please provide at least one hashtag either by entering as an argument or by adding hashtags to the list hashtag_list in the file hashtag_list.py") +>>>>>>> bfa90676f121dd88e070dc134791a596a104e784 sys.exit() @@ -29,7 +43,10 @@ def create_parser(): # Adding the arguments #parser.add_argument("--h", type=str, nargs="*", required=True, help="List of hashtags") parser.add_argument("--h", type=str, nargs="*", help="List of hashtags") +<<<<<<< HEAD parser.add_argument("-f", type=str, help="File name with the list of hashtags") +======= +>>>>>>> bfa90676f121dd88e070dc134791a596a104e784 parser.add_argument("-p", action="store_true", help="Download posts") parser.add_argument("-v", action="store_true", help="Download videos") @@ -42,25 +59,46 @@ def set_download_settings(download_data_type): settings["ids"] = global_data.FILES["ids"] settings["log"] = 
global_data.FILES["log"] settings["logger"] = global_data.FILES["logger"] +<<<<<<< HEAD settings["sleep"] = global_data.PARAMETERS["sleep"] settings["scraper"] = global_data.PARAMETERS["scraper_attempts"] +======= + settings["sleep"] = global_data.COMMANDS["sleep"] +>>>>>>> bfa90676f121dd88e070dc134791a596a104e784 file_methods.check_file(f"{settings['data']}/{settings['ids']}", "dir") file_methods.check_file(f"{settings['data']}/{settings['log']}", "dir") if download_data_type == "posts": settings["posts"] = global_data.FILES["posts"] settings["post_ids"] = global_data.FILES["post_ids"] +<<<<<<< HEAD +======= + settings["post_download"] = global_data.COMMANDS["post_download"] +>>>>>>> bfa90676f121dd88e070dc134791a596a104e784 settings["data_file"] = global_data.FILES["data_file"] return settings elif download_data_type == "videos": settings["videos"] = global_data.FILES["videos"] settings["video_ids"] = global_data.FILES["video_ids"] +<<<<<<< HEAD +======= + settings["video_download"] = global_data.COMMANDS["video_download"] + settings["number_of_videos"] = global_data.COMMANDS["number_of_videos"] +>>>>>>> bfa90676f121dd88e070dc134791a596a104e784 return settings elif download_data_type == "posts-videos": settings["posts"] = global_data.FILES["posts"] settings["post_ids"] = global_data.FILES["post_ids"] settings["data_file"] = global_data.FILES["data_file"] +<<<<<<< HEAD settings["videos"] = global_data.FILES["videos"] settings["video_ids"] = global_data.FILES["video_ids"] +======= + settings["post_download"] = global_data.COMMANDS["post_download"] + settings["videos"] = global_data.FILES["videos"] + settings["video_ids"] = global_data.FILES["video_ids"] + settings["video_download"] = global_data.COMMANDS["video_download"] + settings["number_of_videos"] = global_data.COMMANDS["number_of_videos"] +>>>>>>> bfa90676f121dd88e070dc134791a596a104e784 return settings else: print(f"ERROR: The download_data_type must be either posts, videos or posts-videos.") @@ -92,7 +130,10 
@@ def get_videos(settings, tag): log = data_methods.update_videos(settings, new_data, tag) else: file_methods.clean_video_files(settings, tag) +<<<<<<< HEAD +======= +>>>>>>> bfa90676f121dd88e070dc134791a596a104e784 return log @@ -179,6 +220,7 @@ if __name__ == "__main__": parser = create_parser() args = parser.parse_args() +<<<<<<< HEAD if not (args.h or args.f): parser.error("No hashtags were given, please use either --h option or -f to provide hashtags.") sys.exit() @@ -197,6 +239,23 @@ if __name__ == "__main__": if not hashtags: print("No hashtags were given, please use either --h option or -f to provide hashtags.") sys.exit(0) +======= + if not (args.p or args.v): + parser.error("No argument given, please specify either -p for posts or -v videos or both.") + sys.exit() + + if args.h: + hashtags = args.h + else: + hashtags = get_hashtags("hashtag_list", "hashtag_list") + + print(hashtags) + if not hashtags: + hashtags = get_hashtag_list() + if not hashtags: + print(f"ERROR: No hashtags found. Please re-run the script with at least one hashtag!!!") + sys.exit(0) +>>>>>>> bfa90676f121dd88e070dc134791a596a104e784 if (args.p and args.v): download_data_type = "posts-videos"