This commit is contained in:
X
2022-02-07 01:32:23 +01:00
7 changed files with 162 additions and 0 deletions

19
README.md Normal file
View File

@@ -0,0 +1,19 @@
# TikTok hashtag analysis toolset
The project provides tools to analyze hashtags based on data downloaded using tiktok-scraper (https://github.com/drawrowfly/tiktok-scraper).
## Pre-requisites
1. Download and install TikTok scraper: https://github.com/drawrowfly/tiktok-scraper
2. Download the posts for a hashtag in **json** format. Example: `tiktok-scraper hashtag tokyo2021 -t 'json'`
## extract_date.py
1. Use the following command: python3 extract_date.py target_file.json hashtag_name
2. The command above runs the extract_date.py script, which extracts each date found in the '.json' file retrieved by the TikTok scraper, together with the number of hashtag posts for that date.
## extract_hashtag.py
1. Use the following command to print the result on the screen: python3 extract_hashtag.py target_file.json n -d
2. Use the following command to plot: python3 extract_hashtag.py target_file.json n -p
3. The command above plots the top **n** hashtag frequencies based on the json file downloaded with tiktok-scraper for a given hashtag. Recommendation: use n <= 10 so that the plot is easy to read and analyze.
## extract_posts.py
1. Use the following command: python3 extract_posts.py target_file.json hashtag_names
2. The command above extracts all the posts for the hashtags given in hashtag_names (separate multiple names with spaces) from the downloaded tiktok-scraper data.

View File

@@ -2,10 +2,13 @@ import os, sys
import csv, json
import argparse
import matplotlib.pyplot as plt
<<<<<<< HEAD
from datetime import datetime
sys.path.insert(0, '../tiktok_downloader')
import file_methods, global_data
=======
>>>>>>> bfa90676f121dd88e070dc134791a596a104e784
@@ -46,6 +49,7 @@ def get_occurrences(filename, n=1 , sort=True):
<<<<<<< HEAD
def plot(n, length, k, v, img_folder):
plt.scatter(k, v)
plt.tight_layout()
@@ -55,6 +59,15 @@ def plot(n, length, k, v, img_folder):
plt.ylabel(f'Number of occurrences')
save_plot(plt, img_folder)
plt.show(block=None)
=======
def plot(n, length, k, v):
plt.scatter(k, v)
plt.tight_layout()
plt.title(f'Hashtag Distribution')
plt.xlabel(f'Top {n} hashtags from {length} posts.')
plt.ylabel(f'Number of occurrences')
plt.show()
>>>>>>> bfa90676f121dd88e070dc134791a596a104e784
return
@@ -71,6 +84,7 @@ def print_occurrences(l, k, v):
return
<<<<<<< HEAD
def save_plot(plt, img_folder):
try:
now = datetime.now()
@@ -85,6 +99,10 @@ def save_plot(plt, img_folder):
if __name__ == "__main__":
img_folder = global_data.IMAGES
file_methods.check_file(img_folder, "dir")
=======
if __name__ == "__main__":
>>>>>>> bfa90676f121dd88e070dc134791a596a104e784
parser = argparse.ArgumentParser()
parser.add_argument("input_file", help="The json hashtag file name")
parser.add_argument("n", help="The number of top n occurrences", type=int)
@@ -100,7 +118,11 @@ if __name__ == "__main__":
path = f"./{base}_sorted_hashtags.csv"
if args.plot:
length, keys, values = get_occurrences(args.input_file, args.n)
<<<<<<< HEAD
plot(args.n, length, keys, values, img_folder)
=======
plot(args.n, length, keys, values)
>>>>>>> bfa90676f121dd88e070dc134791a596a104e784
else:
length, keys, values = get_occurrences(args.input_file, args.n)
print_occurrences(length, keys, values)

View File

@@ -64,8 +64,12 @@ def download_videos(settings, tag):
path = os.path.join(settings["data"], tag, settings["videos"])
os.chdir(path)
try:
<<<<<<< HEAD
# tiktok_command = f"tiktok-scraper hashtag {tag} -n {settings['number_of_videos']} -d"
tiktok_command = f"tiktok-scraper hashtag {tag} -d"
=======
tiktok_command = f"tiktok-scraper hashtag {tag} -n {settings['number_of_videos']} -d"
>>>>>>> bfa90676f121dd88e070dc134791a596a104e784
result = subprocess.run([tiktok_command], capture_output=True, shell=True)
if result.stdout:
downloaded_list_tmp = os.listdir(f"./#{tag}")

View File

@@ -4,7 +4,10 @@ IDS = "ids"
LOG = "log"
POSTS = "posts"
VIDEOS = "videos"
<<<<<<< HEAD
IMAGES = f"{DATA}/img"
=======
>>>>>>> bfa90676f121dd88e070dc134791a596a104e784
# Files
POST_IDS = "post_ids.json"
@@ -19,7 +22,10 @@ FILES = {
"log" : LOG,
"posts" : POSTS,
"videos" : VIDEOS,
<<<<<<< HEAD
"images" : IMAGES,
=======
>>>>>>> bfa90676f121dd88e070dc134791a596a104e784
"post_ids" : f"{DATA}/{IDS}/{POST_IDS}",
"video_ids" : f"{DATA}/{IDS}/{VIDEO_IDS}",
"data_file" : f"{DATA_FILE}",
@@ -32,8 +38,15 @@ FILES = {
# Commands
tag = ""
<<<<<<< HEAD
PARAMETERS = {
"scraper_attempts" : 3,
# "number_of_videos" : 3, # Number of videos to be downloaded by tiktok-scraper.
=======
COMMANDS = {
"number_of_videos" : 3, # Number of videos to be downloaded by tiktok-scraper.
"post_download" : f"tiktok-scraper hashtag {tag} -t 'json'",
"video_download" : f"tiktok-scraper hashtag {tag} -d",
>>>>>>> bfa90676f121dd88e070dc134791a596a104e784
"sleep" : 8
}

View File

@@ -1,7 +1,44 @@
# Hashtag names to process, given as plain strings.
hashtag_list = [
    # This is a sample hashtag list. Please enter your hashtag list (without the comment).
    # "london",
    # "paris",
    # "newyork",
    # "tokyo"
    "uyghur",
    "uyghur2021",
    "uyghur2022",
    "uyghurmuslims",
    "xinjiang",
    "xinjiangchina",
    "xinjiangcotton",
    "xinjiangtravel",
    "uyghurlivesmatter",
    "uighur",
    "Uighurs",
    "Uyghurs",
    "uighuren",
    "saveuyghur",
    "uighurmuslims",
    "chinesemuslim",
    "uyghurpeople",
    "urumqi",
    "chinaxinjiang",
    "xinjianguyghurs",
    "eastturkestan",
    "chinaconcentrationcamp",
    "xinjianguyghur🇨🇳",
    "kashgar",
    "xinjiangreeducationcamps",
    "uyghur_tiktok",
    "uyghurreality",
    "xinjiangdance",
    "westernmedia",
    "uyghurgenocide"
]

View File

@@ -0,0 +1,8 @@
# Hashtag names, given as plain strings.
hashtag_list = [
# This is a sample hashtag list. Replace the entries below with your own hashtag list (without the comment).
"london",
"paris",
"newyork",
"tokyo"
]

View File

@@ -1,7 +1,11 @@
import os, sys
import time
import json
<<<<<<< HEAD
import argparse, importlib
=======
import argparse
>>>>>>> bfa90676f121dd88e070dc134791a596a104e784
import global_data
import file_methods
@@ -11,6 +15,7 @@ import data_methods
command = "python3 post_downloader.py "
<<<<<<< HEAD
def get_hashtag_list(file_name):
try:
f = importlib.import_module(file_name) # exec(f"from {file_name} import hashtag_list")
@@ -19,6 +24,15 @@ def get_hashtag_list(file_name):
except ImportError as error:
print("ImportError: " + str(error))
print(f"Please provide at least one hashtag either by entering as an argument or by adding hashtags to the variable hashtag_list in the file {file_name}")
=======
def get_hashtag_list():
try:
from hashtag_list import hashtag_list
return hashtag_list
except ImportError as error:
print("ImportError: " + str(error))
print(f"Please provide at least one hashtag either by entering as an argument or by adding hashtags to the list hashtag_list in the file hashtag_list.py")
>>>>>>> bfa90676f121dd88e070dc134791a596a104e784
sys.exit()
@@ -29,7 +43,10 @@ def create_parser():
# Adding the arguments
#parser.add_argument("--h", type=str, nargs="*", required=True, help="List of hashtags")
parser.add_argument("--h", type=str, nargs="*", help="List of hashtags")
<<<<<<< HEAD
parser.add_argument("-f", type=str, help="File name with the list of hashtags")
=======
>>>>>>> bfa90676f121dd88e070dc134791a596a104e784
parser.add_argument("-p", action="store_true", help="Download posts")
parser.add_argument("-v", action="store_true", help="Download videos")
@@ -42,25 +59,46 @@ def set_download_settings(download_data_type):
settings["ids"] = global_data.FILES["ids"]
settings["log"] = global_data.FILES["log"]
settings["logger"] = global_data.FILES["logger"]
<<<<<<< HEAD
settings["sleep"] = global_data.PARAMETERS["sleep"]
settings["scraper"] = global_data.PARAMETERS["scraper_attempts"]
=======
settings["sleep"] = global_data.COMMANDS["sleep"]
>>>>>>> bfa90676f121dd88e070dc134791a596a104e784
file_methods.check_file(f"{settings['data']}/{settings['ids']}", "dir")
file_methods.check_file(f"{settings['data']}/{settings['log']}", "dir")
if download_data_type == "posts":
settings["posts"] = global_data.FILES["posts"]
settings["post_ids"] = global_data.FILES["post_ids"]
<<<<<<< HEAD
=======
settings["post_download"] = global_data.COMMANDS["post_download"]
>>>>>>> bfa90676f121dd88e070dc134791a596a104e784
settings["data_file"] = global_data.FILES["data_file"]
return settings
elif download_data_type == "videos":
settings["videos"] = global_data.FILES["videos"]
settings["video_ids"] = global_data.FILES["video_ids"]
<<<<<<< HEAD
=======
settings["video_download"] = global_data.COMMANDS["video_download"]
settings["number_of_videos"] = global_data.COMMANDS["number_of_videos"]
>>>>>>> bfa90676f121dd88e070dc134791a596a104e784
return settings
elif download_data_type == "posts-videos":
settings["posts"] = global_data.FILES["posts"]
settings["post_ids"] = global_data.FILES["post_ids"]
settings["data_file"] = global_data.FILES["data_file"]
<<<<<<< HEAD
settings["videos"] = global_data.FILES["videos"]
settings["video_ids"] = global_data.FILES["video_ids"]
=======
settings["post_download"] = global_data.COMMANDS["post_download"]
settings["videos"] = global_data.FILES["videos"]
settings["video_ids"] = global_data.FILES["video_ids"]
settings["video_download"] = global_data.COMMANDS["video_download"]
settings["number_of_videos"] = global_data.COMMANDS["number_of_videos"]
>>>>>>> bfa90676f121dd88e070dc134791a596a104e784
return settings
else:
print(f"ERROR: The download_data_type must be either posts, videos or posts-videos.")
@@ -92,7 +130,10 @@ def get_videos(settings, tag):
log = data_methods.update_videos(settings, new_data, tag)
else:
file_methods.clean_video_files(settings, tag)
<<<<<<< HEAD
=======
>>>>>>> bfa90676f121dd88e070dc134791a596a104e784
return log
@@ -179,6 +220,7 @@ if __name__ == "__main__":
parser = create_parser()
args = parser.parse_args()
<<<<<<< HEAD
if not (args.h or args.f):
parser.error("No hashtags were given, please use either --h option or -f to provide hashtags.")
sys.exit()
@@ -197,6 +239,23 @@ if __name__ == "__main__":
if not hashtags:
print("No hashtags were given, please use either --h option or -f to provide hashtags.")
sys.exit(0)
=======
if not (args.p or args.v):
parser.error("No argument given, please specify either -p for posts or -v videos or both.")
sys.exit()
if args.h:
hashtags = args.h
else:
hashtags = get_hashtags("hashtag_list", "hashtag_list")
print(hashtags)
if not hashtags:
hashtags = get_hashtag_list()
if not hashtags:
print(f"ERROR: No hashtags found. Please re-run the script with at least one hashtag!!!")
sys.exit(0)
>>>>>>> bfa90676f121dd88e070dc134791a596a104e784
if (args.p and args.v):
download_data_type = "posts-videos"