mirror of
https://github.com/bellingcat/tiktok-hashtag-analysis.git
synced 2026-06-12 21:38:30 +03:00
84 lines
2.3 KiB
Python
84 lines
2.3 KiB
Python
#!/usr/bin/python3
|
|
|
|
import os, time
|
|
import json
|
|
import argparse
|
|
from datetime import datetime
|
|
|
|
|
|
def parser():
    """Parse the command-line arguments of the script.

    Two positional arguments are declared: ``hashtags`` (one or more
    values) and ``top_n`` (a single integer).

    Returns:
        The ``argparse.Namespace`` built from ``sys.argv``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "hashtags",
        nargs="+",
        help="The hashtags to be processed",
    )
    cli.add_argument(
        "top_n",
        type=int,
        help="Top n occurances for a hashtag",
    )
    return cli.parse_args()
|
|
|
|
|
|
def check_file_existence(hashtag, contains=None):
    """Return the name of a current-directory entry that matches ``hashtag``.

    Args:
        hashtag: Substring the entry name must contain.
        contains: Optional second substring the name must also contain
            (for example ``"json"``). When ``None`` only ``hashtag`` is
            checked.

    Returns:
        The first matching entry name in ``os.listdir`` order, or ``None``
        when no entry matches.
    """
    for name in os.listdir("./"):
        if hashtag not in name:
            continue
        # Enforce ``contains`` strictly: a name that has the hashtag but
        # lacks the extra substring (e.g. a leftover CSV when a JSON file
        # is wanted) must not be reported as a match.
        if contains is None or contains in name:
            return name
    return None
|
|
|
|
|
|
def get_input_file(hashtag):
    """Return the name of a JSON file for ``hashtag``, scraping it if needed.

    A matching file already present in the current directory is reused.
    Otherwise ``tiktok-scraper hashtag <hashtag> -t json`` is run and the
    directory is checked again.

    Args:
        hashtag: The hashtag to look up or scrape.

    Returns:
        The file name reported by ``check_file_existence``, or ``None`` when
        no JSON file is found after the scraper was attempted (an error line
        is printed in that case).
    """
    # Imported locally so this function does not depend on file-level imports.
    import shutil
    import subprocess

    existing = check_file_existence(hashtag, "json")
    if existing:
        return existing

    scraper = shutil.which("tiktok-scraper")
    if scraper is None:
        print("ERROR: tiktok-scraper executable not found on PATH.")
    else:
        # Pass an argument list (no shell) so that a hashtag containing
        # spaces or shell metacharacters cannot alter the command.
        subprocess.run([scraper, "hashtag", hashtag, "-t", "json"], check=False)

    downloaded = check_file_existence(hashtag, "json")
    if downloaded:
        return downloaded

    print(f"ERROR: No json file relating to {hashtag} found.")
    return None
|
|
|
|
|
|
def copy_data(input_file, output_file):
    """Append the contents of ``input_file`` and a newline to ``output_file``.

    The copy is best-effort: a missing or unreadable input is reported on
    stdout and does not raise.

    Args:
        input_file: Path of the file whose bytes are appended.
        output_file: Path of the file appended to; created if absent.

    Returns:
        None.
    """
    # Imported locally so this function does not depend on file-level imports.
    import shutil

    if not input_file:
        print("ERROR: No input file to copy.")
        return

    try:
        # Binary mode appends the bytes unchanged; opening the source first
        # means a missing input does not create an empty output file.
        with open(input_file, "rb") as source, open(output_file, "ab") as target:
            shutil.copyfileobj(source, target)
            # Separator line between consecutive appended files.
            target.write(b"\n")
    except OSError as error:
        print(f"ERROR: Could not copy {input_file} to {output_file}: {error}")
    return
|
|
|
|
|
|
def get_data(hashtag, n):
    """Run ``extract_hashtag.py`` on the hashtag's input file.

    Args:
        hashtag: The hashtag whose input file is obtained through
            ``get_input_file``.
        n: Value passed as the second argument to ``extract_hashtag.py``.

    Returns:
        The path ``<input stem>_sorted_hashtags.csv`` when that file exists
        after the extraction command has run; ``None`` when there is no
        input file or the CSV is absent.
    """
    source = get_input_file(hashtag)
    if not source:
        return None

    os.system(f"python3 extract_hashtag.py {source} {str(n)} -o")

    stem, _extension = os.path.splitext(source)
    candidate = f"{stem}_sorted_hashtags.csv"
    return candidate if os.path.exists(candidate) else None
|
|
|
|
|
|
def get_occurances(hashtag, n, output):
    """Append the extracted data for ``hashtag`` to ``output``.

    The intermediate CSV returned by ``get_data`` is copied into ``output``
    with ``copy_data`` and then deleted.

    Args:
        hashtag: The hashtag to process.
        n: Passed through to ``get_data``.
        output: Path of the file the data is appended to.

    Returns:
        None.
    """
    data_file = get_data(hashtag, n)
    if not data_file:
        # get_data returns None when no input file or CSV was produced;
        # skip the copy/remove steps instead of operating on "None".
        print(f"ERROR: No hashtag data produced for {hashtag}; nothing appended to {output}.")
        return

    copy_data(data_file, output)
    try:
        os.remove(data_file)
    except OSError as error:
        print(f"WARNING: Could not remove {data_file}: {error}")
    else:
        print(f"{data_file} removed ....")
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: process every hashtag given on the command line
    # and collect the results in one timestamped CSV in the current directory.
    args = parser()
    hashtags = args.hashtags
    now = datetime.now().strftime("%d%m%Y-%H%M%S")
    output = f"./{now}.csv"

    # Sleep time (in secs) between two tiktok scraping requests.
    sleep_seconds = 30

    for index, hashtag in enumerate(hashtags):
        # Pause before every hashtag except the first one.
        if index:
            time.sleep(sleep_seconds)
        get_occurances(hashtag, args.top_n, output)

    print(f"The output data is stored in the file {output}")
|