Patch for the hashtag tools: extract_hashtag.py gains an argparse command
line (including the -o csv output that top_hashtag_occurances.py relies on),
extract_posts.py reuses get_occurances() for several hashtags, and
top_hashtag_occurances.py is added.  Line breaks and indentation were
rebuilt from a whitespace-collapsed copy, so the whitespace of context and
removed lines should be confirmed against the pre-image files before
applying; the post-image blob ids on the index lines are those of the
original commit.

diff --git a/extract_hashtag.py b/extract_hashtag.py
index 2769e3d..0b9e1fc 100644
--- a/extract_hashtag.py
+++ b/extract_hashtag.py
@@ -1,5 +1,6 @@
 import os, sys
 import csv, json
+import argparse
 import matplotlib.pyplot as plt
 
 
@@ -20,56 +21,79 @@ def get_hashtags(obj):
     return hashtags
 
 
-def create_csv(file_name, path, d):
-    with open(path, "w") as f:
-        f.write(f"Name, Occurances, Positions" + "\n")
-        for key,value in d.items():
-            f.write(f"{key}, {value[0]}, " + f"{value[1]}".replace(",", ";") + "\n")
-    print(f'The sorted hashtag occcurances list is contained in the file {path}.')
-    return None
-
-
-def plot_occurances(file_name, plots):
-    base = os.path.splitext(file_name)[0]
-    path = f"./{base}_sorted_hashtags.csv"
-    if os.path.exists(path):
-        print(f'The file {path} containing hashtag occurances already exists. If you would like to generate a plot, please delete the file {path} and re-run the script.')
-        return
-    else:
-        with open(file_name) as f:
-            obj = json.load(f)
-        l = len(obj)
-        tags = get_hashtags(obj)
-        tags = {key: (len(value), value) for (key, value) in tags.items()}
+def get_occurances(filename, n=1, sort=True):
+    """Load the posts in filename and tally the hashtags they use.
+
+    With sort=False, return (posts, hashtags, values), where each value is
+    the (count, positions) pair built from the get_hashtags() result.
+    Otherwise return (number of posts, top n hashtags, their counts),
+    ordered from the most used hashtag down.
+    """
+    with open(filename) as f:
+        obj = json.load(f)
+    l = len(obj)
+    tags = get_hashtags(obj)
+    tags = {key: (len(value), value) for (key, value) in tags.items()}
+    if not sort:
+        k = list(tags.keys())
+        v = list(tags.values())
+        return obj, k, v
+    else:
         sorted_tags = {k: v for k,v in sorted(tags.items(), key=lambda item: item[1], reverse=True)}
-        create_csv(file_name, path, sorted_tags)
         k = list(sorted_tags.keys())
         v = list(sorted_tags.values())
-        v = [i[0] for i in v]
-        k = k[:plots]
-        v = v[:plots]
-        plt.scatter(k, v)
-        plt.tight_layout()
-        plt.title(f'Hashtag Distribution')
-        plt.xlabel(f'Top {plots} hashtags from {l} posts.')
-        plt.ylabel(f'Number of occurances')
-        plt.show()
-        return
+        k = k[:n]
+        v_total = [i[0] for i in v]
+        v_total = v_total[:n]
+        return l, k, v_total
+
+
+def write_csv(path, k, v):
+    """Write the hashtags k and their counts v as rows of the csv file path."""
+    with open(path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["Name", "Occurances"])
+        writer.writerows(zip(k, v))
+    print(f'The sorted hashtag occurances list is contained in the file {path}.')
+
+
+def plot(n, length, k, v):
+    """Scatter-plot the counts v of the top n hashtags k out of length posts."""
+    plt.scatter(k, v)
+    plt.title('Hashtag Distribution')
+    plt.xlabel(f'Top {n} hashtags from {length} posts.')
+    plt.ylabel('Number of occurances')
+    # Lay out after the title and labels exist so that they are not clipped.
+    plt.tight_layout()
+    plt.show()
+
+
+def print_occurances(k, v):
+    """Print one numbered row per hashtag in k with its count from v."""
+    print('Hashtag Occurances')
+    for row_number, (key, value) in enumerate(zip(k, v)):
+        print(f'{row_number}\t{key}\t\t{value}')
 
 
 if __name__ == "__main__":
-    if len(sys.argv) != 3:
-        print(f'ERROR: Please make sure you enter the following in the command line: python3 file.json n. Where n is a positive integer value and will plot top n hashtags in the number of occurances.')
-        sys.exit()
+    parser = argparse.ArgumentParser()
+    parser.add_argument("input_file", help="The json hashtag file name")
+    parser.add_argument("n", help="The number of top n occurances", type=int)
+    parser.add_argument("-p", "--plot", help="Plot the occurances", action="store_true")
+    parser.add_argument("-d", "--print", help="List top n hashtags", action="store_true")
+    parser.add_argument("-o", "--output", help="Write top n hashtags to a csv file", action="store_true")
+    args = parser.parse_args()
+    if args.n < 1:
+        parser.error("Please make sure the number of top occurances is a positive integer.")
+
+    length, keys, values = get_occurances(args.input_file, args.n)
+    if args.output:
+        # top_hashtag_occurances.py runs this script with -o and then reads this file.
+        base = os.path.splitext(args.input_file)[0]
+        path = f"./{base}_sorted_hashtags.csv"
+        write_csv(path, keys, values)
+    if args.plot:
+        plot(args.n, length, keys, values)
     else:
-        try:
-            int(sys.argv[2])
-        except:
-            print(f'ERROR: Please make sure the number in the command line input: python3 file.json n, is a positive integer.')
-            raise
-
-        try:
-            plot_occurances(sys.argv[1], int(sys.argv[2]))
-        except:
-            print("Unexpected error:", sys.exc_info()[0])
-            raise
+        # Listing is the default action, with or without -d/--print.
+        print_occurances(keys, values)
diff --git a/extract_posts.py b/extract_posts.py
index c4bb435..721393b 100644
--- a/extract_posts.py
+++ b/extract_posts.py
@@ -1,64 +1,57 @@
 import os, sys
-import csv, json
-import re
-from pandas import *
-
-def arg_check():
-    if len(sys.argv) != 3:
-        print(f'ERROR: Please make sure you enter the following in the command line: python3 extract_posts.py file.json hashtag')
-        sys.exit()
-    else:
-        return
-
-def get_hashtag_positions(file_name, hashtag):
-    base = os.path.splitext(file_name)[0]
-    path = f"./{base}_sorted_hashtags.csv"
-    if not os.path.exists(path):
-        print(f'Generating {path} ...')
-        os.system(f'python3 extract_hashtag.py {file_name} {1}')
-
-    return tag_membership(hashtag, path)
+from extract_hashtag import get_occurances
 
 
-def tag_membership(hashtag, path):
-    data = read_csv(path)
-    position_str = list(data[data["Name"] == hashtag].values[:, 2])
-    if position_str:
-        position_str = re.split('{|}', str(position_str))[1]
-        p = position_str.replace(";", ",")
-        positions = [int(s) for s in p.split(",")]
-        return positions
-    else:
-        return
+def filter_positions(hashtags, keys, positions):
+    """Pair each requested hashtag with the post indices recorded for it.
+
+    keys and positions are the parallel lists returned by
+    get_occurances(..., sort=False); hashtags missing from keys are
+    reported and skipped.
+    """
+    index_of = {key: i for i, key in enumerate(keys)}
+    filtered = []
+    for hashtag in hashtags:
+        i = index_of.get(hashtag)
+        if i is None:
+            print(f"{hashtag} not found in the input file, skipping it.")
+            continue
+        # Each entry of positions is a (count, post indices) pair.
+        filtered.append((hashtag, positions[i][1]))
+    return filtered
 
 
-def print_posts(file_name, path, hashtag, positions):
-    with open(file_name) as f:
-        data = json.load(f)
-    posts = []
-    for p in positions:
-        posts.append(data[p])
-    keys = posts[0].keys()
-    with open(path, 'w', newline='') as csv_file:
-        writer = csv.DictWriter(csv_file, keys)
-        writer.writeheader()
-        writer.writerows(posts)
-    print(f'The posts are contained in the file {path}.')
-    return
+def write_posts(path, obj, filtered):
+    """Write the posts of every (hashtag, post indices) pair in filtered to path.
+
+    The hashtag is written in front of its first post only; the following
+    posts of the same hashtag get an empty first column.
+    """
+    with open(path, "w") as output_file:
+        for hashtag, post_indices in filtered:
+            label = hashtag
+            # sorted() keeps the row order stable even if the indices arrive as a set.
+            for position in sorted(post_indices):
+                output_file.write(f"{label}, {obj[position]}" + "\n")
+                label = " "
+            print(f"{len(post_indices)} posts written for the hashtag - {hashtag}")
 
 
 if __name__ == "__main__":
-    arg_check()
+    if len(sys.argv) < 3:
+        print('ERROR: Please make sure you enter the following in the command line: python3 extract_posts.py file.json hashtag [hashtag ...]')
+        sys.exit(1)
     file_name = sys.argv[1]
-    hashtag = sys.argv[2]
-    path = f"./{hashtag}_posts.csv"
+    hashtags = list(sys.argv[2:])
+    name = f"{hashtags[0]}_{len(hashtags)}"
+    path = f"../{name}_posts.csv"
     if os.path.exists(path):
         print(f'The file {path} containing hashtag occurances already exists. If you would like to run the script afresh, please delete the file {path} and re-run the script.')
         sys.exit()
     else:
-        positions = get_hashtag_positions(file_name, hashtag)
-        if positions:
-            print_posts(file_name, path, hashtag, positions)
+        obj, keys, positions = get_occurances(file_name, sort=False)
+        filtered = filter_positions(hashtags, keys, positions)
+        if filtered:
+            write_posts(path, obj, filtered)
         else:
-            print(f'{hashtag} not found!!!!')
-            sys.exit()
+            print(f"No posts found for the hashtags you entered.")
diff --git a/top_hashtag_occurances.py b/top_hashtag_occurances.py
new file mode 100644
index 0000000..dcee5aa
--- /dev/null
+++ b/top_hashtag_occurances.py
@@ -0,0 +1,100 @@
+#!/usr/bin/python3
+
+import os, time
+import json
+import argparse
+import shutil
+import subprocess
+import sys
+from datetime import datetime
+
+
+def parser():
+    """Parse the command line: one or more hashtags followed by top_n."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("hashtags", help="The hashtags to be processed", nargs="+")
+    parser.add_argument("top_n", help="Top n occurances for a hashtag", type=int)
+    args = parser.parse_args()
+    if args.top_n < 1:
+        parser.error("top_n must be a positive integer.")
+    return args
+
+
+def check_file_existence(hashtag, contains=None):
+    """Return the first entry of the current directory whose name matches.
+
+    A name matches when it contains hashtag and, if contains is given,
+    contains that text as well.  Return None when nothing matches.
+    """
+    pwd = "./"
+    for i in os.listdir(pwd):
+        if hashtag in i and (contains is None or contains in i):
+            return i
+    return None
+
+
+def get_input_file(hashtag):
+    """Return the json file of posts for hashtag, scraping it when absent.
+
+    Return None when no such file exists even after running tiktok-scraper.
+    """
+    check_file = check_file_existence(hashtag, "json")
+    if check_file:
+        return check_file
+    try:
+        # An argument list keeps the hashtag away from any shell parsing.
+        subprocess.run(["tiktok-scraper", "hashtag", hashtag, "-t", "json"])
+    except FileNotFoundError:
+        print("ERROR: tiktok-scraper could not be started.")
+        return None
+    c = check_file_existence(hashtag, "json")
+    if not c:
+        print(f"ERROR: No json file relating to {hashtag} found.")
+    return c
+
+
+def copy_data(input_file, output_file):
+    """Append input_file followed by an empty line to output_file."""
+    with open(input_file) as source, open(output_file, "a") as target:
+        shutil.copyfileobj(source, target)
+        target.write("\n")
+
+
+def get_data(hashtag, n):
+    """Run extract_hashtag.py for hashtag and return the csv file it wrote.
+
+    Return None when there is no input file or no csv was produced.
+    """
+    input_file = get_input_file(hashtag)
+    if not input_file:
+        return None
+    subprocess.run([sys.executable, "extract_hashtag.py", input_file, str(n), "-o"])
+    base = os.path.splitext(input_file)[0]
+    data_file = f"{base}_sorted_hashtags.csv"
+    if os.path.exists(data_file):
+        return data_file
+    return None
+
+
+def get_occurances(hashtag, n, output):
+    """Append the top n occurances of hashtag to output, then drop the csv."""
+    data_file = get_data(hashtag, n)
+    if data_file is None:
+        print(f"ERROR: No occurances data produced for {hashtag}.")
+        return
+    copy_data(data_file, output)
+    os.remove(data_file)
+    print(f"{data_file} removed ....")
+
+
+if __name__ == "__main__":
+    args = parser()
+    now = datetime.now().strftime("%d%m%Y-%H%M%S")
+    output = f"./{now}.csv"
+    sleep = 30  # Sleep time (in secs) between two tiktok scraping requests.
+    for i, hashtag in enumerate(args.hashtags):
+        if i:
+            time.sleep(sleep)
+        get_occurances(hashtag, args.top_n, output)
+    if os.path.exists(output):
+        print(f"The output data is stored in the file {output}")