diff --git a/extract_date.py b/extract_date.py deleted file mode 100644 index 788cba7..0000000 --- a/extract_date.py +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env python3 - -import sys -import json -import datetime -import collections -import matplotlib.pyplot as plt -import matplotlib.dates as mdates - - -if len(sys.argv) < 3: - print(f'ERROR: Please make sure the command line has the following format: python3 extract_date.py hashtag_data.json hashtag') - sys.exit() - - -def list_to_frequency(li): - if li and (type(li) == list): - return collections.Counter(li) - else: - print(f"ERROR: either {li} is empty or not a list.") - - -def eligibility_check(obj): - if not obj: - print(f'ERROR: {obj} is empty!') - return False - elif type(obj) != int: - print(f'ERROR: {obj} is not an integer as is expected!') - return False - else: - return True - -with open(sys.argv[1]) as file: - object = json.load(file) - l = len(object) - date_list = [] - for i in range(0, l): - obj = object[i]["createTime"] - if eligibility_check(obj): - dt_obj = datetime.datetime.fromtimestamp(obj) - date_list.append(dt_obj.date()) - else: - print(f'ERROR: Some error occured. Check {obj}.') - ordered = dict(list_to_frequency(date_list)) - dates = list(ordered.keys()) - total_dates = len(dates) - frequency = list(ordered.values()) - plt.scatter(dates, frequency) - plt.gcf().autofmt_xdate() - date_format = mdates.DateFormatter('%d-%m-%Y') - plt.gca().xaxis.set_major_formatter(date_format) - plt.tight_layout() - plt.title(f'Hashtag Lifecyle - #{sys.argv[2]}') - plt.xlabel(f'Dates ({total_dates} dates out of {l} posts)') - plt.ylabel('Posts') - plt.show() diff --git a/extract_hashtag.py b/extract_hashtag.py deleted file mode 100644 index 0b9e1fc..0000000 --- a/extract_hashtag.py +++ /dev/null @@ -1,87 +0,0 @@ -import os, sys -import csv, json -import argparse -import matplotlib.pyplot as plt - - - -def get_hashtags(obj): - if not obj: - print(f'ERROR: Empty item, no hashtags to be extracted.') - return - else: - hashtags = {} - l = len(obj) - for i in range(l): - for hashtag in obj[i]['hashtags']: - if hashtag['name'] in hashtags: - hashtags[hashtag['name']].add(i) - else: - hashtags[hashtag['name']] = {i} - return hashtags - - -def get_occurances(filename, n=1 , sort=True): - with open(filename) as f: - obj = json.load(f) - l = len(obj) - tags = get_hashtags(obj) - tags = {key: (len(value), value) for (key, value) in tags.items()} - if not sort: - k = list(tags.keys()) - v = list(tags.values()) - return obj, k, v - else: - sorted_tags = {k: v for k,v in sorted(tags.items(), key=lambda item: item[1], reverse=True)} - k = list(sorted_tags.keys()) - v = list(sorted_tags.values()) - k = k[:n] - v_total = [i[0] for i in v] - v_total = v_total[:n] - return l, k, v_total - - - -def plot(n, length, k, v): - plt.scatter(k, v) - plt.tight_layout() - plt.title(f'Hashtag Distribution') - plt.xlabel(f'Top {n} hashtags from {length} posts.') - plt.ylabel(f'Number of occurances') - plt.show() - return - - -def print_occurances(k, v): - row_number = 0 - print(f'Hashtag Occurances') - for key,value in zip(k, v): - print(f'{row_number}\t{key}\t\t{value}') - row_number += 1 - return - - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("input_file", help="The json hashtag file name") - parser.add_argument("n", help="The number of top n occurances", type=int) - parser.add_argument("-p", "--plot", help="Plot the occurances", action="store_true") - parser.add_argument("-d", "--print", help="List top n hashtags", action="store_true") - args = parser.parse_args() - if args.input_file and args.n: - if args.n < 1: - print(f"Please make sure the number of top occurances is a positive integer.") - sys.exit() - - base = os.path.splitext(args.input_file)[0] - path = f"./{base}_sorted_hashtags.csv" - if args.plot: - length, keys, values = get_occurances(args.input_file, args.n) - plot(args.n, length, keys, values) - else: - length, keys, values = get_occurances(args.input_file, args.n) - print_occurances(keys, values) - else: - print(f'ERROR: either {args.input_file} or {args.n} or both contains error.') - diff --git a/extract_posts.py b/extract_posts.py deleted file mode 100644 index 721393b..0000000 --- a/extract_posts.py +++ /dev/null @@ -1,49 +0,0 @@ -import os, sys -from extract_hashtag import get_occurances - - -def filter_positions(hashtags, keys, positions): - filtered = [] - for hashtag in hashtags: - try: - i = keys.index(hashtag) - key = keys[i] - post_indices = positions[i][1] - filtered.append((key, post_indices)) - except Exception as error: - print(error) - continue - return filtered - - -def write_posts(path, obj, filtered): - length = len(filtered) - with open(path, "w") as output_file: - for i in range(length): - hashtag = filtered[i][0] - total_positions = len(filtered[i][1]) - positions = list(filtered[i][1]) - first_position = positions[0] - output_file.write(f"{hashtag}, {obj[first_position]}" + "\n") - for p in range(1, total_positions): - output_file.write(f" , {obj[positions[p]]}" + "\n") - print(f"{total_positions} posts written for the hashtag - {hashtag}") - - -if __name__ == "__main__": - file_name = sys.argv[1] - hashtags = list(sys.argv[2:]) - name = f"{hashtags[0]}_{len(hashtags)}" - path = f"../{name}_posts.csv" - if os.path.exists(path): - print(f'The file {path} containing hashtag occurances already exists. If you would like to run the script afresh, please delete the file {path} and re-run the script.') - sys.exit() - else: - obj, keys, positions = get_occurances(file_name, sort=False) - filtered = filter_positions(hashtags, keys, positions) - if filtered: - write_posts(path, obj, filtered) - else: - print(f"No posts found for the hashtags you entered.") - - diff --git a/top_hashtag_occurances.py b/top_hashtag_occurances.py deleted file mode 100644 index dcee5aa..0000000 --- a/top_hashtag_occurances.py +++ /dev/null @@ -1,83 +0,0 @@ -#!/usr/bin/python3 - -import os, time -import json -import argparse -from datetime import datetime - - -def parser(): - parser = argparse.ArgumentParser() - parser.add_argument("hashtags", help="The hashtags to be processed", nargs="+") - parser.add_argument("top_n", help="Top n occurances for a hashtag", type=int) - args = parser.parse_args() - return args - - -def check_file_existence(hashtag, contains=None): - pwd = "./" - for i in os.listdir(pwd): - #if os.path.isfile(os.path.join(pwd, i)) and hashtag in i: - if hashtag in i and contains in i: - return i - elif hashtag in i: - return i - else: - continue - return - - -def get_input_file(hashtag): - check_file = check_file_existence(hashtag, "json") - if check_file: - return check_file - else: - try: - os.system(f"tiktok-scraper hashtag {hashtag} -t json") - c = check_file_existence(hashtag, "json") - if c: - return c - else: - print(f"ERROR: No json file relating to {hashtag} found.") - except: - raise - - -def copy_data(input_file, output_file): - os.system(f"cat {input_file} >> {output_file} && echo >> {output_file}") - return - - -def get_data(hashtag, n): - input_file = get_input_file(hashtag) - if input_file: - os.system(f"python3 extract_hashtag.py {input_file} {str(n)} -o") - base = os.path.splitext(input_file)[0] - data_file = f"{base}_sorted_hashtags.csv" - if os.path.exists(data_file): - return data_file - return - - -def get_occurances(hashtag, n, output): - data_file = get_data(hashtag, n) - copy_data(data_file, output) - os.system(f"rm {data_file}") - print(f"{data_file} removed ....") - - -if __name__ == "__main__": - args = parser() - hashtags = args.hashtags - now = datetime.now().strftime("%d%m%Y-%H%M%S") - output = f"./{now}.csv" - l = len(hashtags) - if l > 1: - sleep = 30 # Sleep time (in secs) between two tiktok scraping requests. - get_occurances(hashtags[0], args.top_n, output) - for i in range(1, l): - time.sleep(30) - get_occurances(hashtags[i], args.top_n, output) - else: - get_occurances(hashtags[0], args.top_n, output) - print(f"The output data is stored in the file {output}")