diff --git a/extract_hashtag.py b/extract_hashtag.py index 380c174..2769e3d 100644 --- a/extract_hashtag.py +++ b/extract_hashtag.py @@ -1,71 +1,75 @@ import os, sys import csv, json import matplotlib.pyplot as plt -from collections import Counter, OrderedDict -def get_hashtag_list(obj): + +def get_hashtags(obj): if not obj: print(f'ERROR: Empty item, no hashtags to be extracted.') return else: - hashtag_list = [] - length = len(obj) - for i in range(length): + hashtags = {} + l = len(obj) + for i in range(l): for hashtag in obj[i]['hashtags']: - hashtag_list.append(hashtag['name']) - return hashtag_list + if hashtag['name'] in hashtags: + hashtags[hashtag['name']].add(i) + else: + hashtags[hashtag['name']] = {i} + return hashtags -def create_csv(file_name, d): +def create_csv(file_name, path, d): + with open(path, "w") as f: + f.write(f"Name, Occurances, Positions" + "\n") + for key,value in d.items(): + f.write(f"{key}, {value[0]}, " + f"{value[1]}".replace(",", ";") + "\n") + print(f'The sorted hashtag occcurances list is contained in the file {path}.') + return None + + +def plot_occurances(file_name, plots): base = os.path.splitext(file_name)[0] path = f"./{base}_sorted_hashtags.csv" if os.path.exists(path): - print(f'The file {path} containing hashtag occurances already exists.') - return None + print(f'The file {path} containing hashtag occurances already exists. 
If you would like to generate a plot, please delete the file {path} and re-run the script.') + return else: - with open(path, "w") as f: - f.write(f"Name, Occurances" + "\n") - for key,value in d.items(): - f.write(f"{key}, {value}" + "\n") - print(f'The sorted hashtag occcurances list is contained in the file {path}.') - return None + with open(file_name) as f: + obj = json.load(f) + l = len(obj) + tags = get_hashtags(obj) + tags = {key: (len(value), value) for (key, value) in tags.items()} + sorted_tags = {k: v for k,v in sorted(tags.items(), key=lambda item: item[1], reverse=True)} + create_csv(file_name, path, sorted_tags) + k = list(sorted_tags.keys()) + v = list(sorted_tags.values()) + v = [i[0] for i in v] + k = k[:plots] + v = v[:plots] + plt.scatter(k, v) + plt.tight_layout() + plt.title(f'Hashtag Distribution') + plt.xlabel(f'Top {plots} hashtags from {l} posts.') + plt.ylabel(f'Number of occurances') + plt.show() + return -def plot_hashtag_occurances(file_name, plots): - with open(file_name) as f: - obj = json.load(f) - length = len(obj) - hashtag_list = get_hashtag_list(obj) - hashtags = Counter(hashtag_list).most_common() - hashtags_sorted = {k:v for (k,v) in hashtags} - create_csv(file_name, hashtags_sorted) - k = list(hashtags_sorted.keys()) - v = list(hashtags_sorted.values()) - k = k[:plots] - v = v[:plots] - plt.scatter(k, v) - plt.tight_layout() - plt.title(f'Hashtag Distribution') - plt.xlabel(f'Top {plots} hashtags from {length} posts.') - plt.ylabel(f'Number of occurances') - plt.show() - return - - - -if len(sys.argv) != 3: - print(f'ERROR: Please make sure you enter the following in the command line: python3 file.json n. 
Where n is a positive integer value and will plot top n hashtags in the number of occurances.') - sys.exit() -else: - try: - int(sys.argv[2]) - except: - print(f'ERROR: Please make sure the number in the command line input: python3 file.json n, is a positive integer.') - raise - - try: - plot_hashtag_occurances(sys.argv[1], int(sys.argv[2])) - except: - print("Unexpected error:", sys.exc_info()[0]) - raise +if __name__ == "__main__": + if len(sys.argv) != 3: + print(f'ERROR: Please make sure you enter the following in the command line: python3 file.json n. Where n is a positive integer value and will plot top n hashtags in the number of occurances.') + sys.exit() + else: + try: + int(sys.argv[2]) + except: + print(f'ERROR: Please make sure the number in the command line input: python3 file.json n, is a positive integer.') + raise + + try: + plot_occurances(sys.argv[1], int(sys.argv[2])) + except: + print("Unexpected error:", sys.exc_info()[0]) + raise diff --git a/extract_posts.py b/extract_posts.py new file mode 100644 index 0000000..c4bb435 --- /dev/null +++ b/extract_posts.py @@ -0,0 +1,64 @@ +import os, sys +import csv, json +import re +from pandas import * + +def arg_check(): + if len(sys.argv) != 3: + print(f'ERROR: Please make sure you enter the following in the command line: python3 extract_posts.py file.json hashtag') + sys.exit() + else: + return + +def get_hashtag_positions(file_name, hashtag): + base = os.path.splitext(file_name)[0] + path = f"./{base}_sorted_hashtags.csv" + if not os.path.exists(path): + print(f'Generating {path} ...') + os.system(f'python3 extract_hashtag.py {file_name} {1}') + + return tag_membership(hashtag, path) + + +def tag_membership(hashtag, path): + data = read_csv(path) + position_str = list(data[data["Name"] == hashtag].values[:, 2]) + if position_str: + position_str = re.split('{|}', str(position_str))[1] + p = position_str.replace(";", ",") + positions = [int(s) for s in p.split(",")] + return positions + else: + return + 
def print_posts(file_name, path, hashtag, positions):
    """Write the posts at ``positions`` from ``file_name`` to a CSV at ``path``.

    The header is the union of the keys of all selected posts, in
    first-seen order; a post lacking a column gets an empty cell.

    Args:
        file_name: JSON file containing a list of post dicts.
        path: Destination CSV file (overwritten).
        hashtag: Hashtag the posts were selected for (used in messages only).
        positions: Indices into the JSON list of the posts to export.

    Returns:
        None.
    """
    with open(file_name, encoding="utf-8") as f:
        data = json.load(f)

    posts = [data[p] for p in positions]
    if not posts:
        # Without any post there is no header to derive; write nothing.
        print(f'No posts to write for {hashtag}.')
        return

    # Taking the keys of the first post only makes DictWriter raise
    # ValueError as soon as a later post carries an additional key.
    keys = list(dict.fromkeys(key for post in posts for key in post))
    with open(path, 'w', newline='', encoding="utf-8") as csv_file:
        writer = csv.DictWriter(csv_file, keys)
        writer.writeheader()
        writer.writerows(posts)
    print(f'The posts are contained in the file {path}.')
    return


if __name__ == "__main__":
    arg_check()
    file_name = sys.argv[1]
    hashtag = sys.argv[2]
    path = f"./{hashtag}_posts.csv"
    if os.path.exists(path):
        print(f'The file {path} containing hashtag occurances already exists. '
              f'If you would like to run the script afresh, please delete the file {path} and re-run the script.')
        sys.exit()
    else:
        positions = get_hashtag_positions(file_name, hashtag)
        if positions:
            print_posts(file_name, path, hashtag, positions)
        else:
            print(f'{hashtag} not found!!!!')
            sys.exit()