From 5c5833421e22e1c03b96b0cb99269b2c29d75f6a Mon Sep 17 00:00:00 2001 From: jowi-tech <72805812+jowi-tech@users.noreply.github.com> Date: Tue, 20 Jul 2021 22:18:15 +0200 Subject: [PATCH 01/23] Add files via upload --- data_processor.sh | 58 +++++++++++++++++++++++++++++++++++++++++++++++ extract_date.py | 56 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 114 insertions(+) create mode 100644 data_processor.sh create mode 100644 extract_date.py diff --git a/data_processor.sh b/data_processor.sh new file mode 100644 index 0000000..e0b4f66 --- /dev/null +++ b/data_processor.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +counter=0 + +function join_lines { + local IFS="$1" + shift + echo "$*" +} + + +while IFS= read -r line || [ -n "$line" ]; do + if [[ -z ${line} ]]; + then + : + elif [[ ${line: -1} != '"' ]]; + then + to_combine[$counter]=$line + let "counter=counter+1" + elif [[ ${line: 0} != '"' && ${line: -1} == '"' ]]; + then + to_combine[$counter]=$line + joined=$(join_lines " " "${to_combine[@]}") + #joined=$(join_lines " " "${to_combine[@]}" | tr -d "\n") # Mac sometimes introduces new lines, tr is used to remove newlines from joined. + echo "$joined" >> tmp.csv + unset to_combine + let "counter=0" + else + echo "$line" >> tmp.csv + fi +done < "$1" + +while IFS= read -r line || [ -n "$line" ]; do + if [[ ${line: 0} == '"' ]]; + then + if [[ -f anomalies_$1 ]]; + then + echo "${line}" >> anomalies_$1 + else + touch anomalies_$1 + echo "${line}" >> anomalies_$1 + fi + else + echo "${line}" >> clean-data_$1 + fi +done < tmp.csv + +rm -f tmp.csv + +if [[ ( -f "anomalies.csv") && ($(tr -d '\n\r\t' < anomalies.csv | wc -c) -eq 0) ]]; +then + anmls=$(wc -l anomalies_$1 | awk '{print $1}') + echo "Anomalies found!!!!! ${anmls} lines of anomalies are recorded in anomalies_$1." +else + input_data=$(wc -l $1 | awk '{print $1}') + clean_lines=$(wc -l clean-data_$1 | awk '{print $1}') + echo "${clean_lines} lines of clean data out of ${input_data} is recorded in clean-data_$1." +fi diff --git a/extract_date.py b/extract_date.py new file mode 100644 index 0000000..788cba7 --- /dev/null +++ b/extract_date.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python3 + +import sys +import json +import datetime +import collections +import matplotlib.pyplot as plt +import matplotlib.dates as mdates + + +if len(sys.argv) < 3: + print(f'ERROR: Please make sure the command line has the following format: python3 extract_date.py hashtag_data.json hashtag') + sys.exit() + + +def list_to_frequency(li): + if li and (type(li) == list): + return collections.Counter(li) + else: + print(f"ERROR: either {li} is empty or not a list.") + + +def eligibility_check(obj): + if not obj: + print(f'ERROR: {obj} is empty!') + return False + elif type(obj) != int: + print(f'ERROR: {obj} is not an integer as is expected!') + return False + else: + return True + +with open(sys.argv[1]) as file: + object = json.load(file) + l = len(object) + date_list = [] + for i in range(0, l): + obj = object[i]["createTime"] + if eligibility_check(obj): + dt_obj = datetime.datetime.fromtimestamp(obj) + date_list.append(dt_obj.date()) + else: + print(f'ERROR: Some error occured. Check {obj}.') + ordered = dict(list_to_frequency(date_list)) + dates = list(ordered.keys()) + total_dates = len(dates) + frequency = list(ordered.values()) + plt.scatter(dates, frequency) + plt.gcf().autofmt_xdate() + date_format = mdates.DateFormatter('%d-%m-%Y') + plt.gca().xaxis.set_major_formatter(date_format) + plt.tight_layout() + plt.title(f'Hashtag Lifecyle - #{sys.argv[2]}') + plt.xlabel(f'Dates ({total_dates} dates out of {l} posts)') + plt.ylabel('Posts') + plt.show() From 287fab2e67e4bb2e91610f4d3fcad9c8da01c829 Mon Sep 17 00:00:00 2001 From: jowi-tech <72805812+jowi-tech@users.noreply.github.com> Date: Tue, 20 Jul 2021 22:26:33 +0200 Subject: [PATCH 02/23] Create README.md --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..301f450 --- /dev/null +++ b/README.md @@ -0,0 +1,6 @@ +# TikTok_plotter +1. Download and install TikTok scraper: https://github.com/drawrowfly/tiktok-scraper +2. Use the following command: python3 extract_date.py target_file.json hashtag + + +The command in point 2 uses the extract_date.py script to extract the dates and the corresponding number of hashtag posts for each date that the TikTok scraper retrieves in the .json file. From 5646067778eed3c3c519142fa6969c979569e68f Mon Sep 17 00:00:00 2001 From: jowi-tech <72805812+jowi-tech@users.noreply.github.com> Date: Tue, 10 Aug 2021 19:06:23 +0200 Subject: [PATCH 03/23] Add files via upload --- extract_hashtag.py | 71 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 extract_hashtag.py diff --git a/extract_hashtag.py b/extract_hashtag.py new file mode 100644 index 0000000..380c174 --- /dev/null +++ b/extract_hashtag.py @@ -0,0 +1,71 @@ +import os, sys +import csv, json +import matplotlib.pyplot as plt +from collections import Counter, OrderedDict + + +def get_hashtag_list(obj): + if not obj: + print(f'ERROR: Empty item, no hashtags to be extracted.') + return + else: + hashtag_list = [] + length = len(obj) + for i in range(length): + for hashtag in obj[i]['hashtags']: + hashtag_list.append(hashtag['name']) + return hashtag_list + + +def create_csv(file_name, d): + base = os.path.splitext(file_name)[0] + path = f"./{base}_sorted_hashtags.csv" + if os.path.exists(path): + print(f'The file {path} containing hashtag occurances already exists.') + return None + else: + with open(path, "w") as f: + f.write(f"Name, Occurances" + "\n") + for key,value in d.items(): + f.write(f"{key}, {value}" + "\n") + print(f'The sorted hashtag occcurances list is contained in the file {path}.') + return None + + +def plot_hashtag_occurances(file_name, plots): + with open(file_name) as f: + obj = json.load(f) + length = len(obj) + hashtag_list = get_hashtag_list(obj) + hashtags = Counter(hashtag_list).most_common() + hashtags_sorted = {k:v for (k,v) in hashtags} + create_csv(file_name, hashtags_sorted) + k = list(hashtags_sorted.keys()) + v = list(hashtags_sorted.values()) + k = k[:plots] + v = v[:plots] + plt.scatter(k, v) + plt.tight_layout() + plt.title(f'Hashtag Distribution') + plt.xlabel(f'Top {plots} hashtags from {length} posts.') + plt.ylabel(f'Number of occurances') + plt.show() + return + + + +if len(sys.argv) != 3: + print(f'ERROR: Please make sure you enter the following in the command line: python3 file.json n. Where n is a positive integer value and will plot top n hashtags in the number of occurances.') + sys.exit() +else: + try: + int(sys.argv[2]) + except: + print(f'ERROR: Please make sure the number in the command line input: python3 file.json n, is a positive integer.') + raise + + try: + plot_hashtag_occurances(sys.argv[1], int(sys.argv[2])) + except: + print("Unexpected error:", sys.exc_info()[0]) + raise From db74373c3d0e5559f7afc79d2e90c7b4d683aa78 Mon Sep 17 00:00:00 2001 From: jowi-tech <72805812+jowi-tech@users.noreply.github.com> Date: Tue, 10 Aug 2021 19:09:00 +0200 Subject: [PATCH 04/23] Update README.md --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 301f450..93d90b4 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,11 @@ # TikTok_plotter +## extract_date.py 1. Download and install TikTok scraper: https://github.com/drawrowfly/tiktok-scraper 2. Use the following command: python3 extract_date.py target_file.json hashtag The command in point 2 uses the extract_date.py script to extract the dates and the corresponding number of hashtag posts for each date that the TikTok scraper retrieves in the .json file. + +## extract_hashtag.py +1. Use the following command: python3 extract_hashtag.py target_file.json n +2. It will plot top n hashtag frequencies. From 29e87b75c338b2683be53aac4bcb650836694977 Mon Sep 17 00:00:00 2001 From: jowi-tech <72805812+jowi-tech@users.noreply.github.com> Date: Wed, 11 Aug 2021 18:34:46 +0200 Subject: [PATCH 05/23] Add files via upload --- extract_hashtag.py | 110 +++++++++++++++++++++++---------------------- extract_posts.py | 64 ++++++++++++++++++++++++++ 2 files changed, 121 insertions(+), 53 deletions(-) create mode 100644 extract_posts.py diff --git a/extract_hashtag.py b/extract_hashtag.py index 380c174..2769e3d 100644 --- a/extract_hashtag.py +++ b/extract_hashtag.py @@ -1,71 +1,75 @@ import os, sys import csv, json import matplotlib.pyplot as plt -from collections import Counter, OrderedDict -def get_hashtag_list(obj): + +def get_hashtags(obj): if not obj: print(f'ERROR: Empty item, no hashtags to be extracted.') return else: - hashtag_list = [] - length = len(obj) - for i in range(length): + hashtags = {} + l = len(obj) + for i in range(l): for hashtag in obj[i]['hashtags']: - hashtag_list.append(hashtag['name']) - return hashtag_list + if hashtag['name'] in hashtags: + hashtags[hashtag['name']].add(i) + else: + hashtags[hashtag['name']] = {i} + return hashtags -def create_csv(file_name, d): +def create_csv(file_name, path, d): + with open(path, "w") as f: + f.write(f"Name, Occurances, Positions" + "\n") + for key,value in d.items(): + f.write(f"{key}, {value[0]}, " + f"{value[1]}".replace(",", ";") + "\n") + print(f'The sorted hashtag occcurances list is contained in the file {path}.') + return None + + +def plot_occurances(file_name, plots): base = os.path.splitext(file_name)[0] path = f"./{base}_sorted_hashtags.csv" if os.path.exists(path): - print(f'The file {path} containing hashtag occurances already exists.') - return None + print(f'The file {path} containing hashtag occurances already exists. If you would like to generate a plot, please delete the file {path} and re-run the script.') + return else: - with open(path, "w") as f: - f.write(f"Name, Occurances" + "\n") - for key,value in d.items(): - f.write(f"{key}, {value}" + "\n") - print(f'The sorted hashtag occcurances list is contained in the file {path}.') - return None + with open(file_name) as f: + obj = json.load(f) + l = len(obj) + tags = get_hashtags(obj) + tags = {key: (len(value), value) for (key, value) in tags.items()} + sorted_tags = {k: v for k,v in sorted(tags.items(), key=lambda item: item[1], reverse=True)} + create_csv(file_name, path, sorted_tags) + k = list(sorted_tags.keys()) + v = list(sorted_tags.values()) + v = [i[0] for i in v] + k = k[:plots] + v = v[:plots] + plt.scatter(k, v) + plt.tight_layout() + plt.title(f'Hashtag Distribution') + plt.xlabel(f'Top {plots} hashtags from {l} posts.') + plt.ylabel(f'Number of occurances') + plt.show() + return -def plot_hashtag_occurances(file_name, plots): - with open(file_name) as f: - obj = json.load(f) - length = len(obj) - hashtag_list = get_hashtag_list(obj) - hashtags = Counter(hashtag_list).most_common() - hashtags_sorted = {k:v for (k,v) in hashtags} - create_csv(file_name, hashtags_sorted) - k = list(hashtags_sorted.keys()) - v = list(hashtags_sorted.values()) - k = k[:plots] - v = v[:plots] - plt.scatter(k, v) - plt.tight_layout() - plt.title(f'Hashtag Distribution') - plt.xlabel(f'Top {plots} hashtags from {length} posts.') - plt.ylabel(f'Number of occurances') - plt.show() - return - - - -if len(sys.argv) != 3: - print(f'ERROR: Please make sure you enter the following in the command line: python3 file.json n. Where n is a positive integer value and will plot top n hashtags in the number of occurances.') - sys.exit() -else: - try: - int(sys.argv[2]) - except: - print(f'ERROR: Please make sure the number in the command line input: python3 file.json n, is a positive integer.') - raise - - try: - plot_hashtag_occurances(sys.argv[1], int(sys.argv[2])) - except: - print("Unexpected error:", sys.exc_info()[0]) - raise +if __name__ == "__main__": + if len(sys.argv) != 3: + print(f'ERROR: Please make sure you enter the following in the command line: python3 file.json n. Where n is a positive integer value and will plot top n hashtags in the number of occurances.') + sys.exit() + else: + try: + int(sys.argv[2]) + except: + print(f'ERROR: Please make sure the number in the command line input: python3 file.json n, is a positive integer.') + raise + + try: + plot_occurances(sys.argv[1], int(sys.argv[2])) + except: + print("Unexpected error:", sys.exc_info()[0]) + raise diff --git a/extract_posts.py b/extract_posts.py new file mode 100644 index 0000000..c4bb435 --- /dev/null +++ b/extract_posts.py @@ -0,0 +1,64 @@ +import os, sys +import csv, json +import re +from pandas import * + +def arg_check(): + if len(sys.argv) != 3: + print(f'ERROR: Please make sure you enter the following in the command line: python3 extract_posts.py file.json hashtag') + sys.exit() + else: + return + +def get_hashtag_positions(file_name, hashtag): + base = os.path.splitext(file_name)[0] + path = f"./{base}_sorted_hashtags.csv" + if not os.path.exists(path): + print(f'Generating {path} ...') + os.system(f'python3 extract_hashtag.py {file_name} {1}') + + return tag_membership(hashtag, path) + + +def tag_membership(hashtag, path): + data = read_csv(path) + position_str = list(data[data["Name"] == hashtag].values[:, 2]) + if position_str: + position_str = re.split('{|}', str(position_str))[1] + p = position_str.replace(";", ",") + positions = [int(s) for s in p.split(",")] + return positions + else: + return + + +def print_posts(file_name, path, hashtag, positions): + with open(file_name) as f: + data = json.load(f) + posts = [] + for p in positions: + posts.append(data[p]) + keys = posts[0].keys() + with open(path, 'w', newline='') as csv_file: + writer = csv.DictWriter(csv_file, keys) + writer.writeheader() + writer.writerows(posts) + print(f'The posts are contained in the file {path}.') + return + + +if __name__ == "__main__": + arg_check() + file_name = sys.argv[1] + hashtag = sys.argv[2] + path = f"./{hashtag}_posts.csv" + if os.path.exists(path): + print(f'The file {path} containing hashtag occurances already exists. If you would like to run the script afresh, please delete the file {path} and re-run the script.') + sys.exit() + else: + positions = get_hashtag_positions(file_name, hashtag) + if positions: + print_posts(file_name, path, hashtag, positions) + else: + print(f'{hashtag} not found!!!!') + sys.exit() From 28b6ede51f68f2ead2aad530708cab31224ccc40 Mon Sep 17 00:00:00 2001 From: jowi-tech <72805812+jowi-tech@users.noreply.github.com> Date: Wed, 11 Aug 2021 18:39:10 +0200 Subject: [PATCH 06/23] Update README.md --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 93d90b4..e6aa515 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,6 @@ # TikTok_plotter +The project provides tools to analyze hashtags within posts scraped from TikTok. + ## extract_date.py 1. Download and install TikTok scraper: https://github.com/drawrowfly/tiktok-scraper 2. Use the following command: python3 extract_date.py target_file.json hashtag @@ -9,3 +11,6 @@ The command in point 2 uses the extract_date.py script to extract the dates and ## extract_hashtag.py 1. Use the following command: python3 extract_hashtag.py target_file.json n 2. It will plot top n hashtag frequencies. + +## extract_posts.py +1. Use the following command: python3 extract_posts.py target_file.json hashtag From 2d6adc0028309b5687bdfc147d36be1955c920cf Mon Sep 17 00:00:00 2001 From: jowi-tech <72805812+jowi-tech@users.noreply.github.com> Date: Wed, 11 Aug 2021 18:59:39 +0200 Subject: [PATCH 07/23] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e6aa515..111e944 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# TikTok_plotter +# tiktok hashtag analysis toolset The project provides tools to analyze hashtags within posts scraped from TikTok. ## extract_date.py From 30f9bd9b2776d04af6118bd0f0a7b99b98519cba Mon Sep 17 00:00:00 2001 From: jowi-tech <72805812+jowi-tech@users.noreply.github.com> Date: Wed, 11 Aug 2021 19:00:47 +0200 Subject: [PATCH 08/23] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 111e944..a5febe0 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ The command in point 2 uses the extract_date.py script to extract the dates and ## extract_hashtag.py 1. Use the following command: python3 extract_hashtag.py target_file.json n -2. It will plot top n hashtag frequencies. +2. It will plot top n hashtag frequencies. Recommendation n < = 10 for easy to read and analyze. ## extract_posts.py 1. Use the following command: python3 extract_posts.py target_file.json hashtag From f3172f6d1c3002079afc452fb8443537fbf4af75 Mon Sep 17 00:00:00 2001 From: jowi-tech <72805812+jowi-tech@users.noreply.github.com> Date: Wed, 11 Aug 2021 19:10:04 +0200 Subject: [PATCH 09/23] Update README.md --- README.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index a5febe0..3baf449 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,11 @@ # tiktok hashtag analysis toolset -The project provides tools to analyze hashtags within posts scraped from TikTok. +The project provides tools to analyze hashtags based on data downloaded using tiktok-scraper (https://github.com/drawrowfly/tiktok-scraper). + +## Pre-conditions +1. Download and install TikTok scraper: https://github.com/drawrowfly/tiktok-scraper ## extract_date.py -1. Download and install TikTok scraper: https://github.com/drawrowfly/tiktok-scraper -2. Use the following command: python3 extract_date.py target_file.json hashtag - +1. Use the following command: python3 extract_date.py target_file.json hashtag_name The command in point 2 uses the extract_date.py script to extract the dates and the corresponding number of hashtag posts for each date that the TikTok scraper retrieves in the .json file. @@ -13,4 +14,4 @@ The command in point 2 uses the extract_date.py script to extract the dates and 2. It will plot top n hashtag frequencies. Recommendation n < = 10 for easy to read and analyze. ## extract_posts.py -1. Use the following command: python3 extract_posts.py target_file.json hashtag +1. Use the following command: python3 extract_posts.py target_file.json hashtag_name From 91b68cb54ec0fd1d81729cd29b4da6cc078bcff9 Mon Sep 17 00:00:00 2001 From: jowi-tech <72805812+jowi-tech@users.noreply.github.com> Date: Wed, 11 Aug 2021 19:13:57 +0200 Subject: [PATCH 10/23] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 3baf449..19155a0 100644 --- a/README.md +++ b/README.md @@ -6,12 +6,12 @@ The project provides tools to analyze hashtags based on data downloaded using ti ## extract_date.py 1. Use the following command: python3 extract_date.py target_file.json hashtag_name - -The command in point 2 uses the extract_date.py script to extract the dates and the corresponding number of hashtag posts for each date that the TikTok scraper retrieves in the .json file. +2. The command above uses the extract_date.py script to extract the dates and the corresponding number of hashtag posts for each date that the TikTok scraper retrieves in the .json file. ## extract_hashtag.py 1. Use the following command: python3 extract_hashtag.py target_file.json n -2. It will plot top n hashtag frequencies. Recommendation n < = 10 for easy to read and analyze. +2. The command above will plot top n hashtag frequencies based on the json file downloaded using tiktok scraper for a given hashtag. Recommendation n < = 10 for easy to read and analyze. ## extract_posts.py 1. Use the following command: python3 extract_posts.py target_file.json hashtag_name +2. The command above pulls out all the posts for the hashtag hashtag_name from the downloaded tiktok scraper data. From 1172aa1792e12aebe7703f337b87d9fcd2be3234 Mon Sep 17 00:00:00 2001 From: johannawild <72805812+johannawild@users.noreply.github.com> Date: Wed, 11 Aug 2021 20:04:10 +0200 Subject: [PATCH 11/23] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 19155a0..1d9a6b6 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,7 @@ The project provides tools to analyze hashtags based on data downloaded using ti ## Pre-conditions 1. Download and install TikTok scraper: https://github.com/drawrowfly/tiktok-scraper +2. Download posts relating to a hashtag in json format. Example: tiktok-scraper tokyo2021 -t 'json' ## extract_date.py 1. Use the following command: python3 extract_date.py target_file.json hashtag_name From 39ae6ff2d2b670946b703c9e4ba8e548425e1a64 Mon Sep 17 00:00:00 2001 From: johannawild <72805812+johannawild@users.noreply.github.com> Date: Wed, 11 Aug 2021 20:05:35 +0200 Subject: [PATCH 12/23] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 1d9a6b6..7686432 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,11 @@ The project provides tools to analyze hashtags based on data downloaded using ti ## Pre-conditions 1. Download and install TikTok scraper: https://github.com/drawrowfly/tiktok-scraper -2. Download posts relating to a hashtag in json format. Example: tiktok-scraper tokyo2021 -t 'json' +2. Download posts relating to a hashtag in **json** format. Example: tiktok-scraper tokyo2021 -t 'json' ## extract_date.py 1. Use the following command: python3 extract_date.py target_file.json hashtag_name -2. The command above uses the extract_date.py script to extract the dates and the corresponding number of hashtag posts for each date that the TikTok scraper retrieves in the .json file. +2. The command above uses the extract_date.py script to extract the dates and the corresponding number of hashtag posts for each date that the TikTok scraper retrieves in the '.json' file. ## extract_hashtag.py 1. Use the following command: python3 extract_hashtag.py target_file.json n From 2d32677dc9a17afe416ee1219efb46dd60e24520 Mon Sep 17 00:00:00 2001 From: johannawild <72805812+johannawild@users.noreply.github.com> Date: Wed, 11 Aug 2021 20:05:54 +0200 Subject: [PATCH 13/23] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7686432..401207e 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ The project provides tools to analyze hashtags based on data downloaded using ti ## Pre-conditions 1. Download and install TikTok scraper: https://github.com/drawrowfly/tiktok-scraper -2. Download posts relating to a hashtag in **json** format. Example: tiktok-scraper tokyo2021 -t 'json' +2. Download posts relating to a hashtag in the **json** format. Example: tiktok-scraper tokyo2021 -t 'json' ## extract_date.py 1. Use the following command: python3 extract_date.py target_file.json hashtag_name From 64568058f36392f04de2360aca578bb08e97abf2 Mon Sep 17 00:00:00 2001 From: johannawild <72805812+johannawild@users.noreply.github.com> Date: Wed, 11 Aug 2021 20:06:27 +0200 Subject: [PATCH 14/23] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 401207e..464e3ef 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ The project provides tools to analyze hashtags based on data downloaded using ti ## Pre-conditions 1. Download and install TikTok scraper: https://github.com/drawrowfly/tiktok-scraper -2. Download posts relating to a hashtag in the **json** format. Example: tiktok-scraper tokyo2021 -t 'json' +2. Download posts relating to a hashtag in the **json** format. Example: tiktok-scraper hashtag tokyo2021 -t 'json' ## extract_date.py 1. Use the following command: python3 extract_date.py target_file.json hashtag_name From f4f66bfd21f6ca3afc0c1fc4a8f76d9dde1833e9 Mon Sep 17 00:00:00 2001 From: johannawild <72805812+johannawild@users.noreply.github.com> Date: Wed, 11 Aug 2021 20:06:56 +0200 Subject: [PATCH 15/23] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 464e3ef..4727506 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ The project provides tools to analyze hashtags based on data downloaded using ti ## extract_hashtag.py 1. Use the following command: python3 extract_hashtag.py target_file.json n -2. The command above will plot top n hashtag frequencies based on the json file downloaded using tiktok scraper for a given hashtag. Recommendation n < = 10 for easy to read and analyze. +2. The command above will plot top **n** hashtag frequencies based on the json file downloaded using tiktok scraper for a given hashtag. Recommendation n < = 10 for easy to read and analyze. ## extract_posts.py 1. Use the following command: python3 extract_posts.py target_file.json hashtag_name From b429c4a88abcc8a712a00d34fbd5ceb4070827d5 Mon Sep 17 00:00:00 2001 From: johannawild <72805812+johannawild@users.noreply.github.com> Date: Wed, 11 Aug 2021 20:07:29 +0200 Subject: [PATCH 16/23] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4727506..2461537 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # tiktok hashtag analysis toolset The project provides tools to analyze hashtags based on data downloaded using tiktok-scraper (https://github.com/drawrowfly/tiktok-scraper). -## Pre-conditions +## Pre-requisites 1. Download and install TikTok scraper: https://github.com/drawrowfly/tiktok-scraper 2. Download posts relating to a hashtag in the **json** format. Example: tiktok-scraper hashtag tokyo2021 -t 'json' From d6d325b31a7470d1fc83f20068b81756b3e818ef Mon Sep 17 00:00:00 2001 From: johannawild <72805812+johannawild@users.noreply.github.com> Date: Wed, 11 Aug 2021 20:08:47 +0200 Subject: [PATCH 17/23] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2461537..213f933 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# tiktok hashtag analysis toolset +# TikTok hashtag analysis toolset The project provides tools to analyze hashtags based on data downloaded using tiktok-scraper (https://github.com/drawrowfly/tiktok-scraper). ## Pre-requisites From 37998dd7769fa8469a66d1bbaac764e38e793158 Mon Sep 17 00:00:00 2001 From: johannawild <72805812+johannawild@users.noreply.github.com> Date: Mon, 25 Oct 2021 13:52:46 +0200 Subject: [PATCH 18/23] Add files via upload --- extract_hashtag.py | 104 +++++++++++++++++++++----------------- extract_posts.py | 85 +++++++++++++------------------ top_hashtag_occurances.py | 83 ++++++++++++++++++++++++++++++ 3 files changed, 176 insertions(+), 96 deletions(-) create mode 100644 top_hashtag_occurances.py diff --git a/extract_hashtag.py b/extract_hashtag.py index 2769e3d..0b9e1fc 100644 --- a/extract_hashtag.py +++ b/extract_hashtag.py @@ -1,5 +1,6 @@ import os, sys import csv, json +import argparse import matplotlib.pyplot as plt @@ -20,56 +21,67 @@ def get_hashtags(obj): return hashtags -def create_csv(file_name, path, d): - with open(path, "w") as f: - f.write(f"Name, Occurances, Positions" + "\n") - for key,value in d.items(): - f.write(f"{key}, {value[0]}, " + f"{value[1]}".replace(",", ";") + "\n") - print(f'The sorted hashtag occcurances list is contained in the file {path}.') - return None - - -def plot_occurances(file_name, plots): - base = os.path.splitext(file_name)[0] - path = f"./{base}_sorted_hashtags.csv" - if os.path.exists(path): - print(f'The file {path} containing hashtag occurances already exists. If you would like to generate a plot, please delete the file {path} and re-run the script.') - return - else: - with open(file_name) as f: - obj = json.load(f) - l = len(obj) - tags = get_hashtags(obj) - tags = {key: (len(value), value) for (key, value) in tags.items()} +def get_occurances(filename, n=1 , sort=True): + with open(filename) as f: + obj = json.load(f) + l = len(obj) + tags = get_hashtags(obj) + tags = {key: (len(value), value) for (key, value) in tags.items()} + if not sort: + k = list(tags.keys()) + v = list(tags.values()) + return obj, k, v + else: sorted_tags = {k: v for k,v in sorted(tags.items(), key=lambda item: item[1], reverse=True)} - create_csv(file_name, path, sorted_tags) k = list(sorted_tags.keys()) v = list(sorted_tags.values()) - v = [i[0] for i in v] - k = k[:plots] - v = v[:plots] - plt.scatter(k, v) - plt.tight_layout() - plt.title(f'Hashtag Distribution') - plt.xlabel(f'Top {plots} hashtags from {l} posts.') - plt.ylabel(f'Number of occurances') - plt.show() - return + k = k[:n] + v_total = [i[0] for i in v] + v_total = v_total[:n] + return l, k, v_total + + + +def plot(n, length, k, v): + plt.scatter(k, v) + plt.tight_layout() + plt.title(f'Hashtag Distribution') + plt.xlabel(f'Top {n} hashtags from {length} posts.') + plt.ylabel(f'Number of occurances') + plt.show() + return + + +def print_occurances(k, v): + row_number = 0 + print(f'Hashtag Occurances') + for key,value in zip(k, v): + print(f'{row_number}\t{key}\t\t{value}') + row_number += 1 + return + if __name__ == "__main__": - if len(sys.argv) != 3: - print(f'ERROR: Please make sure you enter the following in the command line: python3 file.json n. Where n is a positive integer value and will plot top n hashtags in the number of occurances.') - sys.exit() + parser = argparse.ArgumentParser() + parser.add_argument("input_file", help="The json hashtag file name") + parser.add_argument("n", help="The number of top n occurances", type=int) + parser.add_argument("-p", "--plot", help="Plot the occurances", action="store_true") + parser.add_argument("-d", "--print", help="List top n hashtags", action="store_true") + args = parser.parse_args() + if args.input_file and args.n: + if args.n < 1: + print(f"Please make sure the number of top occurances is a positive integer.") + sys.exit() + + base = os.path.splitext(args.input_file)[0] + path = f"./{base}_sorted_hashtags.csv" + if args.plot: + length, keys, values = get_occurances(args.input_file, args.n) + plot(args.n, length, keys, values) + else: + length, keys, values = get_occurances(args.input_file, args.n) + print_occurances(keys, values) else: - try: - int(sys.argv[2]) - except: - print(f'ERROR: Please make sure the number in the command line input: python3 file.json n, is a positive integer.') - raise - - try: - plot_occurances(sys.argv[1], int(sys.argv[2])) - except: - print("Unexpected error:", sys.exc_info()[0]) - raise + print(f'ERROR: either {args.input_file} or {args.n} or both contains error.') + diff --git a/extract_posts.py b/extract_posts.py index c4bb435..721393b 100644 --- a/extract_posts.py +++ b/extract_posts.py @@ -1,64 +1,49 @@ import os, sys -import csv, json -import re -from pandas import * - -def arg_check(): - if len(sys.argv) != 3: - print(f'ERROR: Please make sure you enter the following in the command line: python3 extract_posts.py file.json hashtag') - sys.exit() - else: - return - -def get_hashtag_positions(file_name, hashtag): - base = os.path.splitext(file_name)[0] - path = f"./{base}_sorted_hashtags.csv" - if not os.path.exists(path): - print(f'Generating {path} ...') - os.system(f'python3 extract_hashtag.py {file_name} {1}') - - return tag_membership(hashtag, path) +from extract_hashtag import get_occurances -def tag_membership(hashtag, path): - data = read_csv(path) - position_str = list(data[data["Name"] == hashtag].values[:, 2]) - if position_str: - position_str = re.split('{|}', str(position_str))[1] - p = position_str.replace(";", ",") - positions = [int(s) for s in p.split(",")] - return positions - else: - return +def filter_positions(hashtags, keys, positions): + filtered = [] + for hashtag in hashtags: + try: + i = keys.index(hashtag) + key = keys[i] + post_indices = positions[i][1] + filtered.append((key, post_indices)) + except Exception as error: + print(error) + continue + return filtered -def print_posts(file_name, path, hashtag, positions): - with open(file_name) as f: - data = json.load(f) - posts = [] - for p in positions: - posts.append(data[p]) - keys = posts[0].keys() - with open(path, 'w', newline='') as csv_file: - writer = csv.DictWriter(csv_file, keys) - writer.writeheader() - writer.writerows(posts) - print(f'The posts are contained in the file {path}.') - return +def write_posts(path, obj, filtered): + length = len(filtered) + with open(path, "w") as output_file: + for i in range(length): + hashtag = filtered[i][0] + total_positions = len(filtered[i][1]) + positions = list(filtered[i][1]) + first_position = positions[0] + output_file.write(f"{hashtag}, {obj[first_position]}" + "\n") + for p in range(1, total_positions): + output_file.write(f" , {obj[positions[p]]}" + "\n") + print(f"{total_positions} posts written for the hashtag - {hashtag}") if __name__ == "__main__": - arg_check() file_name = sys.argv[1] - hashtag = sys.argv[2] - path = f"./{hashtag}_posts.csv" + hashtags = list(sys.argv[2:]) + name = f"{hashtags[0]}_{len(hashtags)}" + path = f"../{name}_posts.csv" if os.path.exists(path): print(f'The file {path} containing hashtag occurances already exists. If you would like to run the script afresh, please delete the file {path} and re-run the script.') sys.exit() else: - positions = get_hashtag_positions(file_name, hashtag) - if positions: - print_posts(file_name, path, hashtag, positions) + obj, keys, positions = get_occurances(file_name, sort=False) + filtered = filter_positions(hashtags, keys, positions) + if filtered: + write_posts(path, obj, filtered) else: - print(f'{hashtag} not found!!!!') - sys.exit() + print(f"No posts found for the hashtags you entered.") + + diff --git a/top_hashtag_occurances.py b/top_hashtag_occurances.py new file mode 100644 index 0000000..dcee5aa --- /dev/null +++ b/top_hashtag_occurances.py @@ -0,0 +1,83 @@ +#!/usr/bin/python3 + +import os, time +import json +import argparse +from datetime import datetime + + +def parser(): + parser = argparse.ArgumentParser() + parser.add_argument("hashtags", help="The hashtags to be processed", nargs="+") + parser.add_argument("top_n", help="Top n occurances for a hashtag", type=int) + args = parser.parse_args() + return args + + +def check_file_existence(hashtag, contains=None): + pwd = "./" + for i in os.listdir(pwd): + #if os.path.isfile(os.path.join(pwd, i)) and hashtag in i: + if hashtag in i and contains in i: + return i + elif hashtag in i: + return i + else: + continue + return + + +def get_input_file(hashtag): + check_file = check_file_existence(hashtag, "json") + if check_file: + return check_file + else: + try: + os.system(f"tiktok-scraper hashtag {hashtag} -t json") + c = check_file_existence(hashtag, "json") + if c: + return c + else: + print(f"ERROR: No json file relating to {hashtag} found.") + except: + raise + + +def copy_data(input_file, output_file): + os.system(f"cat {input_file} >> {output_file} && echo >> {output_file}") + return + + +def get_data(hashtag, n): + input_file = get_input_file(hashtag) + if input_file: + os.system(f"python3 extract_hashtag.py {input_file} {str(n)} -o") + base = os.path.splitext(input_file)[0] + data_file = f"{base}_sorted_hashtags.csv" + if os.path.exists(data_file): + return data_file + return + + +def get_occurances(hashtag, n, output): + data_file = get_data(hashtag, n) + copy_data(data_file, output) + os.system(f"rm {data_file}") + print(f"{data_file} removed ....") + + +if __name__ == "__main__": + args = parser() + hashtags = args.hashtags + now = datetime.now().strftime("%d%m%Y-%H%M%S") + output = f"./{now}.csv" + l = len(hashtags) + if l > 1: + sleep = 30 # Sleep time (in secs) between two tiktok scraping requests. + get_occurances(hashtags[0], args.top_n, output) + for i in range(1, l): + time.sleep(30) + get_occurances(hashtags[i], args.top_n, output) + else: + get_occurances(hashtags[0], args.top_n, output) + print(f"The output data is stored in the file {output}") From 8f9427a5f81f24725653b457e8e16d23e64ae3bd Mon Sep 17 00:00:00 2001 From: johannawild <72805812+johannawild@users.noreply.github.com> Date: Mon, 25 Oct 2021 14:16:18 +0200 Subject: [PATCH 19/23] Update README.md --- README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 213f933..fce7f73 100644 --- a/README.md +++ b/README.md @@ -10,9 +10,10 @@ The project provides tools to analyze hashtags based on data downloaded using ti 2. The command above uses the extract_date.py script to extract the dates and the corresponding number of hashtag posts for each date that the TikTok scraper retrieves in the '.json' file. ## extract_hashtag.py -1. Use the following command: python3 extract_hashtag.py target_file.json n -2. The command above will plot top **n** hashtag frequencies based on the json file downloaded using tiktok scraper for a given hashtag. Recommendation n < = 10 for easy to read and analyze. +1. Use the following command to print the result on the screen: python3 extract_hashtag.py target_file.json n -d +2. Use the following command to plot: python3 extract_hashtag.py target_file.json n -p +3. The command above will plot top **n** hashtag frequencies based on the json file downloaded using tiktok scraper for a given hashtag. Recommendation n < = 10 for easy to read and analyze. ## extract_posts.py -1. Use the following command: python3 extract_posts.py target_file.json hashtag_name -2. The command above pulls out all the posts for the hashtag hashtag_name from the downloaded tiktok scraper data. +1. Use the following command: python3 extract_posts.py target_file.json hashtag_names +2. The command above pulls out all the posts for the hashtag hashtag_names (enter multiple names with space) from the downloaded tiktok scraper data. From 2d3f4a9aab2a1839a55e9a7c89166e6fad8a107a Mon Sep 17 00:00:00 2001 From: johannawild <72805812+johannawild@users.noreply.github.com> Date: Mon, 25 Oct 2021 14:16:33 +0200 Subject: [PATCH 20/23] Create README.md corrections From 2a34e03dc83249c46e8908fc02f785cc13ea2de2 Mon Sep 17 00:00:00 2001 From: X Date: Sun, 30 Jan 2022 13:51:08 +0100 Subject: [PATCH 21/23] rebase --- analytics/hashtag_frequencies.py | 90 ++++++++++ analytics/logging_analytics.py | 4 + tiktok_downloader/data_methods.py | 123 +++++++++++++ tiktok_downloader/file_methods.py | 201 +++++++++++++++++++++ tiktok_downloader/global_data.py | 38 ++++ tiktok_downloader/hashtag_list.py | 37 ++++ tiktok_downloader/hashtag_list_sample.py | 8 + tiktok_downloader/run_downloader.py | 212 +++++++++++++++++++++++ 8 files changed, 713 insertions(+) create mode 100644 analytics/hashtag_frequencies.py create mode 100644 analytics/logging_analytics.py create mode 100644 tiktok_downloader/data_methods.py create mode 100644 tiktok_downloader/file_methods.py create mode 100644 tiktok_downloader/global_data.py create mode 100644 tiktok_downloader/hashtag_list.py create mode 100644 tiktok_downloader/hashtag_list_sample.py create mode 100644 tiktok_downloader/run_downloader.py diff --git a/analytics/hashtag_frequencies.py b/analytics/hashtag_frequencies.py new file mode 100644 index 0000000..3afd405 --- /dev/null +++ b/analytics/hashtag_frequencies.py @@ -0,0 +1,90 @@ +import os, sys +import csv, json +import argparse +import matplotlib.pyplot as plt + + + +def get_hashtags(obj): + if not obj: + print(f'ERROR: Empty item, no hashtags to be extracted.') + return + else: + hashtags = {} + l = len(obj) + for i in range(l): + for hashtag in obj[i]['hashtags']: + if hashtag['name'] in hashtags: + hashtags[hashtag['name']].add(i) + else: + hashtags[hashtag['name']] = {i} + return hashtags + + +def get_occurrences(filename, n=1 , sort=True): + with open(filename) as f: + obj = json.load(f) + l = len(obj) + tags = get_hashtags(obj) + tags = {key: (len(value), value) for (key, value) in tags.items()} + if not sort: + k = list(tags.keys()) + v = list(tags.values()) + return obj, k, v + else: + sorted_tags = {k: v for k,v in sorted(tags.items(), key=lambda item: item[1], reverse=True)} + k = list(sorted_tags.keys()) + v = list(sorted_tags.values()) + k = k[:n] + v_total = [i[0] for i in v] + v_total = v_total[:n] + return l, k, v_total + + + +def plot(n, length, k, v): + plt.scatter(k, v) + plt.tight_layout() + plt.title(f'Hashtag Distribution') + plt.xlabel(f'Top {n} hashtags from {length} posts.') + plt.ylabel(f'Number of occurrences') + plt.show() + return + + +def print_occurrences(l, k, v): + row_number = 0 + total_posts = l + print ("{:<8} {:<15} {:<15} {:<15}".format("Rank", 'Hashtag','Occurrences',f'Frequency (Occurrences/Total-Posts({l}))')) + #print(f'Hashtag Occurrences Frequency(Occurances/Total-Posts)') + for key,value in zip(k, v): + ratio = value/total_posts + print ("{:<8} {:<15} {:<15} {:<15}".format(row_number, key, value, ratio)) + #print(f'{row_number}\t{key}\t\t{value}\t\t{ratio:.3f}') + row_number += 1 + return + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("input_file", help="The json hashtag file name") + parser.add_argument("n", help="The number of top n occurrences", type=int) + parser.add_argument("-p", "--plot", help="Plot the occurrences", action="store_true") + parser.add_argument("-d", "--print", help="List top n hashtags", action="store_true") + args = parser.parse_args() + if args.input_file and args.n: + if args.n < 1: + print(f"Please make sure the number of top occurrences is a positive integer.") + sys.exit() + + base = os.path.splitext(args.input_file)[0] + path = f"./{base}_sorted_hashtags.csv" + if args.plot: + length, keys, values = get_occurrences(args.input_file, args.n) + plot(args.n, length, keys, values) + else: + length, keys, values = get_occurrences(args.input_file, args.n) + print_occurrences(length, keys, values) + else: + print(f'ERROR: either {args.input_file} or {args.n} or both contains error.') diff --git a/analytics/logging_analytics.py b/analytics/logging_analytics.py new file mode 100644 index 0000000..cba8ca5 --- /dev/null +++ b/analytics/logging_analytics.py @@ -0,0 +1,4 @@ +""" +Yet to be written ... +""" + diff --git a/tiktok_downloader/data_methods.py b/tiktok_downloader/data_methods.py new file mode 100644 index 0000000..c35e2a4 --- /dev/null +++ b/tiktok_downloader/data_methods.py @@ -0,0 +1,123 @@ +import os +from collections import namedtuple +from datetime import datetime +import global_data +import file_methods + + +Difference = namedtuple("Difference", "new_ids size") +Total = namedtuple("Total", "total unique") + + +def get_difference(tag, file, ids): + maiden_entry = False + current_id_data = file_methods.get_data(file) + if tag in current_id_data: + current_ids = current_id_data[tag] + set1 = set(current_ids) + set2 = set(ids) + new_ids = set2.difference(set1) + if new_ids: + new_ids = list(new_ids) + size = len(new_ids) + diff = Difference(new_ids, size) + return (diff, maiden_entry) + else: + return ([], maiden_entry) + else: + maiden_entry = True + return (ids, maiden_entry) + + +def extract_posts(settings, file_name, tag): + ids = [] + posts = [] + new_posts = [] + + posts = file_methods.get_data(file_name) + for post in posts: + ids.append(post["id"]) + if not ids: + print(f"WARNING: no posts were found for {tag} in the file - {file_name}") + return + + status = file_methods.check_existence(settings["post_ids"], "file") + if not status: + new_data = (ids, posts) + return new_data + else: + res = get_difference(tag, settings["post_ids"], ids) + if res[1]: + new_data = (ids, posts) + return new_data + else: + if res[0]: + for i in res[0].new_ids: + for post in posts: + if (i == post["id"]): + new_posts.append(post) + new_data = (res[0].new_ids, new_posts) + return new_data + else: + print(f"WARNING: No new posts were found in the downloaded file - {file_name}") + return + + +def extract_videos(settings, tag, download_list): + status = file_methods.check_existence(settings["video_ids"], "file") + if not status: + new_data = download_list + return new_data + else: + res = get_difference(tag, settings["video_ids"], download_list) + if res[1]: + return download_list + else: + if res[0]: + new_data = res[0].new_ids + return new_data + else: + print(f"WARNING: No new videos were found for the {tag} in the downloaded folder.") + return + + +def update_posts(file_path, file_type, new_data, tag=None): + try: + status = file_methods.check_existence(file_path, file_type) + if not tag: + file_methods.post_writer(file_path, new_data, status) + else: + log = file_methods.id_writer(file_path, new_data, tag, status) + return log + except: + raise + + +def update_videos(settings, new_data, tag): + file_path = settings["video_ids"] + file_methods.check_file(file_path, "file") + log = file_methods.id_writer(file_path, new_data, tag, True) + file_methods.clean_video_files(settings, tag, new_data) + return log + + +def get_total_posts(file_path, tag): + status = file_methods.check_existence(file_path, "file") + if not status: + raise OSError("{file_path} not found!") + else: + data = file_methods.get_data(file_path) + total = len(data[tag]) + unique = len(set(data[tag])) + total = Total(total, unique) + return total + + +def print_total(file_path, tag, data_type): + total = get_total_posts(file_path, tag) + if (total.total == total.unique): + print(f"Total {data_type} for the hashtag {tag} are: {total.total}") + return + else: + print(f"WARNING: out of total {data_type} for the hashtag {tag} {total.total}, only {total.unique} are unique. Something is going wrong...") + return diff --git a/tiktok_downloader/file_methods.py b/tiktok_downloader/file_methods.py new file mode 100644 index 0000000..8842f07 --- /dev/null +++ b/tiktok_downloader/file_methods.py @@ -0,0 +1,201 @@ +import os, json, subprocess +from datetime import datetime +import global_data +import data_methods + + +def create_file(name, file_type): + if (file_type == "dir"): + os.makedirs(name, mode=0o777) + elif (file_type == "file"): + with open(name, "w"): pass + else: + print(f"ERROR: either {file_type} or is not well defined.") + return + + +def check_existence(file_path, file_type): + if (file_type == "file"): + if os.path.isfile(file_path): + return True + else: + return False + elif (file_type == "dir"): + if os.path.isdir(file_path): + return True + else: + return False + else: + raise OSError(f"{file_type} has to be a 'dir' or a 'file'!!!") + + +def check_file(file_path, file_type): + status = check_existence(file_path, file_type) + if not status: + create_file(file_path, file_type) + + return + + +def download_posts(settings, tag): + path = os.path.join(settings["data"], tag, settings["posts"]) + os.chdir(path) + try: + tiktok_command = f"tiktok-scraper hashtag {tag} -t 'json'" + result = subprocess.run([tiktok_command], capture_output=True, shell=True) + if result.stdout: + new_file = result.stdout.decode('utf-8').split()[-1] + if ("json" in new_file): + os.chdir("../../../tiktok_downloader") + return new_file + else: + print(f"ERROR: Something's wrong with what is returned by tiktok-scraper for the hashtag {tag} - *{new_file}* is not a json file!!!!") + os.chdir("../../../tiktok_downloader") + return + else: + os.chdir("../../../tiktok_downloader") + print(f"ERROR: No file was downloaded by the tiktok-scraper for the {tag} !!!!") + return + except: raise + + + +def download_videos(settings, tag): + path = os.path.join(settings["data"], tag, settings["videos"]) + os.chdir(path) + try: + tiktok_command = f"tiktok-scraper hashtag {tag} -n {settings['number_of_videos']} -d" + result = subprocess.run([tiktok_command], capture_output=True, shell=True) + if result.stdout: + downloaded_list_tmp = os.listdir(f"./#{tag}") + if downloaded_list_tmp: + downloaded_list = [] + for file in downloaded_list_tmp: + file = file[0:-4] + downloaded_list.append(file) + + os.chdir("../../../tiktok_downloader") + return downloaded_list + else: + print(f"WARNING: No video files were downloaded for the hashtag {tag}.") + os.chdir("../../../tiktok_downloader") + subprocess.call(f"rm -rf {settings['videos_delete']}", shell=True) + else: + os.chdir("../../../tiktok_downloader") + print(f"WARNING: Something went wrong with the tiktok-scraper video download for the {tag} !!!!") + return + + except: raise + + +def get_data(file_path): + with open(file_path, "r") as f: + data = json.load(f) + return data + + +def dump_data(file_path, data): + with open(file_path, "w") as f: + json.dump(data, f) + return + +def log_writer(log_data): + total = 0 + try: + log_dict = {} + for ele in log_data: + if ele[0] in log_dict: + if ele[1][0] in log_dict[ele[0]]: + log_dict[ele[0]][ele[1][0]] += ele[1][1] + else: + log_dict[ele[0]][ele[1][0]] = ele[1][1] + total += ele[1][1] + else: + log_dict[ele[0]] = { ele[1][0] : ele[1][1] } + total += ele[1][1] + + logger = global_data.FILES["logger"] + now = datetime.now() + now_str = now.strftime("%d-%m-%Y %H:%M:%S") + status = check_existence(logger, "file") + if status: + data = get_data(logger) + data[now_str] = log_dict + dump_data(logger, data) + else: + data = { now_str : log_dict } + dump_data(logger, data) + print(f"Successfully logged {total} entries!!!!") + return + except: raise + + +def id_writer(file_path, new_data, tag, status): + try: + total = len(new_data) + if status: + try: + data = get_data(file_path) + if tag in data: + data[tag] += new_data + else: + data[tag]= new_data + dump_data(file_path, data) + except json.decoder.JSONDecodeError: + data = { tag : new_data } + dump_data(file_path, data) + else: + data = { tag : new_data } + dump_data(file_path, data) + print(f"SUCCESS - {total} entries added to {file_path}!!!") + log_data = (tag, total) + return log_data + except: raise + + +def post_writer(file_path, new_data, status): + try: + total = len(new_data) + if status: + try: + data = get_data(file_path) + data += new_data + dump_data(file_path, data) + except json.decoder.JSONDecodeError: + data = new_data + dump_data(file_path, data) + else: + data = new_data + dump_data(file_path, data) + print(f"SUCCESS - {total} entries added to {file_path}!!!") + return + except: raise + + +def delete_file(file_path, file_type): + if not check_existence(file_path, file_type): + print(f"ERROR: Attempt to delete failed. {file_path} does not exist!!!") + elif (file_type == "file"): + os.remove(file_path) + print(f"Successfully deleted {file_path}!!!") + return + elif (file_type == "dir"): + os.rmdir(file_path) + print(f"Successfully deleted {file_path}!!!") + return + else: + print(f"ERROR: {file_type} needs to be either 'file' or 'dir' !!!") + return + + +def clean_video_files(settings, tag, new_data=None): + try: + if new_data: + for file in new_data: + settings["videos_from"] = settings['data'] + f"/{tag}/videos/#{tag}/{file}.mp4" + subprocess.call(f"mv {settings['videos_from']} {settings['videos_to']}", shell=True) + + subprocess.call(f"rm -rf {settings['videos_delete']}", shell=True) + print(f"Successfully deleted the folder {settings['videos_delete']} folder of videos.") + except: + raise diff --git a/tiktok_downloader/global_data.py b/tiktok_downloader/global_data.py new file mode 100644 index 0000000..b83df45 --- /dev/null +++ b/tiktok_downloader/global_data.py @@ -0,0 +1,38 @@ +# Directories +DATA = "../data" +IDS = "ids" +LOG = "log" +POSTS = "posts" +VIDEOS = "videos" + +# Files +POST_IDS = "post_ids.json" +VIDEO_IDS = "video_ids.json" +DATA_FILE = "data.json" +LOG_FILE = "log.json" + + +FILES = { + "data" : DATA, + "ids" : IDS, + "log" : LOG, + "posts" : POSTS, + "videos" : VIDEOS, + "post_ids" : f"{DATA}/{IDS}/{POST_IDS}", + "video_ids" : f"{DATA}/{IDS}/{VIDEO_IDS}", + "data_file" : f"{DATA_FILE}", + "downloads" : [], + "logger" : f"{DATA}/{LOG}/{LOG_FILE}", + } + + + +# Commands +tag = "" + +COMMANDS = { + "number_of_videos" : 3, # Number of videos to be downloaded by tiktok-scraper. + "post_download" : f"tiktok-scraper hashtag {tag} -t 'json'", + "video_download" : f"tiktok-scraper hashtag {tag} -d", + "sleep" : 8 + } diff --git a/tiktok_downloader/hashtag_list.py b/tiktok_downloader/hashtag_list.py new file mode 100644 index 0000000..e595523 --- /dev/null +++ b/tiktok_downloader/hashtag_list.py @@ -0,0 +1,37 @@ +hashtag_list = [ +# This is a sample hashtag list. Please enter your hashtag list (without the comment). +# "london", +# "paris", +# "newyork", +# "tokyo" + "uyghur", + "uyghur2021", + "uyghur2022", + "uyghurmuslims", + "xinjiang", + "xinjiangchina", + "xinjiangcotton", + "xinjiangtravel", + "uyghurlivesmatter", + "uighur", + "Uighurs", + "Uyghurs", + "uighuren", + "saveuyghur", + "uighurmuslims", + "chinesemuslim", + "uyghurpeople", + "urumqi", + "chinaxinjiang", + "xinjianguyghurs", + "eastturkestan", + "chinaconcentrationcamp", + "xinjianguyghur🇨🇳", + "kashgar", + "xinjiangreeducationcamps", + "uyghur_tiktok", + "uyghurreality", + "xinjiangdance", + "westernmedia", + "uyghurgenocide" + ] diff --git a/tiktok_downloader/hashtag_list_sample.py b/tiktok_downloader/hashtag_list_sample.py new file mode 100644 index 0000000..4ddff1a --- /dev/null +++ b/tiktok_downloader/hashtag_list_sample.py @@ -0,0 +1,8 @@ +hashtag_list = [ +# This is a sample hashtag list. Please enter your hashtag list (without the comment). + "london", + "paris", + "newyork", + "tokyo" + + ] diff --git a/tiktok_downloader/run_downloader.py b/tiktok_downloader/run_downloader.py new file mode 100644 index 0000000..d4ccffe --- /dev/null +++ b/tiktok_downloader/run_downloader.py @@ -0,0 +1,212 @@ +import os, sys +import time +import json +import argparse + +import global_data +import file_methods +import data_methods + + + +command = "python3 post_downloader.py " + +def get_hashtag_list(): + try: + from hashtag_list import hashtag_list + return hashtag_list + except ImportError as error: + print("ImportError: " + str(error)) + print(f"Please provide at least one hashtag either by entering as an argument or by adding hashtags to the list hashtag_list in the file hashtag_list.py") + sys.exit() + + +def create_parser(): + # Creating the parser + parser = argparse.ArgumentParser(description="Download the tiktoks for the requested hashtags") + + # Adding the arguments + #parser.add_argument("--h", type=str, nargs="*", required=True, help="List of hashtags") + parser.add_argument("--h", type=str, nargs="*", help="List of hashtags") + parser.add_argument("-p", action="store_true", help="Download posts") + parser.add_argument("-v", action="store_true", help="Download videos") + + return parser + + +def set_download_settings(download_data_type): + settings = {} + settings["data"] = global_data.FILES["data"] + settings["ids"] = global_data.FILES["ids"] + settings["log"] = global_data.FILES["log"] + settings["logger"] = global_data.FILES["logger"] + settings["sleep"] = global_data.COMMANDS["sleep"] + file_methods.check_file(f"{settings['data']}/{settings['ids']}", "dir") + file_methods.check_file(f"{settings['data']}/{settings['log']}", "dir") + if download_data_type == "posts": + settings["posts"] = global_data.FILES["posts"] + settings["post_ids"] = global_data.FILES["post_ids"] + settings["post_download"] = global_data.COMMANDS["post_download"] + settings["data_file"] = global_data.FILES["data_file"] + return settings + elif download_data_type == "videos": + settings["videos"] = global_data.FILES["videos"] + settings["video_ids"] = global_data.FILES["video_ids"] + settings["video_download"] = global_data.COMMANDS["video_download"] + settings["number_of_videos"] = global_data.COMMANDS["number_of_videos"] + return settings + elif download_data_type == "posts-videos": + settings["posts"] = global_data.FILES["posts"] + settings["post_ids"] = global_data.FILES["post_ids"] + settings["data_file"] = global_data.FILES["data_file"] + settings["post_download"] = global_data.COMMANDS["post_download"] + settings["videos"] = global_data.FILES["videos"] + settings["video_ids"] = global_data.FILES["video_ids"] + settings["video_download"] = global_data.COMMANDS["video_download"] + settings["number_of_videos"] = global_data.COMMANDS["number_of_videos"] + return settings + else: + print(f"ERROR: The download_data_type must be either posts, videos or posts-videos.") + sys.exit() + + + +def get_posts(settings, tag): + file_path = file_methods.download_posts(settings, tag) + log = () + if file_path: + new_data = data_methods.extract_posts(settings, file_path, tag) + if new_data: + data_file = os.path.join(settings["data"], tag, settings["posts"], settings["data_file"]) + data_methods.update_posts(data_file, "file", new_data[1]) + log = data_methods.update_posts(settings["post_ids"], "file", new_data[0], tag) + file_methods.delete_file(file_path, "file") + + return log + + + +def get_videos(settings, tag): + log = () + download_list = file_methods.download_videos(settings, tag) + if download_list: + new_data = data_methods.extract_videos(settings, tag, download_list) + if new_data: + log = data_methods.update_videos(settings, new_data, tag) + else: + file_methods.clean_video_files(settings, tag) + return log + + + +def get_data(hashtags, download_data_type): + counter = 0 + total_hashtags = len(hashtags) + total_hashtags_offset = total_hashtags - 1 + log_data = [] + + if download_data_type == "posts": + settings = set_download_settings(download_data_type) + while counter < total_hashtags: + tag = hashtags[counter] + file_methods.check_file(os.path.join(settings["data"], tag, settings["posts"]), "dir") + file_methods.check_file(os.path.join(settings["data"], tag, settings["posts"], settings["data_file"]), "file") + res = get_posts(settings, tag) + if res: + log = ( res[0], ( "posts", res[1] ) ) + log_data.append(log) + data_methods.print_total(settings["post_ids"], tag, download_data_type) + + counter += 1 + if counter < total_hashtags_offset: + time.sleep(settings["sleep"]) + elif download_data_type == "videos": + settings = set_download_settings(download_data_type) + while counter < total_hashtags: + tag = hashtags[counter] + file_methods.check_file(os.path.join(settings["data"], tag, settings["videos"]), "dir") + settings["videos_delete"] = settings['data'] + f"/{tag}/videos/#{tag}" + settings["videos_to"] = settings['data'] + f"/{tag}/videos" + res = get_videos(settings, tag) + if res: + res = ( res[0], ( "videos", res[1])) + log_data.append(res) + data_methods.print_total(settings["video_ids"], tag, download_data_type) + + counter += 1 + if counter < total_hashtags_offset: + time.sleep(settings["sleep"]) + elif download_data_type == "posts-videos": + settings = set_download_settings(download_data_type) + while counter < total_hashtags: + tag = hashtags[counter] + file_methods.check_file(os.path.join(settings["data"], tag, settings["posts"]), "dir") + file_methods.check_file(os.path.join(settings["data"], tag, settings["posts"], settings["data_file"]), "file") + file_methods.check_file(os.path.join(settings["data"], tag, settings["videos"]), "dir") + settings["videos_delete"] = settings['data'] + f"/{tag}/videos/#{tag}" + settings["videos_to"] = settings['data'] + f"/{tag}/videos" + requests = [("posts", "post_ids", get_posts), ("videos", "video_ids", get_videos)] + total_reqs_offset = len(requests) - 1 + req_counter = 0 + for req in requests: + res = req[2](settings, tag) + if res: + res = ( res[0], (req[0], res[1]) ) + log_data.append(res) + data_methods.print_total(settings[req[1]], tag, req[0]) + + if req_counter < total_reqs_offset: + time.sleep(settings["sleep"]) + req_counter += 1 + + counter += 1 + if counter < total_hashtags_offset: + time.sleep(settings["sleep"]) + else: + print(f"ERROR: The download_data_type must be either posts, videos or posts-videos.") + sys.exit() + return log_data + + +def get_hashtags(file_name, hashtag_list): + try: + from hashtag_list import hashtag_list + return hashtag_list + except: + print(f"ERROR: something went wrong while reading the file {file_name}!") + raise + + +if __name__ == "__main__": + parser = create_parser() + args = parser.parse_args() + + if not (args.p or args.v): + parser.error("No argument given, please specify either -p for posts or -v videos or both.") + sys.exit() + + if args.h: + hashtags = args.h + else: + hashtags = get_hashtags("hashtag_list", "hashtag_list") + + print(hashtags) + if not hashtags: + hashtags = get_hashtag_list() + if not hashtags: + print(f"ERROR: No hashtags found. Please re-run the script with at least one hashtag!!!") + sys.exit(0) + + if (args.p and args.v): + download_data_type = "posts-videos" + elif args.p: + download_data_type = "posts" + else: + download_data_type = "videos" + + try: + log_data = get_data(hashtags, download_data_type) + if log_data: + file_methods.log_writer(log_data) + except: + raise From d6cb771f607f7e5392c96994fcdf33da50246d92 Mon Sep 17 00:00:00 2001 From: X Date: Sun, 30 Jan 2022 13:59:43 +0100 Subject: [PATCH 22/23] delete old files --- data_processor.sh | 58 ----------------------------------------------- 1 file changed, 58 deletions(-) delete mode 100644 data_processor.sh diff --git a/data_processor.sh b/data_processor.sh deleted file mode 100644 index e0b4f66..0000000 --- a/data_processor.sh +++ /dev/null @@ -1,58 +0,0 @@ -#!/bin/bash - -counter=0 - -function join_lines { - local IFS="$1" - shift - echo "$*" -} - - -while IFS= read -r line || [ -n "$line" ]; do - if [[ -z ${line} ]]; - then - : - elif [[ ${line: -1} != '"' ]]; - then - to_combine[$counter]=$line - let "counter=counter+1" - elif [[ ${line: 0} != '"' && ${line: -1} == '"' ]]; - then - to_combine[$counter]=$line - joined=$(join_lines " " "${to_combine[@]}") - #joined=$(join_lines " " "${to_combine[@]}" | tr -d "\n") # Mac sometimes introduces new lines, tr is used to remove newlines from joined. - echo "$joined" >> tmp.csv - unset to_combine - let "counter=0" - else - echo "$line" >> tmp.csv - fi -done < "$1" - -while IFS= read -r line || [ -n "$line" ]; do - if [[ ${line: 0} == '"' ]]; - then - if [[ -f anomalies_$1 ]]; - then - echo "${line}" >> anomalies_$1 - else - touch anomalies_$1 - echo "${line}" >> anomalies_$1 - fi - else - echo "${line}" >> clean-data_$1 - fi -done < tmp.csv - -rm -f tmp.csv - -if [[ ( -f "anomalies.csv") && ($(tr -d '\n\r\t' < anomalies.csv | wc -c) -eq 0) ]]; -then - anmls=$(wc -l anomalies_$1 | awk '{print $1}') - echo "Anomalies found!!!!! ${anmls} lines of anomalies are recorded in anomalies_$1." -else - input_data=$(wc -l $1 | awk '{print $1}') - clean_lines=$(wc -l clean-data_$1 | awk '{print $1}') - echo "${clean_lines} lines of clean data out of ${input_data} is recorded in clean-data_$1." -fi From bfa90676f121dd88e070dc134791a596a104e784 Mon Sep 17 00:00:00 2001 From: X Date: Sun, 30 Jan 2022 14:00:37 +0100 Subject: [PATCH 23/23] delete old files --- extract_date.py | 56 ------------------------- extract_hashtag.py | 87 --------------------------------------- extract_posts.py | 49 ---------------------- top_hashtag_occurances.py | 83 ------------------------------------- 4 files changed, 275 deletions(-) delete mode 100644 extract_date.py delete mode 100644 extract_hashtag.py delete mode 100644 extract_posts.py delete mode 100644 top_hashtag_occurances.py diff --git a/extract_date.py b/extract_date.py deleted file mode 100644 index 788cba7..0000000 --- a/extract_date.py +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env python3 - -import sys -import json -import datetime -import collections -import matplotlib.pyplot as plt -import matplotlib.dates as mdates - - -if len(sys.argv) < 3: - print(f'ERROR: Please make sure the command line has the following format: python3 extract_date.py hashtag_data.json hashtag') - sys.exit() - - -def list_to_frequency(li): - if li and (type(li) == list): - return collections.Counter(li) - else: - print(f"ERROR: either {li} is empty or not a list.") - - -def eligibility_check(obj): - if not obj: - print(f'ERROR: {obj} is empty!') - return False - elif type(obj) != int: - print(f'ERROR: {obj} is not an integer as is expected!') - return False - else: - return True - -with open(sys.argv[1]) as file: - object = json.load(file) - l = len(object) - date_list = [] - for i in range(0, l): - obj = object[i]["createTime"] - if eligibility_check(obj): - dt_obj = datetime.datetime.fromtimestamp(obj) - date_list.append(dt_obj.date()) - else: - print(f'ERROR: Some error occured. Check {obj}.') - ordered = dict(list_to_frequency(date_list)) - dates = list(ordered.keys()) - total_dates = len(dates) - frequency = list(ordered.values()) - plt.scatter(dates, frequency) - plt.gcf().autofmt_xdate() - date_format = mdates.DateFormatter('%d-%m-%Y') - plt.gca().xaxis.set_major_formatter(date_format) - plt.tight_layout() - plt.title(f'Hashtag Lifecyle - #{sys.argv[2]}') - plt.xlabel(f'Dates ({total_dates} dates out of {l} posts)') - plt.ylabel('Posts') - plt.show() diff --git a/extract_hashtag.py b/extract_hashtag.py deleted file mode 100644 index 0b9e1fc..0000000 --- a/extract_hashtag.py +++ /dev/null @@ -1,87 +0,0 @@ -import os, sys -import csv, json -import argparse -import matplotlib.pyplot as plt - - - -def get_hashtags(obj): - if not obj: - print(f'ERROR: Empty item, no hashtags to be extracted.') - return - else: - hashtags = {} - l = len(obj) - for i in range(l): - for hashtag in obj[i]['hashtags']: - if hashtag['name'] in hashtags: - hashtags[hashtag['name']].add(i) - else: - hashtags[hashtag['name']] = {i} - return hashtags - - -def get_occurances(filename, n=1 , sort=True): - with open(filename) as f: - obj = json.load(f) - l = len(obj) - tags = get_hashtags(obj) - tags = {key: (len(value), value) for (key, value) in tags.items()} - if not sort: - k = list(tags.keys()) - v = list(tags.values()) - return obj, k, v - else: - sorted_tags = {k: v for k,v in sorted(tags.items(), key=lambda item: item[1], reverse=True)} - k = list(sorted_tags.keys()) - v = list(sorted_tags.values()) - k = k[:n] - v_total = [i[0] for i in v] - v_total = v_total[:n] - return l, k, v_total - - - -def plot(n, length, k, v): - plt.scatter(k, v) - plt.tight_layout() - plt.title(f'Hashtag Distribution') - plt.xlabel(f'Top {n} hashtags from {length} posts.') - plt.ylabel(f'Number of occurances') - plt.show() - return - - -def print_occurances(k, v): - row_number = 0 - print(f'Hashtag Occurances') - for key,value in zip(k, v): - print(f'{row_number}\t{key}\t\t{value}') - row_number += 1 - return - - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("input_file", help="The json hashtag file name") - parser.add_argument("n", help="The number of top n occurances", type=int) - parser.add_argument("-p", "--plot", help="Plot the occurances", action="store_true") - parser.add_argument("-d", "--print", help="List top n hashtags", action="store_true") - args = parser.parse_args() - if args.input_file and args.n: - if args.n < 1: - print(f"Please make sure the number of top occurances is a positive integer.") - sys.exit() - - base = os.path.splitext(args.input_file)[0] - path = f"./{base}_sorted_hashtags.csv" - if args.plot: - length, keys, values = get_occurances(args.input_file, args.n) - plot(args.n, length, keys, values) - else: - length, keys, values = get_occurances(args.input_file, args.n) - print_occurances(keys, values) - else: - print(f'ERROR: either {args.input_file} or {args.n} or both contains error.') - diff --git a/extract_posts.py b/extract_posts.py deleted file mode 100644 index 721393b..0000000 --- a/extract_posts.py +++ /dev/null @@ -1,49 +0,0 @@ -import os, sys -from extract_hashtag import get_occurances - - -def filter_positions(hashtags, keys, positions): - filtered = [] - for hashtag in hashtags: - try: - i = keys.index(hashtag) - key = keys[i] - post_indices = positions[i][1] - filtered.append((key, post_indices)) - except Exception as error: - print(error) - continue - return filtered - - -def write_posts(path, obj, filtered): - length = len(filtered) - with open(path, "w") as output_file: - for i in range(length): - hashtag = filtered[i][0] - total_positions = len(filtered[i][1]) - positions = list(filtered[i][1]) - first_position = positions[0] - output_file.write(f"{hashtag}, {obj[first_position]}" + "\n") - for p in range(1, total_positions): - output_file.write(f" , {obj[positions[p]]}" + "\n") - print(f"{total_positions} posts written for the hashtag - {hashtag}") - - -if __name__ == "__main__": - file_name = sys.argv[1] - hashtags = list(sys.argv[2:]) - name = f"{hashtags[0]}_{len(hashtags)}" - path = f"../{name}_posts.csv" - if os.path.exists(path): - print(f'The file {path} containing hashtag occurances already exists. If you would like to run the script afresh, please delete the file {path} and re-run the script.') - sys.exit() - else: - obj, keys, positions = get_occurances(file_name, sort=False) - filtered = filter_positions(hashtags, keys, positions) - if filtered: - write_posts(path, obj, filtered) - else: - print(f"No posts found for the hashtags you entered.") - - diff --git a/top_hashtag_occurances.py b/top_hashtag_occurances.py deleted file mode 100644 index dcee5aa..0000000 --- a/top_hashtag_occurances.py +++ /dev/null @@ -1,83 +0,0 @@ -#!/usr/bin/python3 - -import os, time -import json -import argparse -from datetime import datetime - - -def parser(): - parser = argparse.ArgumentParser() - parser.add_argument("hashtags", help="The hashtags to be processed", nargs="+") - parser.add_argument("top_n", help="Top n occurances for a hashtag", type=int) - args = parser.parse_args() - return args - - -def check_file_existence(hashtag, contains=None): - pwd = "./" - for i in os.listdir(pwd): - #if os.path.isfile(os.path.join(pwd, i)) and hashtag in i: - if hashtag in i and contains in i: - return i - elif hashtag in i: - return i - else: - continue - return - - -def get_input_file(hashtag): - check_file = check_file_existence(hashtag, "json") - if check_file: - return check_file - else: - try: - os.system(f"tiktok-scraper hashtag {hashtag} -t json") - c = check_file_existence(hashtag, "json") - if c: - return c - else: - print(f"ERROR: No json file relating to {hashtag} found.") - except: - raise - - -def copy_data(input_file, output_file): - os.system(f"cat {input_file} >> {output_file} && echo >> {output_file}") - return - - -def get_data(hashtag, n): - input_file = get_input_file(hashtag) - if input_file: - os.system(f"python3 extract_hashtag.py {input_file} {str(n)} -o") - base = os.path.splitext(input_file)[0] - data_file = f"{base}_sorted_hashtags.csv" - if os.path.exists(data_file): - return data_file - return - - -def get_occurances(hashtag, n, output): - data_file = get_data(hashtag, n) - copy_data(data_file, output) - os.system(f"rm {data_file}") - print(f"{data_file} removed ....") - - -if __name__ == "__main__": - args = parser() - hashtags = args.hashtags - now = datetime.now().strftime("%d%m%Y-%H%M%S") - output = f"./{now}.csv" - l = len(hashtags) - if l > 1: - sleep = 30 # Sleep time (in secs) between two tiktok scraping requests. - get_occurances(hashtags[0], args.top_n, output) - for i in range(1, l): - time.sleep(30) - get_occurances(hashtags[i], args.top_n, output) - else: - get_occurances(hashtags[0], args.top_n, output) - print(f"The output data is stored in the file {output}")