rebase

2026-06-11 04:48:30 +03:00 · 2022-01-30 13:51:08 +01:00
parent 2d3f4a9aab
commit 2a34e03dc8
8 changed files with 713 additions and 0 deletions
--- a/analytics/hashtag_frequencies.py
+++ b/analytics/hashtag_frequencies.py
@@ -0,0 +1,90 @@
+import os, sys
+import csv, json
+import argparse
+import matplotlib.pyplot as plt
+
+
+
+def get_hashtags(obj):
+    """Map every hashtag name to the set of post indices that use it.
+
+    Args:
+        obj: Sequence of post dicts. Each post must have a 'hashtags' list
+            whose items are dicts with a 'name' key.
+
+    Returns:
+        dict mapping hashtag name -> set of indices into ``obj``. When
+        ``obj`` is empty an error is printed and an empty dict is returned,
+        so callers can always iterate the result.
+    """
+    hashtags = {}
+    if not obj:
+        print('ERROR: Empty item, no hashtags to be extracted.')
+        return hashtags
+    for index, post in enumerate(obj):
+        for hashtag in post['hashtags']:
+            hashtags.setdefault(hashtag['name'], set()).add(index)
+    return hashtags
+
+
+def get_occurrences(filename, n=1, sort=True):
+    """Count hashtag occurrences in a JSON file of posts.
+
+    Args:
+        filename: Path of a JSON file holding a list of posts.
+        n: Number of top hashtags to keep (only used when ``sort`` is true).
+        sort: When true, rank hashtags by number of posts, descending.
+
+    Returns:
+        With ``sort`` true: ``(number_of_posts, names, counts)`` limited to
+        the ``n`` most used hashtags.
+        With ``sort`` false: ``(posts, names, values)`` where every value is
+        a ``(count, set_of_post_indices)`` tuple, in first-seen order.
+    """
+    # JSON is UTF-8 by specification; do not depend on the locale default.
+    with open(filename, encoding="utf-8") as f:
+        obj = json.load(f)
+
+    # get_hashtags may hand back nothing for an empty file: treat as no tags.
+    tags = get_hashtags(obj) or {}
+    tags = {name: (len(posts), posts) for name, posts in tags.items()}
+    if not sort:
+        return obj, list(tags.keys()), list(tags.values())
+
+    # Rank on the count only. Comparing whole (count, set) tuples would break
+    # ties with set "<", which is a subset test and not a real ordering.
+    ranked = sorted(tags.items(), key=lambda item: item[1][0], reverse=True)
+    top = ranked[:n]
+    return len(obj), [name for name, _ in top], [value[0] for _, value in top]
+
+
+
+def plot(n, length, k, v):
+    """Show a scatter plot of hashtag occurrence counts.
+
+    Args:
+        n: Number of hashtags requested (used in the x-axis label).
+        length: Number of posts the counts were taken from (x-axis label).
+        k: Hashtag names, plotted on the x axis.
+        v: Occurrence counts, plotted on the y axis.
+    """
+    plt.scatter(k, v)
+    plt.title('Hashtag Distribution')
+    plt.xlabel(f'Top {n} hashtags from {length} posts.')
+    plt.ylabel('Number of occurrences')
+    # Fit the layout only after the title and labels exist, so they are
+    # taken into account.
+    plt.tight_layout()
+    plt.show()
+
+
+def print_occurrences(l, k, v):
+    """Print a table of hashtags with their counts and relative frequency.
+
+    Args:
+        l: Total number of posts; each count is divided by it.
+        k: Hashtag names, already in rank order.
+        v: Occurrence counts, parallel to ``k``.
+    """
+    row_format = "{:<8} {:<15} {:<15} {:<15}"
+    frequency_header = f'Frequency (Occurrences/Total-Posts({l}))'
+    print(row_format.format("Rank", 'Hashtag', 'Occurrences', frequency_header))
+    # Ranks are zero-based, matching the position in ``k``.
+    for rank, (name, count) in enumerate(zip(k, v)):
+        print(row_format.format(rank, name, count, count / l))
+
+
+
+if __name__ == "__main__":
+    # Command-line entry point: rank the hashtags of a JSON posts file and
+    # either list them, plot them, or both.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("input_file", help="The json hashtag file name")
+    parser.add_argument("n", help="The number of top n occurrences", type=int)
+    parser.add_argument("-p", "--plot", help="Plot the occurrences", action="store_true")
+    parser.add_argument("-d", "--print", help="List top n hashtags", action="store_true")
+    args = parser.parse_args()
+
+    # argparse already guarantees both positionals are present; only the
+    # range of n is left to check. parser.error exits with status 2.
+    if args.n < 1:
+        parser.error("Please make sure the number of top occurrences is a positive integer.")
+
+    length, keys, values = get_occurrences(args.input_file, args.n)
+    # The table is the default output; with --print it is also shown
+    # alongside the plot (printed first because plt.show() blocks).
+    if args.print or not args.plot:
+        print_occurrences(length, keys, values)
+    if args.plot:
+        plot(args.n, length, keys, values)
--- a/analytics/logging_analytics.py
+++ b/analytics/logging_analytics.py
@@ -0,0 +1,4 @@
+"""
+Yet to be written ...
+"""
+
--- a/tiktok_downloader/data_methods.py
+++ b/tiktok_downloader/data_methods.py
@@ -0,0 +1,123 @@
+import os
+from collections import namedtuple
+from datetime import datetime
+import global_data
+import file_methods
+
+
+# Ids that are not stored yet (new_ids) together with how many there are (size).
+Difference = namedtuple("Difference", "new_ids size")
+# Stored id counts for one hashtag: all entries (total) vs. distinct entries (unique).
+Total = namedtuple("Total", "total unique")
+
+
+def get_difference(tag, file, ids):
+    """Find which of ``ids`` are not yet stored for ``tag``.
+
+    Args:
+        tag: Hashtag whose stored ids are looked up.
+        file: Path of the JSON file mapping hashtags to lists of ids.
+        ids: Freshly fetched ids.
+
+    Returns:
+        A ``(result, maiden_entry)`` tuple:
+          * ``(ids, True)`` when ``tag`` has no entry in the file yet;
+          * ``(Difference(new_ids, size), False)`` when some ids are new;
+          * ``([], False)`` when nothing is new.
+    """
+    current_id_data = file_methods.get_data(file)
+    if tag not in current_id_data:
+        return (ids, True)
+
+    known = set(current_id_data[tag])
+    # dict.fromkeys de-duplicates while keeping the order the ids arrived in,
+    # so the result is deterministic (a plain set difference is not).
+    new_ids = [i for i in dict.fromkeys(ids) if i not in known]
+    if not new_ids:
+        return ([], False)
+    return (Difference(new_ids, len(new_ids)), False)
+
+
+def extract_posts(settings, file_name, tag):
+    """Pick the posts of a downloaded file that are not stored yet.
+
+    Args:
+        settings: Download settings; ``settings["post_ids"]`` is the path of
+            the stored post-id file.
+        file_name: Path of the downloaded JSON file of posts; every post
+            needs an "id" key.
+        tag: Hashtag the posts were downloaded for.
+
+    Returns:
+        ``(ids, posts)`` for the posts to store, or None (after printing a
+        warning) when the file has no posts or none of them is new.
+    """
+    posts = file_methods.get_data(file_name)
+    ids = [post["id"] for post in posts]
+    if not ids:
+        print(f"WARNING: no posts were found for {tag} in the file - {file_name}")
+        return None
+
+    # No id file yet: everything downloaded is new.
+    if not file_methods.check_existence(settings["post_ids"], "file"):
+        return (ids, posts)
+
+    difference, maiden_entry = get_difference(tag, settings["post_ids"], ids)
+    if maiden_entry:
+        return (ids, posts)
+    if not difference:
+        print(f"WARNING: No new posts were found in the downloaded file - {file_name}")
+        return None
+
+    # Set membership keeps this linear instead of rescanning every post for
+    # every new id.
+    wanted = set(difference.new_ids)
+    new_posts = [post for post in posts if post["id"] in wanted]
+    return (difference.new_ids, new_posts)
+
+
+def extract_videos(settings, tag, download_list):
+    """Return the downloaded video ids that are not stored yet for ``tag``.
+
+    Args:
+        settings: Download settings; ``settings["video_ids"]`` is the path
+            of the stored video-id file.
+        tag: Hashtag the videos were downloaded for.
+        download_list: Ids of the videos that were just downloaded.
+
+    Returns:
+        ``download_list`` itself when there is no id file or no entry for
+        ``tag`` yet, the list of new ids when some are new, otherwise None
+        (after printing a warning).
+    """
+    if not file_methods.check_existence(settings["video_ids"], "file"):
+        return download_list
+
+    difference, first_time = get_difference(tag, settings["video_ids"], download_list)
+    if first_time:
+        return download_list
+    if difference:
+        return difference.new_ids
+
+    print(f"WARNING: No new videos were found for the {tag} in the downloaded folder.")
+    return None
+
+
+def update_posts(file_path, file_type, new_data, tag=None):
+    """Append ``new_data`` to a posts file or, with ``tag``, to an id file.
+
+    Args:
+        file_path: File to update.
+        file_type: Passed to ``file_methods.check_existence`` ('file'/'dir').
+        new_data: Posts (no ``tag``) or ids (with ``tag``) to add.
+        tag: Hashtag key for id files; leave empty for a posts file.
+
+    Returns:
+        The ``(tag, count)`` log entry from ``id_writer`` when ``tag`` is
+        given, otherwise None.
+    """
+    exists = file_methods.check_existence(file_path, file_type)
+    if tag:
+        return file_methods.id_writer(file_path, new_data, tag, exists)
+    file_methods.post_writer(file_path, new_data, exists)
+    return None
+
+
+def update_videos(settings, new_data, tag):
+    """Record new video ids for ``tag`` and tidy up the download folder.
+
+    Args:
+        settings: Download settings; ``settings["video_ids"]`` is the id file.
+        new_data: Ids of the newly downloaded videos.
+        tag: Hashtag the videos belong to.
+
+    Returns:
+        The ``(tag, count)`` log entry produced by ``id_writer``.
+    """
+    ids_file = settings["video_ids"]
+    # Make sure the id file exists before id_writer is told that it does.
+    file_methods.check_file(ids_file, "file")
+    entry = file_methods.id_writer(ids_file, new_data, tag, True)
+    file_methods.clean_video_files(settings, tag, new_data)
+    return entry
+
+
+def get_total_posts(file_path, tag):
+    """Count the stored ids of ``tag``.
+
+    Args:
+        file_path: JSON file mapping hashtags to lists of ids.
+        tag: Hashtag to count.
+
+    Returns:
+        ``Total(total, unique)``: number of stored ids and number of
+        distinct stored ids.
+
+    Raises:
+        OSError: ``file_path`` does not exist.
+        KeyError: ``tag`` has no entry in the file.
+    """
+    if not file_methods.check_existence(file_path, "file"):
+        # f-string so the message names the actual path.
+        raise OSError(f"{file_path} not found!")
+    stored = file_methods.get_data(file_path)[tag]
+    return Total(len(stored), len(set(stored)))
+
+
+def print_total(file_path, tag, data_type):
+    """Print how many ids are stored for ``tag``, warning about duplicates.
+
+    Args:
+        file_path: JSON id file, read through ``get_total_posts``.
+        tag: Hashtag to report on.
+        data_type: Label used in the message (e.g. "posts" or "videos").
+    """
+    counts = get_total_posts(file_path, tag)
+    if counts.total != counts.unique:
+        print(f"WARNING: out of total {data_type} for the hashtag {tag} {counts.total}, only {counts.unique} are unique. Something is going wrong...")
+        return
+    print(f"Total {data_type} for the hashtag {tag} are: {counts.total}")
--- a/tiktok_downloader/file_methods.py
+++ b/tiktok_downloader/file_methods.py
@@ -0,0 +1,201 @@
+import os, json, subprocess
+from datetime import datetime
+import global_data
+import data_methods
+
+
+def create_file(name, file_type):
+    """Create a directory tree or an empty file.
+
+    Args:
+        name: Path to create.
+        file_type: "dir" to create the directory (with missing parents) or
+            "file" to create/truncate a file. Anything else only prints an
+            error.
+    """
+    if file_type == "dir":
+        # exist_ok makes the call idempotent if the directory already exists.
+        os.makedirs(name, mode=0o777, exist_ok=True)
+    elif file_type == "file":
+        with open(name, "w"):
+            pass
+    else:
+        print(f"ERROR: {file_type} is not well defined; it has to be either 'dir' or 'file'.")
+
+
+def check_existence(file_path, file_type):
+    """Tell whether ``file_path`` exists as the requested kind of entry.
+
+    Args:
+        file_path: Path to test.
+        file_type: "file" or "dir".
+
+    Returns:
+        True when the path exists and is of that kind, else False.
+
+    Raises:
+        OSError: ``file_type`` is neither "file" nor "dir".
+    """
+    if file_type == "file":
+        return os.path.isfile(file_path)
+    if file_type == "dir":
+        return os.path.isdir(file_path)
+    raise OSError(f"{file_type} has to be a 'dir' or a 'file'!!!")
+
+
+def check_file(file_path, file_type):
+    """Create ``file_path`` (a "file" or a "dir") unless it already exists."""
+    if check_existence(file_path, file_type):
+        return
+    create_file(file_path, file_type)
+
+
+def download_posts(settings, tag):
+    """Run tiktok-scraper to download the posts of a hashtag as JSON.
+
+    Args:
+        settings: Download settings; "data" and "posts" name the folders the
+            scraper is run in (``<data>/<tag>/<posts>``).
+        tag: Hashtag to download.
+
+    Returns:
+        The last whitespace-separated word of the scraper's output when it
+        contains "json" (presumably the path of the written file - confirm
+        against tiktok-scraper's output), otherwise None after printing an
+        error.
+
+    Raises:
+        FileNotFoundError: the target folder or the tiktok-scraper
+            executable does not exist.
+    """
+    path = os.path.join(settings["data"], tag, settings["posts"])
+    # An argument list (no shell) keeps unusual hashtags from being
+    # interpreted by a shell, and cwd= runs the scraper in the target folder
+    # without changing this process's own working directory.
+    result = subprocess.run(
+        ["tiktok-scraper", "hashtag", tag, "-t", "json"],
+        capture_output=True,
+        cwd=path,
+    )
+    words = result.stdout.decode('utf-8').split()
+    if not words:
+        print(f"ERROR: No file was downloaded by the tiktok-scraper for the {tag} !!!!")
+        return None
+
+    new_file = words[-1]
+    if "json" not in new_file:
+        print(f"ERROR: Something's wrong with what is returned by tiktok-scraper for the hashtag {tag} - *{new_file}* is not a json file!!!!")
+        return None
+    return new_file
+
+
+
+def download_videos(settings, tag):
+    """Run tiktok-scraper to download videos of a hashtag.
+
+    Args:
+        settings: Download settings; uses "data", "videos",
+            "number_of_videos" and "videos_delete".
+        tag: Hashtag to download.
+
+    Returns:
+        The names (without extension) of the files found in the
+        ``#<tag>`` folder below ``<data>/<tag>/<videos>``, or None after
+        printing a warning when the scraper printed nothing or no file was
+        found.
+
+    Raises:
+        FileNotFoundError: the target folder or the tiktok-scraper
+            executable does not exist.
+    """
+    path = os.path.join(settings["data"], tag, settings["videos"])
+    # Argument list instead of a shell string, and cwd= instead of
+    # os.chdir, so the process's own working directory is never touched.
+    result = subprocess.run(
+        ["tiktok-scraper", "hashtag", tag, "-n", str(settings["number_of_videos"]), "-d"],
+        capture_output=True,
+        cwd=path,
+    )
+    if not result.stdout:
+        print(f"WARNING: Something went wrong with the tiktok-scraper video download for the {tag} !!!!")
+        return None
+
+    try:
+        file_names = os.listdir(os.path.join(path, f"#{tag}"))
+    except FileNotFoundError:
+        # The scraper produced output but never created its folder.
+        file_names = []
+    if not file_names:
+        print(f"WARNING: No video files were downloaded for the hashtag {tag}.")
+        subprocess.call(["rm", "-rf", settings["videos_delete"]])
+        return None
+
+    # Ids are the file names without their extension (".mp4").
+    return [os.path.splitext(name)[0] for name in file_names]
+
+
+def get_data(file_path):
+    """Load and return the JSON document stored in ``file_path``.
+
+    Raises:
+        json.decoder.JSONDecodeError: the file is empty or not valid JSON.
+    """
+    # JSON is UTF-8 by specification; do not depend on the locale default.
+    with open(file_path, "r", encoding="utf-8") as f:
+        return json.load(f)
+
+
+def dump_data(file_path, data):
+    """Serialise ``data`` as JSON into ``file_path``, replacing its content."""
+    with open(file_path, "w") as out:
+        json.dump(data, out)
+
+def log_writer(log_data):
+    """Add a timestamped summary of this run to the log file.
+
+    Args:
+        log_data: Iterable of ``(tag, (data_type, count))`` entries. Counts
+            with the same tag and data type are summed.
+
+    The summary is stored under the current time ("%d-%m-%Y %H:%M:%S") in
+    the JSON file named by ``global_data.FILES["logger"]``.
+    """
+    total = 0
+    log_dict = {}
+    for ele in log_data:
+        tag = ele[0]
+        kind, count = ele[1][0], ele[1][1]
+        per_tag = log_dict.setdefault(tag, {})
+        if kind in per_tag:
+            per_tag[kind] += count
+        else:
+            per_tag[kind] = count
+        total += count
+
+    logger = global_data.FILES["logger"]
+    now_str = datetime.now().strftime("%d-%m-%Y %H:%M:%S")
+    if check_existence(logger, "file"):
+        data = get_data(logger)
+        data[now_str] = log_dict
+    else:
+        data = {now_str: log_dict}
+    dump_data(logger, data)
+    print(f"Successfully logged {total} entries!!!!")
+
+
+def id_writer(file_path, new_data, tag, status):
+    """Store ``new_data`` under ``tag`` in a JSON id file.
+
+    Args:
+        file_path: JSON file mapping hashtags to lists of ids.
+        new_data: List of ids to add.
+        tag: Hashtag the ids belong to.
+        status: True when ``file_path`` already exists and should be
+            extended; False to write a fresh file.
+
+    Returns:
+        ``(tag, number_of_ids_added)``.
+    """
+    total = len(new_data)
+    data = {tag: new_data}
+    if status:
+        try:
+            stored = get_data(file_path)
+        except json.decoder.JSONDecodeError:
+            # Empty or unreadable file: fall back to the fresh mapping.
+            pass
+        else:
+            if tag in stored:
+                stored[tag] += new_data
+            else:
+                stored[tag] = new_data
+            data = stored
+    dump_data(file_path, data)
+    print(f"SUCCESS - {total} entries added to {file_path}!!!")
+    return (tag, total)
+
+
+def post_writer(file_path, new_data, status):
+    """Append ``new_data`` to the JSON list stored in ``file_path``.
+
+    Args:
+        file_path: JSON file holding a list of posts.
+        new_data: List of posts to add.
+        status: True when ``file_path`` already exists and should be
+            extended; False to write a fresh file.
+    """
+    total = len(new_data)
+    data = new_data
+    if status:
+        try:
+            stored = get_data(file_path)
+        except json.decoder.JSONDecodeError:
+            # Empty or unreadable file: just write the new posts.
+            pass
+        else:
+            stored += new_data
+            data = stored
+    dump_data(file_path, data)
+    print(f"SUCCESS - {total} entries added to {file_path}!!!")
+
+
+def delete_file(file_path, file_type):
+    """Delete a file or an empty directory, reporting the outcome.
+
+    Args:
+        file_path: Path to delete.
+        file_type: "file" (uses os.remove) or "dir" (uses os.rmdir).
+    """
+    if not check_existence(file_path, file_type):
+        print(f"ERROR: Attempt to delete failed. {file_path} does not exist!!!")
+        return
+
+    if file_type == "file":
+        os.remove(file_path)
+    elif file_type == "dir":
+        os.rmdir(file_path)
+    else:
+        # Kept from the original; check_existence above already raises for
+        # any other file_type, so this branch is not expected to run.
+        print(f"ERROR: {file_type} needs to be either 'file' or 'dir' !!!")
+        return
+    print(f"Successfully deleted {file_path}!!!")
+
+
+def clean_video_files(settings, tag, new_data=None):
+    """Move new videos out of the scraper's folder, then delete that folder.
+
+    Args:
+        settings: Download settings; uses "data", "videos_to" (destination
+            folder) and "videos_delete" (folder to remove). The key
+            "videos_from" is overwritten with each source path.
+        tag: Hashtag the videos belong to.
+        new_data: Ids of the videos to keep; ``<id>.mp4`` is moved for each.
+            When empty or None nothing is moved.
+    """
+    if new_data:
+        for file in new_data:
+            settings["videos_from"] = settings['data'] + f"/{tag}/videos/#{tag}/{file}.mp4"
+            # Argument lists (no shell) so spaces or shell metacharacters in
+            # a path cannot split or alter the command.
+            subprocess.call(["mv", settings["videos_from"], settings["videos_to"]])
+
+    subprocess.call(["rm", "-rf", settings["videos_delete"]])
+    print(f"Successfully deleted the folder {settings['videos_delete']} folder of videos.")
--- a/tiktok_downloader/global_data.py
+++ b/tiktok_downloader/global_data.py
@@ -0,0 +1,38 @@
+# Directories
+# DATA is a relative path, so it is resolved against the working directory
+# the downloader is started from.
+DATA = "../data"
+IDS = "ids"
+LOG = "log"
+POSTS = "posts"
+VIDEOS = "videos"
+
+# Files
+POST_IDS = "post_ids.json"
+VIDEO_IDS = "video_ids.json"
+DATA_FILE = "data.json"
+LOG_FILE = "log.json"
+
+
+# Folder names and file paths; run_downloader.set_download_settings copies
+# the entries it needs into the per-run settings dict.
+# NOTE(review): "downloads" is not read by any of the code reviewed here -
+# confirm whether it is still needed.
+FILES = {
+            "data" : DATA,
+            "ids" : IDS,
+            "log" : LOG,
+            "posts" : POSTS,
+            "videos" : VIDEOS,
+            "post_ids" : f"{DATA}/{IDS}/{POST_IDS}",
+            "video_ids" : f"{DATA}/{IDS}/{VIDEO_IDS}",
+            "data_file" : f"{DATA_FILE}",
+            "downloads" : [],
+            "logger" : f"{DATA}/{LOG}/{LOG_FILE}",
+        }
+
+
+
+# Commands
+# NOTE(review): `tag` is an empty string when the f-strings below are
+# evaluated (once, at import time), so "post_download" and "video_download"
+# contain no hashtag. file_methods builds its own tiktok-scraper command
+# lines; these two entries look unused - confirm before relying on them.
+tag = ""
+
+COMMANDS = {
+            "number_of_videos" : 3, # Number of videos to be downloaded by tiktok-scraper.
+            "post_download" : f"tiktok-scraper hashtag {tag} -t 'json'",
+            "video_download" : f"tiktok-scraper hashtag {tag} -d",
+            "sleep" : 8  # Passed to time.sleep() between downloads (seconds).
+        }
--- a/tiktok_downloader/hashtag_list.py
+++ b/tiktok_downloader/hashtag_list.py
@@ -0,0 +1,37 @@
+hashtag_list = [
+# Hashtags that run_downloader.py downloads when none are given with --h. The commented-out entries below are samples; replace the list with your own hashtags.
+#            "london",
+#            "paris",
+#            "newyork",
+#            "tokyo"
+            "uyghur",
+            "uyghur2021",
+            "uyghur2022",
+            "uyghurmuslims",
+            "xinjiang",
+            "xinjiangchina",
+            "xinjiangcotton",
+            "xinjiangtravel",
+            "uyghurlivesmatter",
+            "uighur",
+            "Uighurs",
+            "Uyghurs",
+            "uighuren",
+            "saveuyghur",
+            "uighurmuslims",
+            "chinesemuslim",
+            "uyghurpeople",
+            "urumqi",
+            "chinaxinjiang",
+            "xinjianguyghurs",
+            "eastturkestan",
+            "chinaconcentrationcamp",
+            "xinjianguyghur🇨🇳",
+            "kashgar",
+            "xinjiangreeducationcamps",
+            "uyghur_tiktok",
+            "uyghurreality",
+            "xinjiangdance",
+            "westernmedia",
+            "uyghurgenocide"
+        ]
--- a/tiktok_downloader/hashtag_list_sample.py
+++ b/tiktok_downloader/hashtag_list_sample.py
@@ -0,0 +1,8 @@
+hashtag_list = [
+# This is a sample hashtag list. Put your own hashtags in hashtag_list.py, which is the file run_downloader.py imports.
+            "london",
+            "paris",
+            "newyork",
+            "tokyo"
+
+        ]
--- a/tiktok_downloader/run_downloader.py
+++ b/tiktok_downloader/run_downloader.py
@@ -0,0 +1,212 @@
+import os, sys
+import time
+import json
+import argparse
+
+import global_data
+import file_methods
+import data_methods
+
+
+
+# NOTE(review): not referenced by any code in this file - presumably a
+# leftover; confirm before removing.
+command = "python3 post_downloader.py "
+
+def get_hashtag_list():
+    """Return ``hashtag_list`` from hashtag_list.py.
+
+    Returns:
+        The list defined in the ``hashtag_list`` module.
+
+    Raises:
+        SystemExit: with status 1, after printing a hint, when the module or
+            the list cannot be imported.
+    """
+    try:
+        from hashtag_list import hashtag_list
+    except ImportError as error:
+        print("ImportError: " + str(error))
+        print("Please provide at least one hashtag either by entering as an argument or by adding hashtags to the list hashtag_list in the file hashtag_list.py")
+        # Non-zero status so callers and shells can tell the run failed.
+        sys.exit(1)
+    return hashtag_list
+
+
+def create_parser():
+    """Build the command-line parser of the downloader.
+
+    Returns:
+        An ArgumentParser with ``--h`` (zero or more hashtags), ``-p``
+        (download posts) and ``-v`` (download videos).
+    """
+    parser = argparse.ArgumentParser(description="Download the tiktoks for the requested hashtags")
+    parser.add_argument("--h", type=str, nargs="*", help="List of hashtags")
+    # The two download switches only differ in their name and help text.
+    for flag, text in (("-p", "Download posts"), ("-v", "Download videos")):
+        parser.add_argument(flag, action="store_true", help=text)
+    return parser
+
+
+def set_download_settings(download_data_type):
+    """Collect the settings needed for one kind of download.
+
+    The ids and log folders below the data folder are created when missing.
+
+    Args:
+        download_data_type: "posts", "videos" or "posts-videos".
+
+    Returns:
+        dict with the common entries ("data", "ids", "log", "logger",
+        "sleep") plus the post and/or video entries taken from
+        ``global_data.FILES`` and ``global_data.COMMANDS``.
+
+    Raises:
+        SystemExit: after printing an error, for any other type.
+    """
+    files = global_data.FILES
+    commands = global_data.COMMANDS
+
+    settings = {key: files[key] for key in ("data", "ids", "log", "logger")}
+    settings["sleep"] = commands["sleep"]
+    for folder in ("ids", "log"):
+        file_methods.check_file(f"{settings['data']}/{settings[folder]}", "dir")
+
+    # Keys to copy per download type, in the order they are added.
+    keys_by_type = {
+        "posts": ("posts", "post_ids", "post_download", "data_file"),
+        "videos": ("videos", "video_ids", "video_download", "number_of_videos"),
+        "posts-videos": (
+            "posts", "post_ids", "data_file", "post_download",
+            "videos", "video_ids", "video_download", "number_of_videos",
+        ),
+    }
+    # These come from COMMANDS; every other key comes from FILES.
+    command_keys = ("post_download", "video_download", "number_of_videos")
+
+    wanted = keys_by_type.get(download_data_type)
+    if wanted is None:
+        print(f"ERROR: The download_data_type must be either posts, videos or posts-videos.")
+        sys.exit()
+
+    for key in wanted:
+        source = commands if key in command_keys else files
+        settings[key] = source[key]
+    return settings
+
+
+
+def get_posts(settings, tag):
+    """Download the posts of ``tag`` and store the ones that are new.
+
+    Args:
+        settings: Download settings from ``set_download_settings``.
+        tag: Hashtag to download.
+
+    Returns:
+        The ``(tag, count)`` log entry of the stored ids, or an empty tuple
+        when nothing was downloaded or nothing was new.
+    """
+    downloaded = file_methods.download_posts(settings, tag)
+    if not downloaded:
+        return ()
+
+    log = ()
+    new_data = data_methods.extract_posts(settings, downloaded, tag)
+    if new_data:
+        new_ids, new_posts = new_data[0], new_data[1]
+        data_file = os.path.join(settings["data"], tag, settings["posts"], settings["data_file"])
+        data_methods.update_posts(data_file, "file", new_posts)
+        log = data_methods.update_posts(settings["post_ids"], "file", new_ids, tag)
+    # The scraper's own output file is no longer needed either way.
+    file_methods.delete_file(downloaded, "file")
+    return log
+
+
+
+def get_videos(settings, tag):
+    """Download videos of ``tag`` and record the ones that are new.
+
+    Args:
+        settings: Download settings from ``set_download_settings`` plus the
+            "videos_delete" / "videos_to" paths.
+        tag: Hashtag to download.
+
+    Returns:
+        The ``(tag, count)`` log entry of the stored ids, or an empty tuple
+        when nothing was downloaded or nothing was new.
+    """
+    download_list = file_methods.download_videos(settings, tag)
+    if not download_list:
+        return ()
+
+    new_data = data_methods.extract_videos(settings, tag, download_list)
+    if new_data:
+        return data_methods.update_videos(settings, new_data, tag)
+
+    # Nothing new: only remove the scraper's download folder.
+    file_methods.clean_video_files(settings, tag)
+    return ()
+
+
+
+def get_data(hashtags, download_data_type):
+    """Download posts and/or videos for every hashtag.
+
+    Args:
+        hashtags: Sequence of hashtag names.
+        download_data_type: "posts", "videos" or "posts-videos".
+
+    Returns:
+        List of ``(tag, (data_type, count))`` entries, one per download that
+        stored something.
+
+    Raises:
+        SystemExit: after printing an error, for any other type.
+    """
+    if download_data_type not in ("posts", "videos", "posts-videos"):
+        print(f"ERROR: The download_data_type must be either posts, videos or posts-videos.")
+        sys.exit()
+
+    wants_posts = download_data_type in ("posts", "posts-videos")
+    wants_videos = download_data_type in ("videos", "posts-videos")
+
+    settings = set_download_settings(download_data_type)
+    # (label, key of the id file in settings, download function)
+    requests = []
+    if wants_posts:
+        requests.append(("posts", "post_ids", get_posts))
+    if wants_videos:
+        requests.append(("videos", "video_ids", get_videos))
+
+    log_data = []
+    last_tag = len(hashtags) - 1
+    last_request = len(requests) - 1
+    for position, tag in enumerate(hashtags):
+        if wants_posts:
+            file_methods.check_file(os.path.join(settings["data"], tag, settings["posts"]), "dir")
+            file_methods.check_file(os.path.join(settings["data"], tag, settings["posts"], settings["data_file"]), "file")
+        if wants_videos:
+            file_methods.check_file(os.path.join(settings["data"], tag, settings["videos"]), "dir")
+            settings["videos_delete"] = settings['data'] + f"/{tag}/videos/#{tag}"
+            settings["videos_to"] = settings['data'] + f"/{tag}/videos"
+
+        for index, (label, ids_key, fetch) in enumerate(requests):
+            res = fetch(settings, tag)
+            if res:
+                log_data.append((res[0], (label, res[1])))
+                data_methods.print_total(settings[ids_key], tag, label)
+            # Pause between the posts and the videos request of one hashtag.
+            if index < last_request:
+                time.sleep(settings["sleep"])
+
+        # Pause between hashtags: after every one except the last. (The
+        # previous check skipped the pause before the last hashtag as well.)
+        if position < last_tag:
+            time.sleep(settings["sleep"])
+    return log_data
+
+
+def get_hashtags(file_name, hashtag_list):
+    """Return the ``hashtag_list`` defined in hashtag_list.py.
+
+    Args:
+        file_name: Only used in the error message.
+        hashtag_list: Ignored; the value imported from the module is
+            returned instead.
+
+    Raises:
+        Whatever the import raises, re-raised after an error is printed.
+    """
+    try:
+        from hashtag_list import hashtag_list as configured
+    except:
+        print(f"ERROR: something went wrong while reading the file {file_name}!")
+        raise
+    return configured
+
+
+if __name__ == "__main__":
+    # Command-line entry point: pick the hashtags and the kind of download,
+    # run it and log what was stored.
+    parser = create_parser()
+    args = parser.parse_args()
+
+    # parser.error prints the usage and exits with status 2 by itself.
+    if not (args.p or args.v):
+        parser.error("No argument given, please specify either -p for posts or -v videos or both.")
+
+    # Hashtags from the command line win; otherwise use hashtag_list.py
+    # (get_hashtags re-raises if that file cannot be imported).
+    hashtags = args.h or get_hashtags("hashtag_list", "hashtag_list")
+    print(hashtags)
+    if not hashtags:
+        print(f"ERROR: No hashtags found. Please re-run the script with at least one hashtag!!!")
+        sys.exit(1)
+
+    if args.p and args.v:
+        download_data_type = "posts-videos"
+    elif args.p:
+        download_data_type = "posts"
+    else:
+        download_data_type = "videos"
+
+    log_data = get_data(hashtags, download_data_type)
+    if log_data:
+        file_methods.log_writer(log_data)