mirror of
https://github.com/bellingcat/tiktok-hashtag-analysis.git
synced 2026-06-12 21:38:30 +03:00
Add files via upload
This commit is contained in:
58
data_processor.sh
Normal file
58
data_processor.sh
Normal file
@@ -0,0 +1,58 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Next free index in the to_combine array, which buffers the pieces of a
# record that is spread over several input lines until its last piece arrives.
counter=0
|
||||
|
||||
#######################################
# Join the given fields into one line.
# Arguments:
#   $1   - delimiter; "$*" joins on the first character of IFS, so only the
#          first character of the delimiter is used
#   $2.. - fields to join
# Outputs:
#   The joined string followed by a newline, on stdout.
#######################################
function join_lines {
  local IFS="$1"
  shift
  # printf rather than echo: echo would treat a lone field such as "-n" or
  # "-e" as one of its own options and print nothing.
  printf '%s\n' "$*"
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
# Main script body.
#
# Usage:   data_processor.sh <input-file>
#
# Pass 1 stitches records that were split over several physical lines back
# together; pass 2 sorts the stitched rows into two files:
#   clean-data_<input-file>  rows that do not start with a double quote
#   anomalies_<input-file>   rows that start with a double quote
# Both output files are opened in append mode, so rows accumulate if the
# script is run more than once on the same input.
# ---------------------------------------------------------------------------

if [[ $# -lt 1 || ! -r "$1" ]]; then
  printf 'Usage: %s <input-file>\n' "${0##*/}" >&2
  exit 2
fi

anomalies_file="anomalies_$1"
clean_file="clean-data_$1"

# Private scratch file instead of a fixed ./tmp.csv: a leftover file from an
# aborted run can no longer leak into this run's output, and the trap removes
# it on every exit path.
tmp_file=$(mktemp) || {
  printf 'ERROR: could not create a temporary file\n' >&2
  exit 1
}
trap 'rm -f -- "$tmp_file"' EXIT

to_combine=()
counter=0

# Pass 1: rebuild records that span several lines.
# NOTE: pieces still buffered when the input ends (a record whose closing
# quote never arrives) are not written anywhere.
while IFS= read -r line || [[ -n "$line" ]]; do
  if [[ -z "$line" ]]; then
    # Blank lines carry no data.
    continue
  elif [[ "${line: -1}" != '"' ]]; then
    # No closing quote yet: this is a piece of a longer record, buffer it.
    to_combine[counter]=$line
    counter=$((counter + 1))
  elif [[ "${line:0:1}" != '"' ]]; then
    # Ends with a quote but does not start with one: this piece completes the
    # buffered record. ${line:0:1} is the first character; ${line: 0} would
    # expand to the whole line.
    to_combine[counter]=$line
    # The original author noted that macOS sometimes introduces newlines into
    # the joined value and suggested piping it through: tr -d "\n"
    joined=$(join_lines " " "${to_combine[@]}")
    printf '%s\n' "$joined"
    to_combine=()
    counter=0
  else
    # Starts and ends with a quote: pass through untouched; pass 2 files it
    # under anomalies.
    printf '%s\n' "$line"
  fi
done < "$1" > "$tmp_file"

# Pass 2: split the stitched rows into anomalies and clean data.
# ">>" creates a missing file, so no separate touch is required.
while IFS= read -r line || [[ -n "$line" ]]; do
  if [[ "${line:0:1}" == '"' ]]; then
    printf '%s\n' "$line" >> "$anomalies_file"
  else
    printf '%s\n' "$line" >> "$clean_file"
  fi
done < "$tmp_file"

# Report. -s is true only when the anomalies file for THIS input exists and
# is non-empty.
if [[ -s "$anomalies_file" ]]; then
  anmls=$(wc -l < "$anomalies_file" | awk '{print $1}')
  printf '%s\n' "Anomalies found!!!!! ${anmls} lines of anomalies are recorded in anomalies_$1."
else
  input_data=$(wc -l < "$1" | awk '{print $1}')
  if [[ -f "$clean_file" ]]; then
    clean_lines=$(wc -l < "$clean_file" | awk '{print $1}')
  else
    # Nothing was written, e.g. the input held only blank lines.
    clean_lines=0
  fi
  printf '%s\n' "${clean_lines} lines of clean data out of ${input_data} is recorded in clean-data_$1."
fi
|
||||
56
extract_date.py
Normal file
56
extract_date.py
Normal file
@@ -0,0 +1,56 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import sys
|
||||
import json
|
||||
import datetime
|
||||
import collections
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib.dates as mdates
|
||||
|
||||
|
||||
# Two positional arguments are required: the JSON data file and the hashtag
# used in the plot title. Exit with a non-zero status so that shells and
# calling scripts can tell the run failed.
if len(sys.argv) < 3:
    print('ERROR: Please make sure the command line has the following format: python3 extract_date.py hashtag_data.json hashtag')
    sys.exit(1)
|
||||
|
||||
|
||||
def list_to_frequency(li):
    """Count how many times each element occurs in a list.

    Args:
        li: A list of hashable items to tally.

    Returns:
        A ``collections.Counter`` mapping each distinct item to its number of
        occurrences. When ``li`` is empty or is not a list, an error message
        is printed and an empty ``Counter`` is returned, so the result can
        always be used as a mapping (for example passed to ``dict``).
    """
    if li and isinstance(li, list):
        return collections.Counter(li)
    print(f"ERROR: either {li} is empty or not a list.")
    # Return an empty tally rather than falling off the end with None.
    return collections.Counter()
|
||||
|
||||
|
||||
def eligibility_check(obj):
    """Report whether ``obj`` is a non-zero value whose type is exactly ``int``.

    An error message is printed for every rejected value.

    Args:
        obj: The value to validate.

    Returns:
        ``True`` when ``obj`` is truthy and its type is exactly ``int``;
        ``False`` otherwise. Falsy values (``None``, ``0``, ``''``, ...) are
        reported as empty, and ``bool`` values are rejected because their
        type is ``bool`` rather than ``int``.
    """
    if not obj:
        print(f'ERROR: {obj} is empty!')
        return False
    # Exact type identity (not isinstance) so that True/False are rejected.
    if type(obj) is not int:
        print(f'ERROR: {obj} is not an integer as is expected!')
        return False
    return True
|
||||
|
||||
# Load the scraped posts. The loop below iterates over the loaded value and
# reads a "createTime" entry from each element.
# JSON is read as UTF-8 explicitly so the platform default encoding cannot
# break on non-ASCII text.
with open(sys.argv[1], encoding='utf-8') as file:
    posts = json.load(file)

total_posts = len(posts)

# Turn every valid creation timestamp into a calendar date.
date_list = []
for post in posts:
    # .get() lets a post without "createTime" go through the same error
    # reporting as any other invalid value instead of raising KeyError.
    created = post.get("createTime")
    if eligibility_check(created):
        # fromtimestamp() converts to the local time zone of this machine.
        dt_obj = datetime.datetime.fromtimestamp(created)
        date_list.append(dt_obj.date())
    else:
        print(f'ERROR: Some error occured. Check {created}.')

# Without a single usable date there is nothing to count or plot.
if not date_list:
    print('ERROR: no valid createTime values were found, nothing to plot.')
    sys.exit(1)

# Posts per day, keyed by date in chronological order.
ordered = dict(sorted(list_to_frequency(date_list).items()))
dates = list(ordered.keys())
total_dates = len(dates)
frequency = list(ordered.values())

plt.scatter(dates, frequency)
plt.gcf().autofmt_xdate()
date_format = mdates.DateFormatter('%d-%m-%Y')
plt.gca().xaxis.set_major_formatter(date_format)
plt.title(f'Hashtag Lifecyle - #{sys.argv[2]}')
plt.xlabel(f'Dates ({total_dates} dates out of {total_posts} posts)')
plt.ylabel('Posts')
# Run the layout pass only after the title and axis labels exist, so that
# they are taken into account when the margins are computed.
plt.tight_layout()
plt.show()
|
||||
Reference in New Issue
Block a user