Files
o9a-product-scripts/get_deleted_o9a_articles.py
2023-05-26 05:15:11 -05:00

147 lines
4.8 KiB
Python

"""Identify all articles from o9a.org that have been captured on Wayback Machine but
have since been deleted from the o9a.org website, and search for articles containing
the term "tempel".
"""
import re
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
# Endpoint of the Wayback Machine CDX search API; the main script sends
# paginated GET requests to it to list the captures of o9a.org.
BASE_URL = "http://web.archive.org/cdx/search/cdx"
# Substrings that disqualify an archived URL: process_archived_url() returns
# None for any URL that contains one of them.
IGNORE_URL_STRINGS = ["wp-json", "wp-content", "/uploads/", "/page/"]
# Term searched for in the article text. The text is lower-cased before it is
# searched, so this value should be lower-case as well.
SEARCH_TERM = "tempel"
# Path of the CSV file that receives the table of deleted articles
OUTPUT_CSV = "o9a_deleted_articles.csv"
def get_wayback_url(row):
    """Build the Wayback Machine replay URL for a single capture.

    Args:
        row: Mapping-like record (for example a DataFrame row or a dict) that
            provides the capture's ``"timestamp"`` and ``"original"`` URL.

    Returns:
        A URL of the form
        ``https://web.archive.org/web/<timestamp>/<original>``.
    """
    return f'https://web.archive.org/web/{row["timestamp"]}/{row["original"]}'
def process_archived_url(url, ignore_strings=None):
    """Convert an original archived URL to a standard form.

    The query string is dropped, an explicit ``:80`` port after ``.org`` is
    removed, ``http://`` is upgraded to ``https://`` and leading/trailing
    slashes are stripped, so that different captures of the same article map
    to the same string.

    Args:
        url: Original URL as recorded by the Wayback Machine.
        ignore_strings: Substrings that mark a URL as a non-article resource.
            Defaults to the module-level ``IGNORE_URL_STRINGS``.

    Returns:
        The normalized URL, or ``None`` when the URL is not an article: it has
        no ``/20`` path component, contains an ignored substring, is a
        year/month archive index, or is an ``/embed`` variant.
    """
    # Article permalinks contain the year ("/20xx/"); anything else is skipped.
    if "/20" not in url:
        return None

    normalized = url.split("?")[0]
    normalized = (
        normalized.replace(".org:80/", ".org/")
        .replace("http://", "https://")
        .strip("/")
    )

    if ignore_strings is None:
        ignore_strings = IGNORE_URL_STRINGS
    if any(fragment in normalized for fragment in ignore_strings):
        return None

    # Year/month archive listings are index pages, not articles. The host may
    # have been captured with or without the "www." prefix.
    if re.match(r"^https://(www\.)?o9a\.org/\d{4}/\d{2}$", normalized):
        return None
    if normalized.endswith("/embed"):
        return None
    return normalized
def _get(url, max_retries=5, timeout=15):
    """Send a GET request, retrying with exponential back-off.

    Before attempt number ``n`` (starting at 0) the function sleeps for
    ``2**n - 1`` seconds, i.e. 0, 1, 3, 7, 15 seconds with the default of five
    attempts. An attempt counts as failed when the request raises a
    ``requests`` exception or the response status is not 200.

    Args:
        url: URL to fetch.
        max_retries: Maximum number of attempts.
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        The first response whose status code is 200.

    Raises:
        ValueError: If every attempt failed. The message names the last
            failure, and the last request exception (if any) is chained.
    """
    last_error = None
    last_failure = "no attempt was made"
    for attempt in range(max_retries):
        time.sleep(2**attempt - 1)
        try:
            response = requests.get(url=url, timeout=timeout)
        except requests.RequestException as err:
            last_error = err
            last_failure = repr(err)
            continue
        if response.status_code == 200:
            return response
        last_error = None
        last_failure = f"HTTP status {response.status_code}"
    raise ValueError(
        f"Maximum number of retries reached for GET request with url {url}"
        f" (last failure: {last_failure})"
    ) from last_error
def _field_after(text, marker):
    """Return the text between ``marker`` and the next " | ", or None if absent."""
    parts = text.split(marker)
    if len(parts) < 2:
        return None
    return parts[1].split(" | ")[0]


def process_article(soup):
    """Extract relevant information from the HTML of an o9a.org article.

    Metadata is read from the text of the ``div#content`` element, where it
    appears as ``Posted: ... | Author: ... | Tags: a, b | ...``.

    Args:
        soup: Parsed HTML document (BeautifulSoup object) of an archived page.

    Returns:
        A dict with the keys ``wayback_url`` (href of the canonical link),
        ``title`` (text of the first ``h1``), ``author``, ``date``,
        ``content`` (lower-cased text of the content element), ``links``
        (hrefs of all anchors in the content element) and ``tags`` (list of
        strings). ``wayback_url``, ``title``, ``author`` and ``date`` are
        ``None`` when the page lacks the corresponding element or marker;
        ``tags`` is an empty list when no tags are present.

    Raises:
        ValueError: If the page has no ``div#content`` element.
    """
    content_soup = soup.select_one("div#content")
    if content_soup is None:
        raise ValueError("Page has no div#content element; cannot extract article")
    content = content_soup.text.strip()

    tags_text = _field_after(content, "| Tags: ")
    tags = tags_text.split(", ") if tags_text is not None else []

    # Archived pages do not all share the same template, so every optional
    # piece of metadata degrades to None instead of aborting the whole run.
    canonical = soup.find("link", {"rel": "canonical"})
    heading = content_soup.find("h1")

    return {
        "wayback_url": canonical.get("href") if canonical is not None else None,
        "title": heading.text if heading is not None else None,
        "author": _field_after(content, "| Author: "),
        "date": _field_after(content, "Posted: "),
        "content": content.lower(),
        "links": [a["href"] for a in content_soup.find_all("a", href=True)],
        "tags": tags,
    }
if __name__ == "__main__":
    # Get all archived pages from the o9a.org website, store in DataFrame.
    # The CDX API is queried page by page until it returns an empty body.
    capture_list = []
    page = 0
    while True:
        params = {
            "page": page,
            "url": "o9a.org/*",
            "output": "json",
        }
        # A timeout keeps a stalled connection from hanging the script forever.
        r = requests.get(url=BASE_URL, params=params, timeout=60)
        if not r.text.strip():
            break
        r.raise_for_status()
        result = r.json()
        # First row is the column header; a page without data rows means
        # there is nothing more to fetch.
        if len(result) < 2:
            break
        capture_list.append(pd.DataFrame(data=result[1:], columns=result[0]))
        page += 1
    if not capture_list:
        raise SystemExit("Wayback Machine returned no captures for o9a.org")

    captures = pd.concat(capture_list)
    captures["datetime"] = pd.to_datetime(captures["timestamp"])
    captures["url"] = captures.apply(get_wayback_url, axis="columns")
    # Keep successful captures only, oldest first, so that drop_duplicates
    # below retains the earliest capture of each article.
    captures = (
        captures[captures["statuscode"] == "200"]
        .sort_values("timestamp")
        .reset_index(drop=True)
    )
    captures["processed_url"] = captures["original"].apply(process_archived_url)
    captures = captures.drop_duplicates(subset="processed_url", keep="first")

    # Get all URLs from the o9a.org sitemap, compare with archived URLs to
    # find deleted articles.
    r = requests.get("https://www.o9a.org/wp-sitemap-posts-post-1.xml", timeout=60)
    # Without this check an error page would parse to an empty sitemap and
    # every archived article would be reported as deleted.
    r.raise_for_status()
    soup = BeautifulSoup(r.content, features="lxml")
    article_urls_sitemap = {loc.text.strip("/") for loc in soup.select("url loc")}
    article_urls_wayback = set(captures["processed_url"].dropna())
    deleted_urls = article_urls_wayback - article_urls_sitemap
    urls_to_download = captures[captures["processed_url"].isin(deleted_urls)]["url"]

    # Download all deleted pages, process into DataFrame and save to CSV.
    article_data = []
    for url in urls_to_download:
        r = _get(url)
        soup = BeautifulSoup(r.content, features="lxml")
        article_data.append(process_article(soup))
    if not article_data:
        print("No deleted articles found")
        raise SystemExit(0)

    articles = pd.DataFrame(article_data)
    articles["date"] = pd.to_datetime(articles["date"])
    articles = articles.sort_values("date").reset_index(drop=True)
    # quoting=2 is csv.QUOTE_NONNUMERIC: every non-numeric field is quoted.
    articles.to_csv(path_or_buf=OUTPUT_CSV, index=False, quoting=2)

    # Search for specified search term in deleted article contents. The term
    # is matched literally rather than as a regular expression.
    relevant_articles = articles[
        articles["content"].str.contains(SEARCH_TERM, na=False, regex=False)
    ][["date", "wayback_url"]]
    for date, url in relevant_articles.values:
        print(date, url)