mirror of
https://github.com/bellingcat/o9a-product-scripts.git
synced 2026-06-07 19:18:35 +03:00
initial commit
This commit is contained in:
146
get_deleted_o9a_articles.py
Normal file
146
get_deleted_o9a_articles.py
Normal file
@@ -0,0 +1,146 @@
|
|||||||
|
"""Identify all articles from o9a.org that have been captured on Wayback Machine but
|
||||||
|
have since been deleted from the o9a.org website, and search for articles containing
|
||||||
|
the term "tempel".
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import pandas as pd
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
# URL of Wayback Machine search (CDX) API
BASE_URL = "http://web.archive.org/cdx/search/cdx"

# If archived URL contains any of these strings, ignore it
IGNORE_URL_STRINGS = ["wp-json", "wp-content", "/uploads/", "/page/"]

# Search articles for this string
SEARCH_TERM = "tempel"

# Save table of information for deleted articles to this file
OUTPUT_CSV = "o9a_deleted_articles.csv"
|
||||||
|
|
||||||
|
|
||||||
|
def get_wayback_url(row):
    """Convert timestamp and original URL into Wayback Machine URL.

    Args:
        row: Mapping (e.g. a DataFrame row) with a "timestamp" entry and an
            "original" entry holding the captured URL.

    Returns:
        The snapshot URL ``https://web.archive.org/web/<timestamp>/<original>``.
    """
    return f'https://web.archive.org/web/{row["timestamp"]}/{row["original"]}'
|
||||||
|
|
||||||
|
|
||||||
|
def process_archived_url(url):
    """Convert original archived URL to standard form.

    Args:
        url: URL as recorded by the Wayback Machine for one capture.

    Returns:
        The normalized URL (query string dropped, ":80" port removed, scheme
        forced to https, surrounding slashes stripped), or None when the URL
        should be ignored.
    """
    # Only keep URLs with a "/20" segment -- presumably dated permalinks such
    # as /2020/05/some-article (TODO confirm against the site's URL scheme).
    if "/20" not in url:
        return None

    _url = url.split("?")[0]
    _url = _url.replace(".org:80/", ".org/").replace("http://", "https://").strip("/")

    if any(s in _url for s in IGNORE_URL_STRINGS):
        return None
    # Skip bare year/month pages (https://www.o9a.org/YYYY/MM). Raw string so
    # "\d" is a regex escape, and dots escaped so they only match a literal ".".
    if re.match(r"^https://www\.o9a\.org/\d{4}/\d{2}$", _url):
        return None
    if _url.endswith("/embed"):
        return None
    return _url
|
||||||
|
|
||||||
|
|
||||||
|
def _get(url):
    """Wrapper for retrying request multiple times.

    Makes up to five GET attempts, sleeping ``2**attempt - 1`` seconds before
    each one (0, 1, 3, 7, 15), and returns the first response with status 200.

    Args:
        url: URL to fetch.

    Returns:
        The ``requests.Response`` of the first attempt that returned HTTP 200.

    Raises:
        ValueError: If no attempt returned HTTP 200. The last network error,
            if any, is attached as the cause.
    """
    last_error = None
    for attempt in range(5):
        time.sleep(2**attempt - 1)
        try:
            response = requests.get(url=url, timeout=15)
        except requests.RequestException as err:
            # Only network/HTTP-layer failures are retried; anything else is a
            # programming error and should surface immediately.
            last_error = err
            continue
        if response.status_code == 200:
            return response

    raise ValueError(
        f"Maximum number of retries reached for GET request with url {url}"
    ) from last_error
|
||||||
|
|
||||||
|
|
||||||
|
def process_article(soup):
    """Extract relevant information from HTML of o9a.org article.

    Args:
        soup: Parsed HTML of one archived article page (BeautifulSoup object).

    Returns:
        dict with keys "wayback_url" (href of the canonical <link>), "title",
        "author", "date", "content" (lower-cased text of div#content),
        "links" (all hrefs inside div#content) and "tags" (list of str).

    Raises:
        AttributeError, TypeError, IndexError: When the page lacks the
            expected elements or the "| Author: " / "Posted: " markers, the
            lookups below fail; no fallback is attempted.
    """
    content_soup = soup.select_one("div#content")
    content = content_soup.text.strip()

    # Metadata is embedded in the page text as " | "-separated "Key: value"
    # fields; tags are optional, author and date are required.
    tags_split = content.split("| Tags: ")
    if len(tags_split) > 1:
        tags = tags_split[1].split(" | ")[0].split(", ")
    else:
        tags = []

    data = {
        "wayback_url": soup.find("link", {"rel": "canonical"})["href"],
        "title": content_soup.find("h1").text,
        "author": content.split("| Author: ")[1].split(" | ")[0],
        "date": content.split("Posted: ")[1].split(" | ")[0],
        # Lower-cased so later substring searches are case-insensitive
        "content": content.lower(),
        "links": [a["href"] for a in content_soup.find_all("a", href=True)],
        "tags": tags,
    }
    return data
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Get all archived pages from the o9a.org website, store in DataFrame.
    # The CDX API is queried page by page until a page comes back empty.
    capture_list = []
    page = 0
    while True:
        params = {
            "page": page,
            "url": "o9a.org/*",
            "output": "json",
        }
        r = requests.get(url=BASE_URL, params=params)
        if not r.text.strip():
            break
        result = r.json()
        if not result:
            break
        # First row of the JSON payload holds the column names
        capture_list.append(pd.DataFrame(data=result[1:], columns=result[0]))
        page += 1
    captures = pd.concat(capture_list)
    captures["datetime"] = pd.to_datetime(captures["timestamp"])
    captures["url"] = captures.apply(get_wayback_url, axis="columns")
    # Keep successful captures only; after sorting, keep="first" below retains
    # the earliest capture of each normalized URL.
    captures = (
        captures[captures["statuscode"] == "200"]
        .sort_values("timestamp")
        .reset_index(drop=True)
    )
    captures["processed_url"] = captures["original"].apply(process_archived_url)
    captures = captures.drop_duplicates(subset="processed_url", keep="first")

    # Get all URLs from the o9a.org sitemap, compare with archived URLs to find deleted articles
    r = requests.get("https://www.o9a.org/wp-sitemap-posts-post-1.xml")
    # A failed sitemap fetch would make every archived article look deleted
    r.raise_for_status()
    soup = BeautifulSoup(r.content, features="lxml")
    article_urls_sitemap = {loc.text.strip("/") for loc in soup.select("url loc")}
    article_urls_wayback = set(captures["processed_url"].dropna())
    deleted_urls = article_urls_wayback - article_urls_sitemap
    urls_to_download = captures[captures["processed_url"].isin(deleted_urls)]["url"]

    # Download all deleted pages, process into DataFrame and save to CSV
    article_data = []
    for url in urls_to_download:
        r = _get(url)
        soup = BeautifulSoup(r.content, features="lxml")
        article_data.append(process_article(soup))
    articles = pd.DataFrame(article_data)
    articles["date"] = pd.to_datetime(articles["date"])
    articles = articles.sort_values("date").reset_index(drop=True)
    # quoting=2 is csv.QUOTE_NONNUMERIC
    articles.to_csv(path_or_buf=OUTPUT_CSV, index=False, quoting=2)

    # Search for specified search term in deleted article contents.
    # Contents are stored lower-cased, so the term is lower-cased too and
    # matched literally rather than as a regular expression.
    relevant_articles = articles[
        articles["content"].str.contains(SEARCH_TERM.lower(), na=False, regex=False)
    ][["date", "wayback_url"]]
    for date, url in relevant_articles.values:
        print(date, url)
|
||||||
68
get_ebay_seller_revenue.py
Normal file
68
get_ebay_seller_revenue.py
Normal file
@@ -0,0 +1,68 @@
|
|||||||
|
"""Estimate the total revenue of a given Ebay seller, and identify their most
|
||||||
|
frequently reviewed products"""
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from urllib.parse import urlencode, quote
|
||||||
|
from collections import Counter
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
# URL of Ebay's customer feedback API
BASE_URL = "https://feedback.ebay.com/fdbk/update_feedback_profile"

# Username of seller
USERNAME = "commandantcultus"

# Nested dict of parameters in API query
PARAMS = {
    "url": {
        "username": USERNAME,
        "filter": "feedback_page:RECEIVED_AS_SELLER",
        "limit": "200",
    },
    "module": {"modules": "FEEDBACK_SUMMARY"},
}
|
||||||
|
|
||||||
|
|
||||||
|
def process_review(review):
    """Extract relevant fields from raw JSON response for one review.

    Args:
        review: One feedback-card dict from the API response. The item is read
            from ``review["feedbackInfo"]["item"]``; its summary text is
            expected to be ``"<name> (#<item id>)"`` and its price text
            ``"US $<amount>"``.

    Returns:
        dict with "name" (str), "id" (int) and "price" (float).
    """
    item = review["feedbackInfo"]["item"]
    item_text = item["itemSummary"]["textSpans"][0]["text"]
    # Split on the last " (#" only, so a product name that itself contains
    # " (#" does not break the two-value unpacking.
    name, item_id = item_text.rsplit(" (#", 1)
    price_text = item["itemPrice"]["textSpans"][0]["text"]

    return {
        "name": name,
        "id": int(item_id.strip(")")),
        # Drop the currency prefix and any thousands separators before parsing
        "price": float(price_text.replace("US $", "").replace(",", "")),
    }
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Fetch data from Ebay API, convert into DataFrame.
    # Each top-level parameter is itself a URL-encoded query string, so the
    # inner encoding is quoted again and the final string is passed verbatim.
    params_str = "&".join(f"{k}={quote(urlencode(v))}" for k, v in PARAMS.items())
    r = requests.get(url=BASE_URL, params=params_str)
    feedback_view = r.json()["modules"]["FEEDBACK_SUMMARY"]["feedbackView"]
    review_dicts = feedback_view["feedbackCards"]
    reviews = pd.DataFrame(
        [process_review(review_dict) for review_dict in review_dicts]
    )

    # Fetch total number of sales (should be 581 as of May 2023)
    r = requests.get(f"https://www.ebay.com/usr/{USERNAME}")
    soup = BeautifulSoup(r.content, features="lxml")
    stats = soup.select("div.str-seller-card__stats-content > div[title]")
    # The count is the first word of the second stat's title; strip thousands
    # separators so values such as "1,234" still parse.
    total_sales = int(stats[1]["title"].split(" ")[0].replace(",", ""))

    # Estimate seller's total revenue: mean reviewed price times number of sales
    estimated_revenue = reviews["price"].mean() * total_sales
    print(f"Estimated revenue of seller: ${estimated_revenue:.2f}")

    # Identify 5 most frequently reviewed items
    print("Most reviewed items:")
    print(Counter(reviews["name"]).most_common(5))
|
||||||
60
get_in_stock_stores.py
Normal file
60
get_in_stock_stores.py
Normal file
@@ -0,0 +1,60 @@
|
|||||||
|
"""Find all Barnes & Noble stores with a specified product in-stock
|
||||||
|
"""
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import requests
|
||||||
|
|
||||||
|
# Base URL for Barnes & Noble product availability API
BASE_URL = "https://www.barnesandnoble.com/xhr/storeList-with-prodAvailability.jsp"

# Stock-keeping unit number for specific book
SKU_ID = 9780692306581

# Zip code to center search on
ZIP_CODE = 75254

# Radius of search: Barnes & Noble's browser interface only allows maximum of 100 miles
SEARCH_RADIUS = 1000

# Random user-agent
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 8.0.0; 5099D Build/O00623) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.68 Mobile Safari/537.36"
}
|
||||||
|
|
||||||
|
|
||||||
|
def process_store(store_soup):
    """Extract relevant information from HTML of a given store.

    Args:
        store_soup: Parsed HTML element for one store entry (BeautifulSoup tag).

    Returns:
        dict with "store" (stripped text of the store-address element) and
        "in_stock": True for "In Stock in Store", False for
        "Not in Stock in Store", otherwise the raw status string unchanged.
    """
    known_statuses = {
        "In Stock in Store": True,
        "Not in Stock in Store": False,
    }
    in_stock_str = store_soup.select_one("div.item-in-stock").text.strip()
    # Unrecognized statuses are passed through as text rather than guessed at
    in_stock = known_statuses.get(in_stock_str, in_stock_str)

    address = store_soup.select_one(
        "div.store-details-container > div.store-address"
    ).text.strip()
    return {
        "store": address,
        "in_stock": in_stock,
    }
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Initialize query parameters
    params = {
        "action": "fromSearch",
        "radius": SEARCH_RADIUS,
        "searchString": ZIP_CODE,
        "skuId": SKU_ID,
    }

    # Make API query to determine which (if any) stores have the book in-stock
    r = requests.get(url=BASE_URL, params=params, headers=HEADERS)
    soup = BeautifulSoup(r.text, features="lxml")
    stores = soup.select("div.store-list")
    results = [process_store(store) for store in stores]

    # Print addresses of stores with book in-stock. Any truthy "in_stock"
    # counts, which includes unrecognized non-empty status strings.
    for result in results:
        if result["in_stock"]:
            print(result["store"])
|
||||||
128
get_ingram_o9a_books.py
Normal file
128
get_ingram_o9a_books.py
Normal file
@@ -0,0 +1,128 @@
|
|||||||
|
"""Get information about a list of in-stock O9A books from Ingram's catalog.
|
||||||
|
This provides the same information as is available using Ingram's Stock Check app
|
||||||
|
https://www.ingramcontent.com/retailers/independent-bookstores/stock-check-app"""
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
# Base URL of Ingram API
BASE_URL = "https://ipage.ingramcontent.com/ipage/ws/1/mobile"

# List of EAN (European Article Number) codes for books associated with O9A
EANS = [
    "9780692306581",
    "9780997836363",
    "9780692575505",
    "9781494440954",
    "9780692260845",
    "9780692548127",
    "9780692723920",
    "9780999768006",
    "9781696821742",
    "9781687255624",
    "9781689931953",
    "9780997836370",
    "9780999768044",
    "9780999768020",
    "9780997836387",
    "9780997836356",
    "9780997836349",
    "9780997836325",
    "9780997836301",
    "9780997836318",
    "9780692667293",
    "9780692510711",
    "9780692484463",
    "9780692432082",
]

# Columns in API response to store
RELEVANT_COLUMNS = [
    "primaryContributorName",
    "isbn",
    "title",
    "primaryProductType",
    "displayableFormat",
    "sortableTitle",
    "ean",
    "primaryBisacCategory",
    "publisher",
    "retailPrice",
    "totalOnHand",
]

# Write information about each book to this file
OUTPUT_CSV = "o9a_books.csv"
|
||||||
|
|
||||||
|
|
||||||
|
class IngramClient:
    """Class to search Ingram's free (mobile) API."""

    def __init__(self):
        # Registration is a network call; the token is reused for all queries
        self.token = self.get_token()

    def get_token(self):
        """Initialize access token, which is necessary for all API queries.

        Returns:
            The "token" value from the JSON response of the /register endpoint.
        """
        params = {
            "email_address": "email@address.com",  # email address doesn't seem to matter
            "terms_accepted": True,
        }
        r = requests.post(url=BASE_URL + "/register", json=params)
        return r.json()["token"]

    def search(self, keywords):
        """Search Ingram's book catalog for a given keyword or EAN.

        Requests result pages starting at page 1 until a page has no
        "results", and merges the stock-check response into every result.

        Args:
            keywords: Search string (keyword or EAN).

        Returns:
            List of result dicts, each updated in place with the fields
            returned by ``get_stock`` for its "ean".
        """
        all_results = []
        page_number = 1
        while True:
            params = {
                "keywords": keywords,
                "token": self.token,
                "page_number": page_number,
            }
            r = requests.get(url=BASE_URL + "/search", params=params)
            if not (results := r.json().get("results")):
                break

            for result in results:
                result.update(self.get_stock(ean=result["ean"]))
            all_results.extend(results)

            page_number += 1
        return all_results

    def get_stock(self, ean):
        """Query how many copies of a given Ingram product are in stock.

        Args:
            ean: Product code sent as "product_code" to the /stockcheck endpoint.

        Returns:
            The decoded JSON response.
        """
        params = {"product_code": ean, "token": self.token}
        r = requests.get(url=BASE_URL + "/stockcheck", params=params)
        return r.json()
|
||||||
|
|
||||||
|
|
||||||
|
def process_book(book):
    """Extract relevant fields from Ingram API response, aggregate number of books on order.

    Args:
        book: Result dict with a "contributors" list (each entry having a
            "displayName") and an "onOrder" dict whose values each have a
            "count".

    Returns:
        dict holding the entries of ``book`` whose keys are listed in
        RELEVANT_COLUMNS, plus "contributors" (display names joined with
        ", ") and "totalOnOrder" (sum of all "onOrder" counts).
    """
    processed_book = {k: v for k, v in book.items() if k in RELEVANT_COLUMNS}
    processed_book["contributors"] = ", ".join(
        c["displayName"] for c in book["contributors"]
    )
    processed_book["totalOnOrder"] = sum(c["count"] for c in book["onOrder"].values())
    return processed_book
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Initialize client, fetch information about all specified books and store in DataFrame
    client = IngramClient()
    valid_books = []
    for ean in EANS:
        valid_books.extend(client.search(keywords=ean))
    df = pd.DataFrame([process_book(book) for book in valid_books])

    # Write DataFrame to CSV file (quoting=2 is csv.QUOTE_NONNUMERIC)
    df.to_csv(path_or_buf=OUTPUT_CSV, index=False, quoting=2)
|
||||||
Reference in New Issue
Block a user