initial commit

2026-06-07 19:18:35 +03:00 · 2023-05-26 05:15:11 -05:00
commit a14b3a8c7a
4 changed files with 402 additions and 0 deletions
--- a/get_deleted_o9a_articles.py
+++ b/get_deleted_o9a_articles.py
@@ -0,0 +1,146 @@
 """Identify all articles from o9a.org that have been captured on Wayback Machine but 
 have since been deleted from the o9a.org website, and search for articles containing
 the term "tempel".
 """
 import re
 import time
 import requests
 import pandas as pd
 from bs4 import BeautifulSoup
 # Endpoint of the Wayback Machine CDX search API, queried page by page for o9a.org captures
 BASE_URL = "http://web.archive.org/cdx/search/cdx"
 # Archived URLs containing any of these substrings are discarded by process_archived_url
 IGNORE_URL_STRINGS = ["wp-json", "wp-content", "/uploads/", "/page/"]
 # Substring searched for in the lower-cased text of each deleted article
 SEARCH_TERM = "tempel"
 # CSV file that the table of deleted articles is written to
 OUTPUT_CSV = "o9a_deleted_articles.csv"
 def get_wayback_url(row):
    """Build the Wayback Machine replay URL for one capture record.
    `row` is any mapping (e.g. a DataFrame row) that provides the capture's
    "timestamp" and "original" URL fields.
    """
    timestamp = row["timestamp"]
    original = row["original"]
    return f"https://web.archive.org/web/{timestamp}/{original}"
 def process_archived_url(url):
    """Convert original archived URL to standard form.
    The query string is dropped, an explicit ":80" port is removed, the scheme
    is forced to https and leading/trailing slashes are stripped.
    Returns None (meaning "not an article, ignore it") when the URL:
      * does not contain "/20" (no year component in the path),
      * contains one of IGNORE_URL_STRINGS,
      * is a bare https://www.o9a.org/YYYY/MM monthly archive page, or
      * ends in "/embed".
    Otherwise returns the normalised URL string.
    """
    if "/20" not in url:
        return None
    _url = url.split("?")[0]
    _url = _url.replace(".org:80/", ".org/").replace("http://", "https://").strip("/")
    if any(s in _url for s in IGNORE_URL_STRINGS):
        return None
    # Raw string so that \d is a regex class rather than an invalid string
    # escape; dots are escaped so they only match a literal "."
    if re.match(r"^https://www\.o9a\.org/\d{4}/\d{2}$", _url):
        return None
    if _url.endswith("/embed"):
        return None
    return _url
 def _get(url, max_retries=5, timeout=15):
    """Wrapper for retrying request multiple times.
    Issues a GET for `url` up to `max_retries` times, sleeping 2**attempt - 1
    seconds before each attempt (0, 1, 3, 7, 15, ... seconds). Each attempt
    uses `timeout` seconds as the request timeout.
    Returns the first response whose status code is 200.
    Raises ValueError once all attempts have failed; if the last failure was a
    network error, it is attached as the cause.
    """
    last_error = None
    for attempt in range(max_retries):
        # Exponential back-off; the first attempt is made immediately
        time.sleep(2**attempt - 1)
        try:
            response = requests.get(url=url, timeout=timeout)
        except requests.RequestException as exc:
            # Only network-level failures are retried; programming errors propagate
            last_error = exc
            continue
        if response.status_code == 200:
            return response
        # A non-200 answer is not an exception, so forget any earlier one
        last_error = None
    raise ValueError(
        f"Maximum number of retries reached for GET request with url {url}"
    ) from last_error
 def _extract_field(content, marker):
    """Return the text between `marker` and the next " | ", or None if `marker` is absent"""
    parts = content.split(marker)
    if len(parts) < 2:
        return None
    return parts[1].split(" | ")[0]
 def process_article(soup):
    """Extract relevant information from HTML of o9a.org article.
    `soup` is the parsed HTML of an archived article page. The article text is
    read from the "div#content" element; author, date and tags are cut out of
    that text using the "| Author: ", "Posted: " and "| Tags: " markers.
    Returns a dict with keys "wayback_url" (href of the canonical <link>),
    "title", "author", "date", "content" (lower-cased text), "links" (all
    hrefs inside the content element) and "tags" (list, empty if untagged).
    Fields that cannot be found on the page are set to None instead of
    aborting the whole scrape.
    Raises ValueError if the page has no "div#content" element at all.
    """
    content_soup = soup.select_one("div#content")
    if content_soup is None:
        raise ValueError("Page has no div#content element, cannot parse article")
    content = content_soup.text.strip()
    tags_text = _extract_field(content, "| Tags: ")
    tags = tags_text.split(", ") if tags_text is not None else []
    # find() returns None when the element is missing, so guard before indexing
    canonical = soup.find("link", {"rel": "canonical"})
    heading = content_soup.find("h1")
    data = {
        "wayback_url": canonical.get("href") if canonical is not None else None,
        "title": heading.text if heading is not None else None,
        "author": _extract_field(content, "| Author: "),
        "date": _extract_field(content, "Posted: "),
        "content": content.lower(),
        "links": [a["href"] for a in content_soup.find_all("a", href=True)],
        "tags": tags,
    }
    return data
 if __name__ == "__main__":
    # Get all archived pages from the o9a.org website, store in DataFrame.
    # The capture list is fetched page by page until the API returns nothing.
    capture_list = []
    page = 0
    while True:
        params = {
            "page": page,
            "url": "o9a.org/*",
            "output": "json",
        }
        # requests has no default timeout; set one so a stalled connection cannot hang the script
        r = requests.get(url=BASE_URL, params=params, timeout=60)
        if not r.text.strip():
            break
        result = r.json()
        if not result:
            break
        # First row of the JSON output holds the column names
        capture_list.append(pd.DataFrame(data=result[1:], columns=result[0]))
        page += 1
    if not capture_list:
        raise SystemExit("No captures of o9a.org were returned by the Wayback Machine")
    captures = pd.concat(capture_list)
    captures["datetime"] = pd.to_datetime(captures["timestamp"])
    captures["url"] = captures.apply(get_wayback_url, axis="columns")
    captures = (
        captures[captures["statuscode"] == "200"]
        .sort_values("timestamp")
        .reset_index(drop=True)
    )
    captures["processed_url"] = captures["original"].apply(process_archived_url)
    # Rows are sorted by timestamp, so keep="first" retains the earliest capture of each URL
    captures = captures.drop_duplicates(subset="processed_url", keep="first")
    # Get all URLs from the o9a.org sitemap, compare with archived URLs to find deleted articles
    r = requests.get("https://www.o9a.org/wp-sitemap-posts-post-1.xml", timeout=60)
    # A failed sitemap download would make every archived article look deleted
    r.raise_for_status()
    soup = BeautifulSoup(r.content, features="lxml")
    article_urls_sitemap = {loc.text.strip("/") for loc in soup.select("url loc")}
    article_urls_wayback = set(captures["processed_url"].dropna())
    deleted_urls = article_urls_wayback - article_urls_sitemap
    urls_to_download = captures[captures["processed_url"].isin(deleted_urls)]["url"]
    # Download all deleted pages, process into DataFrame and save to CSV
    article_data = []
    for url in urls_to_download:
        r = _get(url)
        soup = BeautifulSoup(r.content, features="lxml")
        article_data.append(process_article(soup))
    if not article_data:
        # An empty DataFrame has no "date" column, so stop here instead of crashing below
        print("No deleted articles found")
        raise SystemExit(0)
    articles = pd.DataFrame(article_data)
    articles["date"] = pd.to_datetime(articles["date"])
    articles = articles.sort_values("date").reset_index(drop=True)
    # quoting=2 is csv.QUOTE_NONNUMERIC
    articles.to_csv(path_or_buf=OUTPUT_CSV, index=False, quoting=2)
    # Search for specified search term in deleted article contents
    # (regex=False: the term is matched as a plain substring, not a pattern)
    relevant_articles = articles[
        articles["content"].str.contains(SEARCH_TERM, na=False, regex=False)
    ][["date", "wayback_url"]]
    for date, url in relevant_articles.values:
        print(date, url)
--- a/get_ebay_seller_revenue.py
+++ b/get_ebay_seller_revenue.py
@@ -0,0 +1,68 @@
 """Estimate the total revenue of a given Ebay seller, and identify their most
 frequently reviewed products"""
 import requests
 from urllib.parse import urlencode, quote
 from collections import Counter
 from bs4 import BeautifulSoup
 import pandas as pd
 # URL of Ebay's customer feedback API
 BASE_URL = "https://feedback.ebay.com/fdbk/update_feedback_profile"
 # Username of the seller whose feedback is analysed
 USERNAME = "commandantcultus"
 # Nested dict of parameters in API query: each top-level key becomes one query
 # parameter whose value is the URL-encoded inner dict (see params_str in the
 # main section below)
 PARAMS = {
    "url": {
        "username": USERNAME,
        "filter": "feedback_page:RECEIVED_AS_SELLER",
        "limit": "200",
    },
    "module": {"modules": "FEEDBACK_SUMMARY"},
 }
 def process_review(review):
    """Extract relevant fields from raw JSON response for one review.
    `review` is one feedback card from the API response. The item summary text
    has the form "<item name> (#<item id>)" and the price text has the form
    "US $<amount>".
    Returns a dict with "name" (str), "id" (int) and "price" (float).
    Raises ValueError if the summary text has no " (#" separator or the id or
    price cannot be parsed as numbers.
    """
    item = review["feedbackInfo"]["item"]
    item_text = item["itemSummary"]["textSpans"][0]["text"]
    # Split on the LAST " (#" so item names that themselves contain " (#" stay intact
    name, item_id = item_text.rsplit(" (#", 1)
    price_text = item["itemPrice"]["textSpans"][0]["text"]
    return {
        "name": name,
        "id": int(item_id.strip(")")),
        # Drop the currency prefix and thousands separators before converting
        "price": float(price_text.replace("US $", "").replace(",", "")),
    }
 if __name__ == "__main__":
    # Fetch data from Ebay API, convert into DataFrame.
    # Each top-level PARAMS key becomes one query parameter whose value is the
    # URL-encoded, then percent-quoted, inner dict.
    params_str = "&".join(f"{k}={quote(urlencode(v))}" for k, v in PARAMS.items())
    # requests has no default timeout; set one so a stalled connection cannot hang the script
    r = requests.get(url=BASE_URL, params=params_str, timeout=30)
    r.raise_for_status()
    review_dicts = r.json()["modules"]["FEEDBACK_SUMMARY"]["feedbackView"][
        "feedbackCards"
    ]
    reviews = pd.DataFrame(
        [process_review(review_dict) for review_dict in review_dicts]
    )
    if reviews.empty:
        # Without reviews there is no "price" column to average
        raise SystemExit(f"No seller feedback found for {USERNAME}")
    # Fetch total number of sales (should be 581 as of May 2023)
    r = requests.get(f"https://www.ebay.com/usr/{USERNAME}", timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, features="lxml")
    stats = soup.select("div.str-seller-card__stats-content > div[title]")
    # The count is the first whitespace-separated token of the second stat's
    # title; splitting on any whitespace also copes with non-breaking spaces
    total_reviews = int(stats[1]["title"].split()[0].replace(",", ""))
    # Estimate seller's total revenue: mean reviewed price times number of sales
    estimated_revenue = reviews["price"].mean() * total_reviews
    print(f"Estimated revenue of seller: ${estimated_revenue:.2f}")
    # Identify 5 most frequently reviewed items
    print("Most reviewed items:")
    print(Counter(reviews["name"]).most_common(5))
--- a/get_in_stock_stores.py
+++ b/get_in_stock_stores.py
@@ -0,0 +1,60 @@
 """Find all Barnes & Noble stores with a specified product in-stock
 """
 from bs4 import BeautifulSoup
 import requests
 # Base URL for Barnes & Noble product availability API
 BASE_URL = "https://www.barnesandnoble.com/xhr/storeList-with-prodAvailability.jsp"
 # Stock-keeping unit number for specific book
 SKU_ID = 9780692306581
 # Zip code to center search on
 ZIP_CODE = 75254
 # Radius of search. Barnes & Noble's browser interface only allows a maximum
 # of 100 miles; a larger value is deliberately sent to the API here.
 SEARCH_RADIUS = 1000
 # User-Agent header sent with the request (an arbitrary mobile Chrome string)
 HEADERS = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 8.0.0; 5099D Build/O00623) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.68 Mobile Safari/537.36"
 }
 def process_store(store_soup):
    """Summarise one store entry from the availability response.
    Returns a dict with the store's address text under "store" and its stock
    status under "in_stock": True or False for the two recognised status
    messages, otherwise the unrecognised status text itself.
    """
    status_text = store_soup.select_one("div.item-in-stock").text.strip()
    # Known status messages map to booleans; anything else passes through unchanged
    known_statuses = {
        "In Stock in Store": True,
        "Not in Stock in Store": False,
    }
    address = store_soup.select_one(
        "div.store-details-container > div.store-address"
    ).text.strip()
    return {"store": address, "in_stock": known_statuses.get(status_text, status_text)}
 if __name__ == "__main__":
    # Initialize query parameters
    params = {
        "action": "fromSearch",
        "radius": SEARCH_RADIUS,
        "searchString": ZIP_CODE,
        "skuId": SKU_ID,
    }
    # Make API query to determine which (if any) stores have the book in-stock.
    # requests has no default timeout; set one so a stalled connection cannot hang the script
    r = requests.get(url=BASE_URL, params=params, headers=HEADERS, timeout=30)
    soup = BeautifulSoup(r.text, features="lxml")
    stores = soup.select("div.store-list")
    results = [process_store(store) for store in stores]
    # Print addresses of stores with book in-stock. An unrecognised status is
    # kept as a non-empty string by process_store, so it is printed as well.
    for result in results:
        if result["in_stock"]:
            print(result["store"])
--- a/get_ingram_o9a_books.py
+++ b/get_ingram_o9a_books.py
@@ -0,0 +1,128 @@
 """Get information about a list of in-stock O9A books from Ingram's catalog.
 This provides the same information as is available using Ingram's Stock Check app
 https://www.ingramcontent.com/retailers/independent-bookstores/stock-check-app"""
 import requests
 import pandas as pd
 # Base URL of Ingram's mobile API; endpoint paths such as "/register",
 # "/search" and "/stockcheck" are appended to it
 BASE_URL = "https://ipage.ingramcontent.com/ipage/ws/1/mobile"
 # List of EAN (European Article Number) codes for books associated with O9A;
 # each one is used as the search keyword for a catalog query
 EANS = [
    "9780692306581",
    "9780997836363",
    "9780692575505",
    "9781494440954",
    "9780692260845",
    "9780692548127",
    "9780692723920",
    "9780999768006",
    "9781696821742",
    "9781687255624",
    "9781689931953",
    "9780997836370",
    "9780999768044",
    "9780999768020",
    "9780997836387",
    "9780997836356",
    "9780997836349",
    "9780997836325",
    "9780997836301",
    "9780997836318",
    "9780692667293",
    "9780692510711",
    "9780692484463",
    "9780692432082",
 ]
 # Fields of the API response that process_book copies into the output unchanged
 RELEVANT_COLUMNS = [
    "primaryContributorName",
    "isbn",
    "title",
    "primaryProductType",
    "displayableFormat",
    "sortableTitle",
    "ean",
    "primaryBisacCategory",
    "publisher",
    "retailPrice",
    "totalOnHand",
 ]
 # Write information about each book to this file
 OUTPUT_CSV = "o9a_books.csv"
 class IngramClient:
    """Class to search Ingram's free (mobile) API.
    An access token is requested on construction and sent with every later
    query. `timeout` (seconds) is applied to every HTTP request so that a
    stalled connection cannot hang the script.
    """
    # Default request timeout in seconds; overridden per instance via __init__
    timeout = 30
    def __init__(self, timeout=None):
        if timeout is not None:
            self.timeout = timeout
        self.token = self.get_token()
    def get_token(self):
        """Initialize access token, which is necessary for all API queries"""
        params = {
            "email_address": "email@address.com",  # email address doesn't seem to matter
            "terms_accepted": True,
        }
        r = requests.post(
            url=BASE_URL + "/register", json=params, timeout=self.timeout
        )
        return r.json()["token"]
    def search(self, keywords):
        """Search Ingram's book catalog for a given keyword or EAN.
        Result pages are requested one after another, starting at page 1,
        until a page comes back without results. Every result dict is updated
        in place with the stock information returned by get_stock for its EAN.
        Returns the list of all result dicts.
        """
        all_results = []
        page_number = 1
        while True:
            params = {
                "keywords": keywords,
                "token": self.token,
                "page_number": page_number,
            }
            r = requests.get(
                url=BASE_URL + "/search", params=params, timeout=self.timeout
            )
            if not (results := r.json().get("results")):
                break
            for result in results:
                # Merge the stock-check fields into the search result itself
                result.update(self.get_stock(ean=result["ean"]))
            all_results.extend(results)
            page_number += 1
        return all_results
    def get_stock(self, ean):
        """Query how many copies of a given Ingram product are in stock.
        Returns the decoded JSON body of the stock-check response.
        """
        params = {"product_code": ean, "token": self.token}
        r = requests.get(
            url=BASE_URL + "/stockcheck", params=params, timeout=self.timeout
        )
        return r.json()
 def process_book(book):
    """Reduce one enriched search result to the fields written to the CSV.
    Keeps the entries of `book` whose key is listed in RELEVANT_COLUMNS, then
    adds "contributors" (display names joined with ", ") and "totalOnOrder"
    (sum of the "count" of every entry in book["onOrder"]).
    """
    summary = {}
    for field, value in book.items():
        if field in RELEVANT_COLUMNS:
            summary[field] = value
    display_names = [person["displayName"] for person in book["contributors"]]
    summary["contributors"] = ", ".join(display_names)
    on_order_total = 0
    for entry in book["onOrder"].values():
        on_order_total += entry["count"]
    summary["totalOnOrder"] = on_order_total
    return summary
 if __name__ == "__main__":
    # Look up every EAN through a fresh client and gather all matching catalog entries
    ingram = IngramClient()
    found_books = [
        book for code in EANS for book in ingram.search(keywords=code)
    ]
    # One processed row per book, then dump the table to CSV
    rows = []
    for found in found_books:
        rows.append(process_book(found))
    table = pd.DataFrame(rows)
    table.to_csv(path_or_buf=OUTPUT_CSV, index=False, quoting=2)