"""Build an ownership network of companies, officers and addresses.

The network is populated from the REST API hosted at
api.company-information.service.gov.uk and stored in pandas DataFrames.
Every row records the hop number (``n``) at which it was discovered, the
relationship (``edge``) that led to it and the node it was reached from
(``node`` / ``node_type``).
"""
import collections
import math
import random
import sys
import time
from datetime import datetime
from urllib.parse import quote

import numpy as np
import pandas as pd
import requests
from requests.auth import HTTPBasicAuth

try:
    from IPython.display import clear_output
except ImportError:
    def clear_output(wait=False):
        """Do nothing; stand-in used when IPython is not installed."""

# GoogleNews is imported lazily inside company_news_check() so that the
# rest of the module can be used without that package.

access_token = ""
username = access_token
password = ""
size = "5000"

basic = HTTPBasicAuth(username, password)

API_ROOT = "https://api.company-information.service.gov.uk"

ADDRESS_KEYS = ('premises', 'address_line_1', 'locality', 'postal_code', 'country')


def _api_get(url):
    """GET ``url`` with the module's basic-auth credentials.

    Sleeps for half a second before every request (presumably to stay under
    the API rate limit - confirm against the API terms).
    """
    time.sleep(0.5)
    return requests.get(url, auth=basic)


def _append(frame, rows):
    """Return ``frame`` with ``rows`` added at the bottom.

    ``rows`` is either a dict describing one row or a DataFrame. This
    replaces ``DataFrame.append``, which no longer exists in pandas 2.x.
    Columns missing on either side are created and filled with NaN.
    """
    addition = rows if isinstance(rows, pd.DataFrame) else pd.DataFrame([rows])
    if frame.empty:
        # Keep the declared column order, followed by any new columns.
        extra = [column for column in addition.columns if column not in frame.columns]
        return addition.reindex(columns=list(frame.columns) + extra).reset_index(drop=True)
    return pd.concat([frame, addition], ignore_index=True)


class Ownership_Network:
    """Network of company ids, officer ids and addresses grown hop by hop.

    Exactly one of ``officer_id``, ``company_id`` or ``address`` should be
    given; it becomes the origin node (``n == 0``, ``edge == "Origin"``).

    Attributes:
        addresses: DataFrame of address strings found so far.
        officer_ids: DataFrame of officer ids (and names) found so far.
        company_ids: DataFrame of company numbers found so far.
        companies: DataFrame of full company profiles fetched so far.
        n: current hop number.
        edge, node, node_type: values stamped on the next row that is added.
    """

    def __init__(self, officer_id=None, company_id=None, address=None):
        self.addresses = pd.DataFrame(columns=['address', 'n'])
        self.officer_ids = pd.DataFrame(columns=['officer_id', 'n'])
        self.company_ids = pd.DataFrame(columns=['company_id', 'n'])
        self.companies = pd.DataFrame(columns=['company_number', 'n'])
        self.officer_id = officer_id
        self.company_id = company_id
        self.address = address
        self.n = 0
        self.edge = "Origin"
        self.node = None
        self.node_type = None
        self.initialise_dataframe()

    def initialise_dataframe(self):
        """Insert the origin node into the matching DataFrame.

        Prints "no input provided" when no origin was supplied.
        """
        if self.officer_id:
            appointments = get_appointments(self.officer_id)
            # The officer's name is read from the first appointment, if any.
            name = appointments[0]['name'] if appointments else None
            self.officer_ids = _append(self.officer_ids, {
                'officer_id': self.officer_id,
                'name': name,
                'n': self.n,
                'edge': self.edge,
                'node': None,
                'node_type': 'Person',
            })
        elif self.company_id:
            self.company_ids = _append(self.company_ids, {
                'company_id': self.company_id,
                'n': self.n,
                'edge': self.edge,
                'node': None,
                'node_type': 'Company',
            })
            company = get_company(self.company_id)
            if company:
                company['n'] = self.n
                company['edge'] = self.edge
                self.companies = _append(self.companies, pd.json_normalize(company))
        elif self.address:
            self.addresses = _append(self.addresses, {
                'address': self.address,
                'n': self.n,
                'edge': self.edge,
                'node': None,
                'node_type': 'Address',
            })
        else:
            print("no input provided")

    def _record(self, **fields):
        """Return ``fields`` plus the current hop/edge/node bookkeeping."""
        fields.update(n=self.n, edge=self.edge, node=self.node, node_type=self.node_type)
        return fields

    def _add_address(self, address, edge):
        """Add ``address`` unless it is empty or already known."""
        self.edge = edge
        # An empty address would later match every officer in
        # get_officers_at_location, so it is never stored as a node.
        if not address or address in self.addresses['address'].values:
            return
        self.addresses = _append(self.addresses, self._record(address=address))

    def _add_company_id(self, company_id, edge):
        """Add ``company_id`` unless it is already known."""
        self.edge = edge
        if company_id in self.company_ids['company_id'].values:
            return
        self.company_ids = _append(self.company_ids, self._record(company_id=company_id))

    def _add_officer(self, officer_id, name, edge):
        """Add an officer unless the id is already known."""
        self.edge = edge
        if officer_id in self.officer_ids['officer_id'].values:
            return
        self.officer_ids = _append(
            self.officer_ids, self._record(officer_id=officer_id, name=name))

    def search_officer_id(self, officer_id):
        """Expand the network from one officer.

        Adds the address and company of each appointment, the officer's
        correspondence address, and officers returned by
        ``get_duplicate_officers``.
        """
        appointments = get_appointments(officer_id)
        self.node_type = "Person"
        self.node = officer_id
        for appointment in appointments:
            self._add_address(
                normalise_address(appointment.get('address')), "Appointment Address")
            self._add_company_id(
                appointment['appointed_to']['company_number'], "Appointment")
        if appointments:
            # The correspondence address is the address of the first
            # appointment; reuse the list already fetched instead of
            # requesting the same URL a second time.
            self._add_address(
                normalise_address(appointments[0].get('address')),
                "Officer Corresponance Address")
        for duplicate in get_duplicate_officers(officer_id) or []:
            self._add_officer(
                duplicate['links']['self'].split('/')[2],
                duplicate['title'],
                "Duplicate Officer")

    @staticmethod
    def normalise_name(name):
        """Strip commas from ``name`` and move its last word to the front."""
        name_list = name.replace(',', '').split(" ")
        name_list.insert(0, name_list.pop())
        return ' '.join(name_list)

    def search_company_id(self, company_id):
        """Expand the network from one company.

        Adds the company's officers and their addresses, the addresses of
        persons with significant control, and the company's address history.
        """
        officers = get_officers(company_id)
        self.node_type = "Company"
        self.node = company_id
        for officer in officers or []:
            self._add_address(
                normalise_address(officer.get('address')),
                "Officer Corresponance Address")
            self._add_officer(
                officer['links']['officer']['appointments'].split('/')[2],
                normalise_name(officer['name']),
                "Officer")
        for person in get_psc(company_id) or []:
            if "address" in person:
                self._add_address(
                    normalise_address(person['address']),
                    "Person of Significant Control Address")
        for address in build_address_history(company_id):
            self._add_address(address['address'], "Company Historical Address")

    def search_address(self, address):
        """Expand the network from one address: its companies and officers."""
        companies = get_companies_at_address(address)
        self.node_type = "Address"
        self.node = address
        for company in companies or []:
            self._add_company_id(company['company_number'], "Company Address")
        for officer in get_officers_at_location(address) or []:
            self._add_officer(
                officer['links']['self'].split('/')[2],
                officer['title'],
                "Officer at Address")

    def get_company_from_id(self, company_id=None):
        """Fetch company profiles into ``self.companies``.

        With ``company_id`` given, only that company is fetched (it must
        already be in ``self.company_ids``, otherwise a message is printed).
        Without it, every known company id is fetched. Companies already
        present in ``self.companies`` are skipped.
        """
        if company_id:
            if company_id in self.company_ids['company_id'].values:
                company_list = [company_id]
            else:
                print("add valid company id")
                company_list = []
        else:
            company_list = self.company_ids['company_id'].unique()
        for number in company_list:
            if number in self.companies['company_number'].values:
                continue
            company = get_company(number)
            if not company:
                continue
            # Copy the bookkeeping values as scalars from the id table.
            source = self.company_ids.loc[self.company_ids['company_id'] == number].iloc[0]
            for field in ('n', 'edge', 'node', 'node_type'):
                company[field] = source.get(field)
            self.companies = _append(self.companies, pd.json_normalize(company))

    def hop(self, hops):
        """Grow the network by ``hops`` steps.

        Each step searches every address, company and officer that was
        discovered in the previous step.
        """
        for hop_number in range(1, hops + 1):
            print("hop: " + str(hop_number))
            self.n += 1
            previous = self.n - 1
            # All three selections are taken before any search runs, so
            # nodes found during this hop are left for the next one.
            batches = (
                ("addresses",
                 self.addresses.loc[self.addresses['n'] == previous]['address'],
                 self.search_address),
                ("companies",
                 self.company_ids.loc[self.company_ids['n'] == previous]['company_id'],
                 self.search_company_id),
                ("officers",
                 self.officer_ids.loc[self.officer_ids['n'] == previous]['officer_id'],
                 self.search_officer_id),
            )
            for label, selected, search in batches:
                total = len(selected)
                for position, item in enumerate(selected, start=1):
                    search(item)
                    clear_output(wait=True)
                    print("Processed " + str(position) + "/" + str(total) + " " + label)

    def _company_name(self, company_id):
        """Return the company's name, fetching its profile if necessary.

        Falls back to the company id when no profile or name is available.
        """
        self.get_company_from_id(company_id=company_id)
        match = self.companies.loc[self.companies['company_number'] == company_id]
        if match.empty or 'company_name' not in match:
            return str(company_id)
        return str(match['company_name'].iloc[0])

    def find_path(self, select_company):
        """Print the chain of nodes linking the origin to ``select_company``."""
        row = self.company_ids.loc[self.company_ids['company_id'] == select_company]
        if row.empty:
            print("add valid company id")
            return
        path = [self._company_name(select_company) + " (" + str(row['edge'].item()) + ") "]
        # Walk back through the "node" links until the origin row is reached.
        while row['edge'].item() != "Origin":
            node_type = row['node_type'].item()
            node = row['node'].item()
            if node_type == "Address":
                parent = self.addresses.loc[self.addresses['address'] == node]
                label = str(node)
            elif node_type == "Company":
                parent = self.company_ids.loc[self.company_ids['company_id'] == node]
                label = None
            elif node_type == "Person":
                parent = self.officer_ids.loc[self.officer_ids['officer_id'] == node]
                label = None
            else:
                print("error")
                break
            if parent.empty:
                print("error")
                break
            if node_type == "Company":
                label = self._company_name(node)
            elif node_type == "Person":
                label = str(parent['name'].item())
            parent_edge = str(parent['edge'].item())
            if parent_edge == "Origin":
                path.insert(0, label + " ->")
            else:
                path.insert(0, label + " (" + parent_edge + ") " + "->")
            row = parent
        print(' '.join(path))


def get_appointments(officer_id):
    """Return the ``items`` list of an officer's appointments ([] if absent)."""
    url = API_ROOT + "/officers/" + officer_id + "/appointments?size=" + size
    return _api_get(url).json().get('items', [])


def get_correspondance_address(officer_id):
    """Return the address of the officer's first appointment, or None."""
    appointments = get_appointments(officer_id)
    return appointments[0]['address'] if appointments else None


def get_duplicate_officers(officer_id):
    """Return other officer records that look like the same person.

    Searches officers by the officer's name and keeps results that have the
    same ``date_of_birth`` and a different ``links.self``. Returns an empty
    list when nothing can be compared.

    Raises:
        SystemExit: if the search request fails.
    """
    url = API_ROOT + "/officers/" + officer_id + "/appointments?size=" + size
    officer_data = _api_get(url).json()
    if 'name' not in officer_data or 'date_of_birth' not in officer_data:
        return []
    officer_self_link = officer_data.get('links', {}).get('self')
    # Move the first word of the name to the end before searching.
    name_list = officer_data['name'].replace(',', '').split(' ')
    name = " ".join(name_list[1:]) + " " + name_list[0]
    try:
        response = _api_get(API_ROOT + "/search/officers?q=" + quote(name))
    except requests.exceptions.RequestException as e:
        raise SystemExit(e)
    return [
        officer for officer in response.json().get('items', [])
        if 'date_of_birth' in officer
        and officer['date_of_birth'] == officer_data['date_of_birth']
        and officer['links']['self'] != officer_self_link
    ]


def get_psc(company_id):
    """Return a company's persons with significant control, or None.

    None is returned when the response status is not 200.

    Raises:
        SystemExit: if the request fails.
    """
    url = API_ROOT + "/company/" + company_id + "/persons-with-significant-control"
    try:
        response = _api_get(url)
    except requests.exceptions.RequestException as e:
        raise SystemExit(e)
    if response.status_code == 200:
        return response.json()['items']
    return None


def get_company(company_id):
    """Return a company profile dict, or None (printing the status) on error.

    Raises:
        SystemExit: if the request fails.
    """
    try:
        response = _api_get(API_ROOT + "/company/" + company_id)
    except requests.exceptions.RequestException as e:
        raise SystemExit(e)
    if response.status_code == 200:
        return response.json()
    print(response.status_code)
    return None


def get_address_changes(company_id):
    """Return the address-category filing history, or None.

    None is returned unless the status is 200 and the body has ``items``.

    Raises:
        SystemExit: if the request fails.
    """
    url = API_ROOT + "/company/" + str(company_id) + "/filing-history/?category=address"
    try:
        response = _api_get(url)
    except requests.exceptions.RequestException as e:
        raise SystemExit(e)
    if response.status_code == 200:
        body = response.json()
        if 'items' in body:
            return body
    return None


def get_company_info(company_id):
    """Return the JSON body for a company, or None when it is empty.

    Unlike ``get_company`` the status code is not checked.

    Raises:
        SystemExit: if the request fails.
    """
    try:
        response = _api_get(API_ROOT + "/company/" + str(company_id))
    except requests.exceptions.RequestException as e:
        raise SystemExit(e)
    body = response.json()
    return body if body else None


def normalise_name(name):
    """Strip commas from ``name`` and move its last word to the front.

    For example ``"SMITH, John"`` becomes ``"John SMITH"``.
    """
    name_list = name.replace(',', '').split(" ")
    name_list.insert(0, name_list.pop())
    return ' '.join(name_list)


def process_address_changes(address_changes):
    """Fill in missing ``new_address`` values and return ``address_changes``.

    Walking from the last item to the second, an item without a
    ``new_address`` receives the ``old_address`` of the item before it.
    The input is modified in place.
    """
    items = address_changes['items']
    for i in reversed(range(1, len(items))):
        values = items[i].get('description_values', {})
        earlier = items[i - 1].get('description_values', {})
        if 'new_address' not in values and 'old_address' in earlier:
            items[i].setdefault('description_values', {})['new_address'] = earlier['old_address']
    return address_changes


def build_address_history(company_id):
    """Return a list of ``{address, start_date, end_date}`` dicts.

    The first entry is the current registered office address; when address
    filings exist, one further entry is added per filing using its
    ``old_address`` (an empty string when the filing has none).
    """
    company_info = get_company_info(company_id) or {}
    address_changes = get_address_changes(company_id)
    current_address = str(normalise_address(company_info.get('registered_office_address')))

    if not (address_changes and address_changes.get('items')):
        # No filing history: describe the company with a single entry.
        return [{
            'start_date': company_info.get('date_of_creation'),
            'end_date': company_info.get('date_of_cessation'),
            'address': current_address,
        }]

    changes = process_address_changes(address_changes)['items']
    if 'date_of_cessation' in company_info:
        current_end = str(company_info['date_of_cessation'])
    else:
        current_end = None
    addresses = [{
        "address": current_address,
        "start_date": str(changes[0]['date']),
        "end_date": current_end,
    }]
    for i, change in enumerate(changes):
        if i + 1 < len(changes):
            start_date = str(changes[i + 1]['date'])
        else:
            # The last listed filing runs back to the company's creation.
            start_date = company_info.get('date_of_creation')
        addresses.append({
            "address": change.get('description_values', {}).get('old_address', ""),
            "start_date": start_date,
            "end_date": str(change['date']),
        })
    return addresses


def normalise_address(address_dict):
    """Join the main fields of an address dict into one string.

    Fields are taken in the order premises, address_line_1, locality,
    postal_code, country and separated by single spaces. A missing or empty
    ``address_dict`` gives ``''``; fields whose value is None are skipped.
    """
    if not address_dict:
        return ''
    address_list = []
    for key in ADDRESS_KEYS:
        if key in address_dict and address_dict[key] is not None:
            address_list.append(str(address_dict[key]))
    return ' '.join(address_list)


def get_news(df):
    """Add news-headline columns to ``df`` and return it.

    For each row, headlines are looked up for ``company_name``, ``name`` and
    a quoted short name built from ``name_elements``. Results are cached per
    search term within one call. ``df`` is modified in place.
    """
    searched = {}

    def lookup(term):
        if term not in searched:
            searched[term] = company_news_check(term)
        return searched[term]

    company_news = []
    full_name_news = []
    short_name_news = []
    for index, row in df.iterrows():
        time.sleep(random.uniform(0, 1))
        elements = row["name_elements"]
        if isinstance(elements, dict):
            short_name = '"' + elements["forename"] + " " + elements["surname"] + '"'
        else:
            short_name = '"' + elements + '"'
        # NOTE(review): company_news_check quotes its term again, so the
        # short name is searched with doubled quotes - confirm intent.
        company_news.append(lookup(row['company_name']))
        full_name_news.append(lookup(row['name']))
        short_name_news.append(lookup(short_name))
        print(str(int(100 * index / len(df))) + "%")
    df['company_news'] = company_news
    df['full_name_news'] = full_name_news
    df['short_name_news'] = short_name_news
    return df


def company_news_check(search_term):
    """Return the distinct news titles found for the quoted ``search_term``.

    Requires the third-party ``GoogleNews`` package.
    """
    # Imported here because the module-level import is disabled.
    from GoogleNews import GoogleNews
    time.sleep(random.uniform(0, 0.2))
    googlenews = GoogleNews(period='10y')
    googlenews.get_news('"' + str(search_term) + '"')
    news = []
    for story in googlenews.results():
        if story['title'] not in news:
            news.append(story['title'])
    return news


def get_locations(companies, address_type: str):
    """Return one address string per row and print frequency summaries.

    Args:
        companies: mapping/DataFrame with an ``address`` column of dicts
            (``"correspondance"``) or a ``links`` column of dicts holding a
            ``company`` path (``"registered"``).
        address_type: ``"correspondance"`` or ``"registered"``.

    Returns:
        The list of address strings, or None for an unknown address type.
    """
    df = companies
    addresses = []
    postcodes = []
    if address_type == "correspondance":
        keys = ('premises', 'address_line_1', 'locality', 'country', 'postal_code')
        for address in df['address']:
            addresses.append(', '.join(address[key] for key in keys if key in address))
            if 'postal_code' in address:
                postcodes.append(address['postal_code'])
    elif address_type == "registered":
        keys = ("address_line_1", "address_line_2", "country", "locality", "postal_code")
        for link in df['links']:
            url = API_ROOT + link['company'] + "/registered-office-address"
            record = _api_get(url).json()
            addresses.append(", ".join(record[key] for key in keys if key in record))
            if 'postal_code' in record:
                postcodes.append(record['postal_code'])
    else:
        print("unrecognised address type: should be either correspondance or registered")
        return None
    postcode_frequency = dict(collections.Counter(postcodes).most_common())
    print(str(len(postcode_frequency)) + " unique postcodes")
    frequency = dict(collections.Counter(addresses).most_common())
    print(str(len(frequency)) + " unique " + address_type + " addresses")
    print(frequency)
    return addresses


def remove_company_type(company_name):
    """Drop a trailing company-type word (LIMITED, LTD, PLC, ...) if present."""
    suffixes = ("LIMITED", "LTD", "LTD.", "PLC", "LLP", "RTM", "CIC", "CASC")
    split_name = company_name.split(" ")
    if split_name[-1] in suffixes:
        return " ".join(split_name[:-1])
    return company_name


def year_of_creation(companies):
    """Return ``{year: count}`` of creation years, most frequent first.

    The year is the first four characters of each ``date_of_creation``.
    """
    years = [company['date_of_creation'][0:4] for company in companies]
    return dict(collections.Counter(years).most_common())


def age(creation: str, cessation: str):
    """Return whole 365-day years between two ``YYYY-MM-DD`` dates."""
    date_format = "%Y-%m-%d"
    delta = datetime.strptime(cessation, date_format) - datetime.strptime(creation, date_format)
    return math.floor(delta.days / 365)


def get_companies_at_address(address):
    """Return up to 50 companies found for ``address``, or None.

    None is returned when the response status is not 200.
    """
    url = (API_ROOT + "/advanced-search/companies?location="
           + quote(str(address)) + "&size=" + "50")
    response = _api_get(url)
    if response.status_code == 200:
        return response.json()['items']
    return None


def company_summary(df):
    """Print counts by company status, creation year and survival years.

    ``df`` needs the columns ``company_status``, ``year_of_creation``,
    ``address`` and ``survival_years``.
    """
    status_counts = df['company_status'].value_counts()

    def count(status):
        return status_counts.get(status, 0)

    active_creation = (
        df.loc[df['company_status'] == 'active']['year_of_creation'].value_counts().head(3))
    if len(df) > 0:
        print(df["address"].iloc[0])
    print(str(count('active')) + " active companies")
    print(str(len(df)) + " companies registered")
    for year, total in active_creation.items():
        print(str(total) + " active companies created in " + str(year))
    print(str(count('dissolved')) + " dissolved companies")
    print(str(count('liquidation')) + " liquidated companies")
    print(str(count('administration')) + " companies in administration")
    print(str(count('receivership')) + " companies in recievership")
    print(str(count('insolvency-proceedings')) + " companies in insolvency")
    # Up to three most common survival periods, in years.
    for years, total in df['survival_years'].value_counts().head(3).items():
        key = int(years)
        print(str(total) + " companies lasted " + str(key) + "-" + str(key + 1) + " years")


def get_officers_at_location(location):
    """Return officers whose address snippet contains every word of ``location``.

    Returns an empty list when the response status is not 200.
    """
    url = API_ROOT + "/search/officers" + "?q=" + quote("location:" + str(location), safe=":")
    response = _api_get(url)
    if response.status_code != 200:
        return []
    word_list = location.replace(',', '').split()
    return [
        officer for officer in response.json().get('items', [])
        if all(word in officer.get('address_snippet', '') for word in word_list)
    ]


def get_officers(company_id):
    """Return a company's officers, or None when the status is not 200."""
    response = _api_get(API_ROOT + "/company/" + company_id + "/officers")
    if response.status_code == 200:
        return response.json()['items']
    return None