diff --git a/.gitignore b/.gitignore index 1d8dd28..4cf359e 100644 --- a/.gitignore +++ b/.gitignore @@ -48,3 +48,6 @@ coverage.xml # Sphinx documentation docs/_build/ + +# Testing notebook +notebooks/testing.ipynb diff --git a/assets/.DS_Store b/assets/.DS_Store new file mode 100644 index 0000000..bf1e5be Binary files /dev/null and b/assets/.DS_Store differ diff --git a/dashboard/.DS_Store b/dashboard/.DS_Store new file mode 100644 index 0000000..bbcf9e3 Binary files /dev/null and b/dashboard/.DS_Store differ diff --git a/notebooks/001_getting_started.ipynb b/notebooks/001_getting_started.ipynb index 3699c2f..a94bff8 100644 --- a/notebooks/001_getting_started.ipynb +++ b/notebooks/001_getting_started.ipynb @@ -542,7 +542,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.15" + "version": "3.10.4" } }, "nbformat": 4, diff --git a/notebooks/002_candy_connections.ipynb b/notebooks/002_candy_connections.ipynb index ca77164..622fe8a 100644 --- a/notebooks/002_candy_connections.ipynb +++ b/notebooks/002_candy_connections.ipynb @@ -173,7 +173,7 @@ "metadata": {}, "outputs": [], "source": [ - "# western_crown_network = base.Network(file='western_crown_network.json')" + "western_crown_network = base.Network(file='western_crown_network.json')" ] }, { @@ -605,7 +605,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.15" + "version": "3.10.4" } }, "nbformat": 4, diff --git a/notebooks/003_virtual_offices.ipynb b/notebooks/003_virtual_offices.ipynb index 30e718e..d49b347 100644 --- a/notebooks/003_virtual_offices.ipynb +++ b/notebooks/003_virtual_offices.ipynb @@ -258,7 +258,7 @@ "metadata": {}, "outputs": [], "source": [ - "# regent_street_network = base.Network(file='regent_street_network.json')" + "regent_street_network = base.Network(file='regent_street_network.json')" ] }, { @@ -402,7 +402,7 @@ "metadata": {}, "outputs": [], "source": [ - "# shelton_street_network = base.Network(file='shelton_street_network.json')" + "shelton_street_network = base.Network(file='shelton_street_network.json')" ] }, { @@ -440,7 +440,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.15" + "version": "3.10.4" } }, "nbformat": 4, diff --git a/notebooks/quickstart.ipynb b/notebooks/quickstart.ipynb index 88e0b96..ccc6d8b 100644 --- a/notebooks/quickstart.ipynb +++ b/notebooks/quickstart.ipynb @@ -90,7 +90,7 @@ }, "outputs": [], "source": [ - "n = 3\n", + "n = 4\n", "network = base.Network(company_id=company_id)\n", "network.perform_hop(n)" ] @@ -161,7 +161,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.15" + "version": "3.10.4" } }, "nbformat": 4, diff --git a/sugartrail/api.py b/sugartrail/api.py index 2a885a9..337da61 100644 --- a/sugartrail/api.py +++ b/sugartrail/api.py @@ -1,6 +1,7 @@ import requests import time import os +import functools access_token = "" username = "" @@ -8,7 +9,18 @@ password = "" size = "5000" basic_auth = requests.auth.HTTPBasicAuth(username, password) +def auth(func): + """Checks if user has set API Key.""" + @functools.wraps(func) + def wrapper_auth(*args, **kwargs): + if not basic_auth.username: + print("Authentication required") + else: + func(*args, **kwargs) + return wrapper_auth + def test(): + """Test auth.""" url = "https://api.company-information.service.gov.uk/advanced-search/companies" response = requests.get(url, auth=basic_auth) if response.status_code == 200: @@ -16,54 +28,60 @@ def test(): else: return False +# @auth def make_request(url, input, input_type, response_type): - if basic_auth.username: - time.sleep(0.5) - try: - response = requests.get(url, auth=basic_auth) - response.raise_for_status() - if response.status_code == 200: - return response.json() - except requests.exceptions.RequestException as err: - # print (err, f"{os.linesep}Failed to get {response_type} for {input_type}:", str(input)) - return - except requests.exceptions.HTTPError as errh: - # print (errh, f"{os.linesep}Failed to get {response_type} for {input_type}:", str(input)) - return - except requests.exceptions.ConnectionError as errc: - # print (errc, f"{os.linesep}Failed to get {response_type} for {input_type}:", str(input)) - return - except requests.exceptions.Timeout as errt: - # print (errt, f"{os.linesep}Failed to get {response_type} for {input_type}:", str(input)) - return - else: - print("Authentication required") + """Query Companies House API.""" + time.sleep(0.5) + try: + response = requests.get(url, auth=basic_auth) + response.raise_for_status() + if response.status_code == 200: + return response.json() + except requests.exceptions.RequestException as err: + # print (err, f"{os.linesep}Failed to get {response_type} for {input_type}:", str(input)) + return + except requests.exceptions.HTTPError as errh: + # print (errh, f"{os.linesep}Failed to get {response_type} for {input_type}:", str(input)) + return + except requests.exceptions.ConnectionError as errc: + # print (errc, f"{os.linesep}Failed to get {response_type} for {input_type}:", str(input)) + return + except requests.exceptions.Timeout as errt: + # print (errt, f"{os.linesep}Failed to get {response_type} for {input_type}:", str(input)) + return def get_company_officers(company_id): + """Get officers for input company.""" url = "https://api.company-information.service.gov.uk/company/" + company_id + "/officers" return make_request(url, company_id, 'company', 'officers') def get_psc(company_id): + """Get PSCs for input company.""" url = "https://api.company-information.service.gov.uk/company/" + company_id + "/persons-with-significant-control" return make_request(url, company_id, 'company', 'psc') def get_company(company_id): + """Get company records for input company.""" url = "https://api.company-information.service.gov.uk/company/" + company_id return make_request(url, company_id, 'company', 'company') def get_address_changes(company_id): + """Get address changes for input company.""" url = "https://api.company-information.service.gov.uk/company/" + str(company_id) + "/filing-history/?category=address" return make_request(url, company_id, 'company', 'address history') def get_correspondance_address(officer_id): + """Get correspondance address for input officer.""" url = "https://api.company-information.service.gov.uk/officers/" + officer_id + "/appointments?size=" + size return make_request(url, officer_id, 'officer', 'correspondance address') def get_appointments(officer_id): + """Get appointments for input officer.""" url = "https://api.company-information.service.gov.uk/officers/" + officer_id + "/appointments" return make_request(url, officer_id, 'officer', 'appointments') def get_duplicate_officers(officer_id): + """Get duplicate officers for input officer.""" url = "https://api.company-information.service.gov.uk/officers/" + officer_id + "/appointments" response = make_request(url, officer_id, 'officer', 'appointments') if response: @@ -85,10 +103,12 @@ def get_duplicate_officers(officer_id): return def get_companies_at_address(address): + """Get companies at input address location.""" url = "https://api.company-information.service.gov.uk/advanced-search/companies?location=" + address + "&size=" + "5000" return make_request(url, address, 'address', 'companies') def get_officers_at_address(address): + """Get officers at input address location.""" url = "https://api.company-information.service.gov.uk/search/officers?q=location:" + address response = make_request(url, address, 'address', 'officers') if response: diff --git a/sugartrail/base.py b/sugartrail/base.py index d922db7..9648983 100644 --- a/sugartrail/base.py +++ b/sugartrail/base.py @@ -1,13 +1,13 @@ -from sugartrail import api -from sugartrail import processing +import sugartrail import IPython -import numpy as np -import math -import warnings import json +import functools from string import ascii_letters as alc class Network: + """Class represents a network of connected companies, officers and + addresses. Class contains methods to build network of user defined size from + a single seed company, officer or address.""" def __init__(self, officer_id=None, company_id=None, address=None, file=None): self.addresses = [] self.officer_ids = [] @@ -26,80 +26,128 @@ class Network: self.processed_companies = [] self.processed_addresses = [] self._file = self.load(file) - self.initialise() + self.initialise_node(officer_id, company_id, address, file) + + def clear_state(func): + """Resets the class attributes to pre-init state.""" + @functools.wraps(func) + def wrapper_clear(*args, **kwargs): + args[0].addresses = [] + args[0].officer_ids = [] + args[0].company_ids = [] + args[0].companies = [] + args[0].address_history = [] + args[0]._officer_id = None + args[0]._company_id = None + args[0]._address = None + args[0].n = 0 + args[0].link_type = None + args[0].hop_history = [] + args[0].maxsize_entities = [] + args[0].processed_officers = [] + args[0].processed_companies = [] + args[0].processed_addresses = [] + func(*args, **kwargs) + return wrapper_clear + + @property + def officer_id(self): + """officer_id property representing seed officer.""" + return self._officer_id + + @officer_id.setter + @sugartrail.api.auth + def officer_id(self, new_value): + """officer_id setter that checks if officer_id exists in Companies House before setting value.""" + officer_info = sugartrail.api.get_appointments(new_value) + if officer_info: + self._officer_id = new_value + self.officer_ids = [{ + 'officer_id': new_value, + 'name': officer_info['items'][0]['name'], + 'n':self.n, + 'link_type': None, + 'node_type': None, + 'node': None}] + else: + print(f"Officer with ID:{str(new_value)} not found") + self._officer_id = None + + @property + def company_id(self): + """company_id property representing seed company.""" + return self._company_id + + @company_id.setter + @sugartrail.api.auth + def company_id(self, new_value): + """company_id setter that checks if company_id exists in Companies House before setting value.""" + company_info = sugartrail.api.get_company(new_value) + if company_info: + self._company_id = new_value + self.company_ids = [{ + 'company_id': self._company_id, + 'n':self.n, + 'link_type': '', + 'node_type': '', + 'node': ''}] + self.companies = [dict(sugartrail.processing.flatten(company_info))] + else: + print(f"Company with ID:{str(new_value)} not found") + self._company_id = None + + @property + def address(self, value): + """address property representing seed address.""" + return self._address + + @address.setter + @sugartrail.api.auth + def address(self, new_value): + """address setter.""" + self._address = new_value + self.addresses = [dict({'address': self._address, + 'n':self.n, + 'link_type': '', + 'node_type': '', + 'node': ''})] @property def file(self): + """file property for loading pre-built network data into class.""" return self._file @file.setter def file(self, new_value): + """file setter for loading pre-built network data into class.""" self._file = new_value - self._officer_id = None - self._company_id = None - self._address_id = None self.load(self._file) - @property - def officer_id(self): - return self._officer_id - - @officer_id.setter - def officer_id(self, new_value): - self._officer_id = new_value - self._company_id = None - self._address_id = None - self.initialise() - - @property - def company_id(self): - return self._company_id - - @company_id.setter - def company_id(self, new_value): - self._company_id = new_value - self._officer_id = None - self._address_id = None - self.initialise() - - @property - def address(self, value): - return self._address - - @address.setter - def address(self, new_value): - self._address = new_value - self._company_id = None - self._officer_id = None - self.initialise() - - # change to initialise - def initialise(self): + @clear_state + def initialise_node(self, officer_id, company_id, address ,file): + """Builds initial network from arguments.""" if self.n < 1: - if self._officer_id: - if api.get_appointments(self._officer_id): - self.officer_ids.append(dict({'officer_id': self._officer_id, 'name': api.get_appointments(self._officer_id)['items'][0]['name'], 'n':self.n, 'link_type': None, 'node_type': None, 'node': None})) - else: - print(f"Officer with ID:{str(self._officer_id)} not found") - elif self._company_id: - self.company_ids.append(dict({'company_id': self._company_id, 'n':self.n, 'link_type': '', 'node_type': '', 'node': ''})) - company = api.get_company(self._company_id) - self.companies.append(dict(processing.flatten(company))) - elif self._address: - self.addresses.append(dict({'address': self._address, 'n':self.n, 'link_type': '', 'node_type': '', 'node': ''})) - elif self.file: - pass + if officer_id: + self.officer_id = officer_id + elif company_id: + self.company_id = company_id + elif address: + self.address = address + elif file: + self.file = file else: - print("No input provided. Please provide either officer_id, company_id or address value as input.") + print("No input provided. Please provide either officer_id, company_id, address or file as input.") def save(self, filename, location='../assets/networks/'): + """Saves network in JSON format to '../assets/networks/'.""" network_data = {k: v for k, v in self.__dict__.items() if k != 'hop' and k != 'file'} saved_network = json.dumps(network_data) - f = open(f'{filename}', 'w') + f = open(location + f'{filename}', 'w') f.write(saved_network) f.close def load(self, filename): + """Loads network stored in JSON format from '../assets/networks/'.""" if filename: f = open(f'../assets/networks/{filename}') network_data = json.load(f) @@ -119,77 +167,72 @@ class Network: self.processed_companies = network_data['processed_companies'] self.processed_addresses = network_data['processed_addresses'] - def add_company_names(self): - for i, row in enumerate(self.company_ids): - company_details = list(filter(lambda d: d.get('company_number') == row['company_id'], self.companies)) - if company_details: - self.company_ids[i]['company_name'] = company_details[0]['company_name'] - else: - company_details = api.get_company(row['company_id']) - if company_details: - if 'company_name' in company_details: - self.company_ids[i]['company_name'] = company_details['company_name'] + def run_map_preprocessing(self): + """Gets missing/additional information on companies and addresses required for + mapping them. This includes address histories, company records and coordinates.""" + self.get_address_histories() + self.get_company_records_from_id() + self.get_coords() + return - def get_company_from_id(self, company_df=None, company_id=None, print_progress=True): - company_list = [] - if company_id: - if company_id in [company['company_id'] for company in self.company_ids]: - company_list = [company_id] - else: - print("add valid company id") - else: - company_list = [company['company_id'] for company in self.company_ids] + def get_address_histories(self): + """Gets missing address histories for companies at the edge of the network.""" + historic_address_company_ids = list(dict.fromkeys([company['company_number'] for company in self.address_history])) + for i, company in enumerate(self.company_ids): + IPython.display.clear_output(wait=True) + print("Updated " + str(i+1) + "/" + str(len(self.company_ids)) + " company addresses.") + # if company is at the edge of the network: + # if historic address not in + if company['company_id'] not in historic_address_company_ids: + historic_address_company_ids.append(company['company_id']) + address_history = sugartrail.processing.build_address_history(company['company_id']) + historic_addresses = [] + for historic_address in address_history: + if historic_address not in self.address_history: + historic_addresses.append(historic_address) + self.address_history.extend(historic_addresses) + + def get_company_records_from_id(self, company_df=None, print_progress=True): + """Gets company records for all company IDs in the network. Additionally + enriches company_ids with company names for improved readability.""" + company_list = [company['company_id'] for company in self.company_ids] companies = [] for i, company_id in enumerate(company_list): IPython.display.clear_output(wait=True) if print_progress: print("Processed " + str(i+1) + "/" + str(len(company_list)) + " companies.") if company_id not in [company['company_number'] for company in self.companies]: + # if using local Companies House data if company_df is not None: try: company = company_df[company_df[" CompanyNumber"] == str(company_id)]["CompanyName"].item() if company: - # self.companies = self.companies.append(pd.json_normalize(company), ignore_index=True) companies.append(company) except: try: - company = api.get_company(company_id) + company = sugartrail.api.get_company(company_id) if company: - # self.companies = self.companies.append(pd.json_normalize(company), ignore_index=True) companies.append(company) except: print(f"Failed to get data for {company_id}") + # otherwise uses API else: - company = api.get_company(company_id) + company = sugartrail.api.get_company(company_id) if company: - # self.companies = self.companies.append(pd.json_normalize(company), ignore_index=True) companies.append(company) - # add companies to dataframe + # update company_ids with company name + self.company_ids[i]['company_name'] = company['company_name'] + else: + self.company_ids[i]['company_name'] = list(filter(lambda d: d.get('company_number') == company_id, self.companies))[0]['company_name'] self.companies.extend(companies) - def run_map_preprocessing(self): - # merge some of these methods: - self.update_address_history() - self.get_company_from_id() - self.add_company_names() - self.get_coords() - self.address_history = [dict(t) for t in {tuple(d.items()) for d in self.address_history}] - return - - def update_address_history(self): - for i, company in enumerate(self.company_ids): - IPython.display.clear_output(wait=True) - print("Updated " + str(i+1) + "/" + str(len(self.company_ids)) + " company addresses.") - if company['n'] == self.n: - address_history = processing.build_address_history(company['company_id']) - self.address_history.extend(address_history) - def get_coords(self): + """Gets coordinates for each address in addresses and address_history.""" for i, row in enumerate(self.addresses): IPython.display.clear_output(wait=True) print("Processed " + str(i+1) + "/" + str(len(self.addresses)) + " addresses.") if 'lat' not in row or 'lon' not in row: - coords = processing.get_coords_from_address(row['address']) + coords = sugartrail.processing.get_coords_from_address(row['address']) if coords: self.addresses[i]['lat'] = coords['lat'] self.addresses[i]['lon'] = coords['lon'] @@ -199,23 +242,26 @@ class Network: self.address_history[j]['lon'] = coords['lon'] self.address_history[j]['lat'] = coords['lat'] else: + # no coords found self.addresses[i]['lat'] = "" self.addresses[i]['lon'] = "" - print("No coords found: " + row['address']) def find_path(self, select_company): - # network_link_type_rows = self.company_ids.loc[self.company_ids['company_id'] == select_company] + """Finds path from 'select_company' to origin company'.""" + # retrieve rows containing selected company: network_link_type_rows = list(filter(lambda d: d.get('company_id') == select_company, self.company_ids)) path = [] - company_info = self.get_company_from_id(company_id=select_company, print_progress=False) + # iterate through each path from selected company to seed company: for i, row in enumerate(network_link_type_rows): + # insert end of path node: path.insert(0, {'hop': row['n'], "type": "Company", "id": select_company, "node": row['company_name'], "node_type": row['link_type'], "link_id": row['node']}) + # define search terms for locating connected nodes: search_terms = [{'n': row['n']-1, 'node_type':row['node_type'], 'node':row['node']}] + # iterate through degrees of seperation till origin is reached: for j in range(row['n']-1,-1,-1): for term in search_terms: if term['n'] == j: if term['node_type'] == "Address": - ### select_rows = list(filter(lambda d: d.get('address') == term['node'] and d.get('n') == j, self.addresses)) for k, select_row in enumerate(select_rows): if select_row['n'] == 0: @@ -231,7 +277,6 @@ class Network: elif term['node_type'] == "Company": select_rows = list(filter(lambda d: d.get('company_id') == term['node'] and d.get('n') == j, self.company_ids)) for l, select_row in enumerate(select_rows): - self.get_company_from_id(company_id=select_row['company_id'], print_progress=False) if select_row['n'] == 0: origin = {'hop': j, "type": "Company", "id": select_row['company_id'], "node": select_row['company_name'], "node_type": "", "link_id": ""} if origin not in path: @@ -259,6 +304,7 @@ class Network: print(f"{row['node_type']} is invalid node_type") break sorted_path = sorted(path, key=lambda d: d['hop']) + # add letter correspondance for readability for i in range(len(sorted_path)-1,-1,-1): search_term = sorted_path[i]['link_id'] link_indices = [] @@ -270,9 +316,12 @@ class Network: return sorted_path def perform_hop(self, hops, company_data=None): + """Gets companies, officers and addresses within n-degrees of seperation + from current nodes, where n is the number of hops.""" hop_history = [] for hop in range(hops): - # check if previous hop completed, if any processed items then its still mid-processing: + # select the nodes for which the method will retrieve other nodes + # 1-degree of seperation from: selected_addresses = [address['address'] for address in list(filter(lambda d: d.get('n') == self.n, self.addresses))] selected_companies = [company['company_id'] for company in list(filter(lambda d: d.get('n') == self.n, self.company_ids))] selected_officers = [officer['officer_id'] for officer in list(filter(lambda d: d.get('n') == self.n, self.officer_ids))] @@ -281,26 +330,32 @@ class Network: break else: for i,address in enumerate(selected_addresses): + # in-case method was run previously and failed to complete, + # check if address was previously processed: if address not in self.processed_addresses: self.hop.search_address(self, address, company_data) self.processed_addresses.append(address) IPython.display.clear_output(wait=True) - print("Hop number: " + str(hop)) + print("Hop number: " + str(hop+1)) print("Processed " + str(i+1) + "/" + str(len(selected_addresses)) + " addresses.") for j,company in enumerate(selected_companies): + # in-case method was run previously and failed to complete, + # check if company was previously processed: if company not in self.processed_companies: self.hop.search_company_id(self,company) self.processed_companies.append(company) IPython.display.clear_output(wait=True) - print("Hop number: " + str(hop)) + print("Hop number: " + str(hop+1)) print("Processed " + str(len(selected_addresses)) + "/" + str(len(selected_addresses)) + " addresses.") print("Processed " + str(j+1) + "/" + str(len(selected_companies)) + " companies.") for k,officer in enumerate(selected_officers): + # in-case method was run previously and failed to complete, + # check if officer was previously processed: if officer not in self.processed_officers: self.hop.search_officer_id(self,officer) self.processed_officers.append(officer) IPython.display.clear_output(wait=True) - print("Hop number: " + str(hop)) + print("Hop number: " + str(hop+1)) print("Processed " + str(len(selected_addresses)) + "/" + str(len(selected_addresses)) + " addresses.") print("Processed " + str(len(selected_companies)) + "/" + str(len(selected_companies)) + " companies.") print("Processed " + str(k+1) + "/" + str(len(selected_officers)) + " officers.") @@ -318,6 +373,9 @@ class Network: self.hop_history.extend(hop_history) class Hop: + """Class attributes store the criteria for each hop. Class contains + methods for getting officers, addresses and companies using the + criteria.""" def __init__(self): self.get_company_officers = True self.get_company_address_history = True @@ -333,57 +391,68 @@ class Network: self.companies_at_address_maxsize = 50 def search_company_id(self, network, company_id): + """Gets officers and addresses connected to input company + (company_id).""" officers = [] new_addresses = [] new_officers = [] if self.get_company_officers: - officers = api.get_company_officers(company_id) + # get officers at company + officers = sugartrail.api.get_company_officers(company_id) if officers: - officers = officers['items'] + if 'items' in officers: + officers = officers['items'] + # process officer results network.node_type = "Company" network.node = company_id + # find addresses and officers already added to the network lower_n_addresses = [address['address'] for address in list(filter(lambda d: d.get('n') < network.n+1, network.addresses))] lower_n_officers = [officer['officer_id'] for officer in list(filter(lambda d: d.get('n') < network.n+1, network.officer_ids))] if officers: for officer in officers: - if 'address' in officer: - if processing.normalise_address(officer['address']) not in lower_n_addresses: - network.link_type = "Officer Corresponance Address" - new_address = {'address': processing.normalise_address(officer['address']), 'n':network.n+1, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node} - if new_address not in new_addresses: - new_addresses.append(new_address) - # network.addresses = network.addresses.append({'address': processing.normalise_address(officer['address']), 'n':network.n, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node}, ignore_index=True) - if officer['links']['officer']['appointments'].split('/')[2] not in lower_n_officers: - network.link_type = "Officer" - new_officer = {'officer_id': str(officer['links']['officer']['appointments'].split('/')[2]), 'name': processing.normalise_name(officer['name']), 'n':network.n+1, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node} - if new_officer not in new_officers: - new_officers.append(new_officer) - # network.officer_ids = network.officer_ids.append({'officer_id': officer['links']['officer']['appointments'].split('/')[2], 'name': processing.normalise_name(officer['name']), 'n':network.n, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node}, ignore_index=True) + # if 'address' in officer: + # # check address not already in the network + # if sugartrail.processing.normalise_address(officer['address']) not in lower_n_addresses: + # network.link_type = "Officer Corresponance Address" + # new_address = {'address': sugartrail.processing.normalise_address(officer['address']), 'n':network.n+1, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node} + # if new_address not in new_addresses: + # new_addresses.append(new_address) + # # check not already in the network + if officer['links']['officer']['appointments'].split('/')[2] not in lower_n_officers: + network.link_type = "Officer" + new_officer = {'officer_id': str(officer['links']['officer']['appointments'].split('/')[2]), 'name': sugartrail.processing.normalise_name(officer['name']), 'n':network.n+1, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node} + if new_officer not in new_officers: + new_officers.append(new_officer) if self.get_psc_correspondance_address: - psc = api.get_psc(company_id) + # get address for company pscs + psc = sugartrail.api.get_psc(company_id) if psc: - for person in psc['items']: - if "address" in person: - network.link_type = "Person of Significant Control Address" - if processing.normalise_address(person['address']) not in lower_n_addresses: - new_address = {'address': processing.normalise_address(person['address']), 'n':network.n+1, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node} - if new_address not in new_addresses: - new_addresses.append(new_address) + if 'items' in psc: + for person in psc['items']: + if "address" in person: + network.link_type = "Person of Significant Control Address" + if sugartrail.processing.normalise_address(person['address']) not in lower_n_addresses: + new_address = {'address': sugartrail.processing.normalise_address(person['address']), 'n':network.n+1, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node} + if new_address not in new_addresses: + new_addresses.append(new_address) if self.get_company_address_history: - address_history = processing.build_address_history(company_id) + # get company address history + address_history = sugartrail.processing.build_address_history(company_id) network.address_history.extend(address_history) for address in address_history: network.link_type = "Historic Address" - if address['address'] not in lower_n_addresses: - new_address = {'address': address['address'], 'n':network.n+1, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node} - if new_address not in new_addresses: - new_addresses.append(dict({'address': address['address'], 'n':network.n+1, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node})) - # network.addresses = network.addresses.append({'address': address['address'], 'n':network.n, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node}, ignore_index=True) + if 'address' in address: + if address['address'] not in lower_n_addresses: + new_address = {'address': address['address'], 'n':network.n+1, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node} + if new_address not in new_addresses: + new_addresses.append(dict({'address': address['address'], 'n':network.n+1, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node})) network.addresses.extend(new_addresses) network.officer_ids.extend(new_officers) def search_officer_id(self, network, officer_id): + """Gets officers, companies and addresses connected to input officer + (officer_id).""" new_addresses = [] new_companies = [] new_officers = [] @@ -392,13 +461,13 @@ class Network: lower_n_addresses = [address['address'] for address in list(filter(lambda d: d.get('n') < network.n+1, network.addresses))] lower_n_officers = [officer['officer_id'] for officer in list(filter(lambda d: d.get('n') < network.n+1, network.officer_ids))] lower_n_companies = [company['company_id'] for company in list(filter(lambda d: d.get('n') < network.n+1, network.company_ids))] - appointments = api.get_appointments(officer_id) + appointments = sugartrail.api.get_appointments(officer_id) if appointments: if self.officer_appointments_maxsize == None or len(appointments['items']) < int(self.officer_appointments_maxsize or 0): for appointment in appointments['items']: - if processing.normalise_address(appointment['address']) not in lower_n_addresses: + if sugartrail.processing.normalise_address(appointment['address']) not in lower_n_addresses: network.link_type = "Appointment Address" - new_address = {'address': processing.normalise_address(appointment['address']), 'n':network.n+1, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node} + new_address = {'address': sugartrail.processing.normalise_address(appointment['address']), 'n':network.n+1, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node} if new_address not in new_addresses: new_addresses.append(new_address) if appointment['appointed_to']['company_number'] not in lower_n_companies: @@ -409,15 +478,15 @@ class Network: elif len(appointments['items']) > int(self.officer_appointments_maxsize): network.maxsize_entities.append(dict({'node':officer_id,'type': 'Officer', 'maxsize_type': 'Appointments', 'size': len(appointments['items'])})) if self.get_officer_correspondance_address: - correspondance_address = api.get_correspondance_address(officer_id) + correspondance_address = sugartrail.api.get_correspondance_address(officer_id) if correspondance_address: - if processing.normalise_address(correspondance_address['items'][0]['address']) not in lower_n_addresses: + if sugartrail.processing.normalise_address(correspondance_address['items'][0]['address']) not in lower_n_addresses: network.link_type = "Officer Corresponance Address" - new_address = {'address': processing.normalise_address(correspondance_address['items'][0]['address']), 'n':network.n+1, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node} + new_address = {'address': sugartrail.processing.normalise_address(correspondance_address['items'][0]['address']), 'n':network.n+1, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node} if new_address not in new_addresses: new_addresses.append(new_address) if self.get_officer_duplicates: - duplicate_officers = api.get_duplicate_officers(officer_id) + duplicate_officers = sugartrail.api.get_duplicate_officers(officer_id) if duplicate_officers: if self.officer_duplicates_maxsize == None or len(duplicate_officers) < int(self.officer_duplicates_maxsize or 0): for duplicate in duplicate_officers: @@ -433,6 +502,8 @@ class Network: network.company_ids.extend(new_companies) def search_address(self, network, address, company_data): + """Gets officers, companies and addresses connected to input officer + (officer_id).""" new_companies = [] new_officers = [] network.node_type = "Address" @@ -443,9 +514,9 @@ class Network: if self.get_companies_at_address: companies = {} if company_data is not None: - companies['items'] = processing.get_companies_from_address_database(address, company_data) + companies['items'] = sugartrail.processing.get_companies_from_address_database(address, company_data) else: - companies = api.get_companies_at_address(address) + companies = sugartrail.api.get_companies_at_address(address) if companies: if 'items' in companies: if self.companies_at_address_maxsize == None or len(companies['items']) < int(self.companies_at_address_maxsize or 0): @@ -458,7 +529,7 @@ class Network: elif len(companies['items']) > int(self.companies_at_address_maxsize): network.maxsize_entities.append(dict({'node':address,'type': 'Address', 'maxsize_type': 'Companies', 'size': len(companies['items'])})) if self.get_officers_at_address: - officers = api.get_officers_at_address(address) + officers = sugartrail.api.get_officers_at_address(address) if officers: if self.officers_at_address_maxsize == None or len(officers) < int(self.officers_at_address_maxsize or 0): for officer in officers: diff --git a/sugartrail/mapview.py b/sugartrail/mapview.py index c5e8823..0143c03 100644 --- a/sugartrail/mapview.py +++ b/sugartrail/mapview.py @@ -5,12 +5,120 @@ import functools import math def build_map(network, clear_widget=True): + """Generates map and table for displaying paths for input network data.""" if clear_widget: Widget.close_all() m, path_table = load_map_data(network) return m, path_table +def load_map_data(network): + """Adds data from input network to map in 3 layers; marker_cluster, + address_trail and origin_trail. marker_cluster contains all the companies + in the network geolocated, address_trail contains all the historic address + antpaths and origin_trail contains all the antpaths connecting companies + through other companies towards the origin company.""" + # initialise historic address trail antpath + address_trail = AntPath( + locations=[], + dash_array=[1,10], + delay=1000, + color='#ed2f2f', + pulse_color='#FFFFFF' + ) + # initialise trail from company to origin antpath + origin_trail = AntPath( + locations=[], + dash_array=[1,10], + delay=1000, + color='#000000', + pulse_color='#FFFFFF' + ) + # initialise table for printing company to origin trail + path_table = HTML( + value="" + ) + # initialise map + m = Map(center=(50, 0), + zoom=5, + layout=Layout(width='90%', height='650px')) + # add antpath layers + m.add_layer(address_trail) + m.add_layer(origin_trail) + # add marker for each company in network + marker_cluster = MarkerCluster( + center=(50, 0), + markers=get_marker_data(network, address_trail, origin_trail, path_table), + disable_clustering_at_zoom = 25, + max_cluster_radius = 25 + ) + # add markers as layer + m.add_layer(marker_cluster) + return m, path_table + +def get_marker_data(network,address_trail, origin_trail, path_table): + """Generates a marker for each company historic address.""" + markers = [] + for index, row in enumerate(network.address_history): + if row['lat'] and row['lon']: + marker_color = "green" + # locate company at historic address + company = list(filter(lambda d: d.get('company_number') == row['company_number'], network.companies))[0] + company_name = company['company_name'] + company_status = company['company_status'] + if company_status == "active": + if row['end_date']: + marker_color = "red" + else: + marker_color = "black" + address = row['address'] + # find path from company to origin + path = network.find_path(str(row['company_number'])) + locations_from_origin = locations_from_origin_path(path, network) + message = HTML() + message.value = str(company_name) + "
" + str(address) + icon = AwesomeIcon( + marker_color=marker_color + ) + # find historic addresses path for company + address_path = get_address_path(network,str(row['company_number'])) + marker = Marker(icon=icon, opacity=1, location=(row['lat'], row['lon']), draggable=False, popup=message, title="Address") + # attach on click behavoir for marker + marker.on_click(functools.partial(on_button_clicked, address_path=address_path, address_trail=address_trail, path_table=path_table, origin_trail=origin_trail, path=path, location=(row['lat'], row['lon']), locations_from_origin = locations_from_origin)) + markers.append(marker) + return markers + +def locations_from_origin_path(path, network): + """Returns list of addresses found within origin path.""" + locations = [] + for node in path: + if node['type'] == 'Company': + # finds location for company node + company_address_history = list(filter(lambda d: d.get('company_number') == node['id'], network.address_history)) + company_address_history_sorted = sorted(company_address_history, key=lambda d: d['start_date'], reverse=True) + last_company_address_row = {} + for address_row in company_address_history_sorted: + if address_row['lat'] and address_row['lon']: + last_company_address_row = address_row + break + if last_company_address_row: + lat = last_company_address_row['lat'] + lon = last_company_address_row['lon'] + if not lat or not lon: + pass + else: + locations.append([lat,lon]) + elif node['type'] == 'Address': + address_row = list(filter(lambda d: d.get('address') == node['node'], network.addresses))[0] + lat = address_row['lat'] + lon = address_row['lon'] + if not lat or not lon: + pass + else: + locations.append([lat,lon]) + return locations + def get_address_path(network, company_id): + """Returns list of historic addresses for input company (company_id).""" company_address_history = list(filter(lambda d: d.get('company_number') == company_id, network.address_history)) company_address_history_sorted = sorted(company_address_history, key=lambda d: d['start_date'], reverse=True) address_path = [] @@ -21,38 +129,8 @@ def get_address_path(network, company_id): address_path.insert(0,[row['lat'], row['lon']]) return address_path -def locations_from_origin_path(path, network): - locations = [] - for node in path: - if node['type'] == 'Company': - ### - company_address_history = list(filter(lambda d: d.get('company_number') == node['id'], network.address_history)) - company_address_history_sorted = sorted(company_address_history, key=lambda d: d['start_date'], reverse=True) - last_company_address_row = {} - for address_row in company_address_history_sorted: - if address_row['lat'] and address_row['lon']: - last_company_address_row = address_row - break - # last_company_address_row = list(filter(lambda d: d.get('company_number') == node['id'], network.address_history))[0] - if last_company_address_row: - lat = last_company_address_row['lat'] - lon = last_company_address_row['lon'] - if not lat or not lon: - pass - else: - locations.append([lat,lon]) - elif node['type'] == 'Address': - address_row = list(filter(lambda d: d.get('address') == node['node'], network.addresses))[0] - # address_row = network.addresses.loc[network.addresses['address'] == node['node']].iloc[:1] - lat = address_row['lat'] - lon = address_row['lon'] - if not lat or not lon: - pass - else: - locations.append([lat,lon]) - return locations - def on_button_clicked(address_path, path, location, address_trail, path_table, origin_trail, locations_from_origin, **kwargs): + """Adds data to map layers that will render when marker is clicked.""" address_trail.locations = address_path locations_from_origin[-1] = location origin_trail.locations = locations_from_origin @@ -60,6 +138,7 @@ def on_button_clicked(address_path, path, location, address_trail, path_table, o return def html_table_generator(path): + """Generates table for displaying origin path data.""" table_style = '' headers = ['Node Index', 'Node', 'Hop', 'Node Type', 'Link'] headers_row = "" @@ -70,67 +149,3 @@ def html_table_generator(path): nodes += '' + node['node_index'] + '' + str(node['node']) + '' + str(node['hop']) + '' + str(node['node_type']) + '' + str(node['link']) + '' table_html = table_style + '' + headers_row + '' + nodes + '
' return table_html - -def load_map_data(network): - address_trail = AntPath( - locations=[], - dash_array=[1,10], - delay=1000, - color='#ed2f2f', - pulse_color='#FFFFFF' - ) - origin_trail = AntPath( - locations=[], - dash_array=[1,10], - delay=1000, - color='#000000', - pulse_color='#FFFFFF' - ) - path_table = HTML( - value="" - ) - m = Map(center=(50, 0), - zoom=5, - layout=Layout(width='90%', height='650px')) - m.add_layer(address_trail) - m.add_layer(origin_trail) - marker_cluster = MarkerCluster( - center=(50, 0), - markers=get_marker_data(network, address_trail, origin_trail, path_table), - disable_clustering_at_zoom = 25, - max_cluster_radius = 25 - ) - m.add_layer(marker_cluster) - return m, path_table - -def get_marker_data(network,address_trail, origin_trail, path_table): - address_trail=address_trail - origin_trail=origin_trail - ms = [] - for index, row in enumerate(network.address_history): - if row['lat'] and row['lon']: - path = "" - locations_from_origin = "" - message = HTML() - marker_color = "green" - company = list(filter(lambda d: d.get('company_number') == row['company_number'], network.companies))[0] - # company = network.companies.loc[network.companies['company_number'] == row['company_number']] - company_name = company['company_name'] - company_status = company['company_status'] - if company_status == "active": - if row['end_date']: - marker_color = "red" - else: - marker_color = "black" - address = row['address'] - path = network.find_path(str(row['company_number'])) - locations_from_origin = locations_from_origin_path(path, network) - message.value = str(company_name) + "
" + str(address) - icon = AwesomeIcon( - marker_color=marker_color - ) - address_path = get_address_path(network,str(row['company_number'])) - marker = Marker(icon=icon, opacity=1, location=(row['lat'], row['lon']), draggable=False, popup=message, title="Address") - marker.on_click(functools.partial(on_button_clicked, address_path=address_path, address_trail=address_trail, path_table=path_table, origin_trail=origin_trail, path=path, location=(row['lat'], row['lon']), locations_from_origin = locations_from_origin)) - ms.append(marker) - return ms diff --git a/sugartrail/processing.py b/sugartrail/processing.py index b1486da..92ffc8c 100644 --- a/sugartrail/processing.py +++ b/sugartrail/processing.py @@ -7,6 +7,7 @@ import regex as re import collections def flatten(d, parent_key='', sep='.'): + """Flatten nested dictionary.""" items = [] for k, v in d.items(): new_key = parent_key + sep + k if parent_key else k @@ -17,6 +18,7 @@ def flatten(d, parent_key='', sep='.'): return dict(items) def infer_postcode(address_string): + """Extracts UK postcode from input address string with regex.""" postcode = re.findall(r'\b[A-Z]{1,2}[0-9][A-Z0-9]? [0-9][ABD-HJLNP-UW-Z]{2}\b', address_string) if postcode: return postcode[0] @@ -24,12 +26,15 @@ def infer_postcode(address_string): return def get_companies_from_address_database(address, company_data): + """Searches input dataframe (company_data) for companies at input address + (address) and returns list of dicts.""" companies = company_data[company_data[' RegAddress.AddressLine2'].apply(lambda x: str(x).upper() in address.upper()) | company_data['RegAddress.AddressLine1'].apply(lambda x: str(x).upper() in address.upper()) & company_data['RegAddress.PostCode'].apply(lambda x: str(x).upper() in address.upper())] companies = companies.rename(columns={'CompanyName': 'company_name', ' CompanyNumber': 'company_number', 'CompanyStatus': 'company_status', 'CompanyCategory': 'company_type', 'RegAddress.AddressLine1': 'address_line_1', ' RegAddress.AddressLine2': 'address_line_2', 'RegAddress.PostCode': 'postal_code', 'RegAddress.PostTown': 'locality', 'RegAddress.Country': 'country', 'IncorporationDate':'date_of_creation', 'DissolutionDate': 'date_of_cessation'}) companies['registered_office_address'] = [{'address_line_1': row['address_line_1'], 'address_line_2': row['address_line_2'], 'locality': row['locality'], 'postal_code': row['postal_code'], 'country': row['country']} for i,row in companies.iterrows()] return companies.to_dict('records') def get_nearby_postcode(postcode_string): + """Find closest nearby postcode to input postcode (postcode_string).""" url = "http://api.postcodes.io/postcodes/" + postcode_string[:-1] + "/autocomplete" response = requests.get(url).json() if response['result'] != None: @@ -44,6 +49,7 @@ def get_nearby_postcode(postcode_string): return closest_address["postcode"] def get_coords_from_address(address_string): + """Attempt retrieval of coords for input address string.""" address = urllib.parse.quote(address_string) url = 'https://nominatim.openstreetmap.org/search/' + urllib.parse.quote(address) +'?format=json' response = requests.get(url).json() @@ -70,11 +76,14 @@ def get_coords_from_address(address_string): print("No postcode found for: " + address_string) def normalise_name(name): + """Move first word (often surname) from the beginning to the end of string.""" name_list = name.replace(',','').split(" ") name_list.append(name_list.pop(0)) return ' '.join(name_list) def process_address_changes(address_changes): + """Attempt retrieval of 'new_address' value if Companies House record is + incomplete.""" for i in reversed(range(1,len(address_changes['items']))): if 'new_address' not in address_changes['items'][i]['description_values'].keys(): if 'old_address' in address_changes['items'][i-1]['description_values'].keys(): @@ -82,6 +91,8 @@ def process_address_changes(address_changes): return address_changes def build_address_history(company_id): + """Returns a list of dicts containing historic addresses for input company + (company_id).""" company_info = api.get_company(company_id) if company_info: company_info_subset = {k:company_info[k] for k in ("date_of_creation","date_of_cessation","registered_office_address") if k in company_info} @@ -89,6 +100,7 @@ def build_address_history(company_id): address_keys = ('start_date','end_date','address') if address_changes: if address_changes['items']: + # attempt to retrieve any missing items within address changes address_changes = process_address_changes(address_changes) addresses = [] entry = {} @@ -148,6 +160,7 @@ def build_address_history(company_id): return [] def normalise_address(address_dict): + """Joins address key values into a single str.""" address_list = [] for key in ['premises','address_line_1', 'locality','postal_code', 'country']: if key in address_dict: