diff --git a/sugartrail/base.py b/sugartrail/base.py index fba6a42..c0ecea2 100644 --- a/sugartrail/base.py +++ b/sugartrail/base.py @@ -66,15 +66,16 @@ class Network: self.addresses = self.addresses.iloc[0:0] if self._officer_id: if api.get_appointments(self._officer_id): - self.officer_ids = self.officer_ids.append({'officer_id': self._officer_id, 'name': api.get_appointments(self._officer_id)['items'][0]['name'], 'n':self.n, 'link_type': None, 'node_type': None, 'node': None}, ignore_index=True) + self.officer_ids = pd.DataFrame([{'officer_id': self._officer_id, 'name': api.get_appointments(self._officer_id)['items'][0]['name'], 'n':self.n, 'link_type': None, 'node_type': None, 'node': None}]) elif self.company_id: - self.company_ids = self.company_ids.append({'company_id': self._company_id, 'n':self.n, 'link_type': None, 'node_type': None, 'node': None}, ignore_index=True) + self.company_ids = pd.DataFrame([{'company_id': self._company_id, 'n':self.n, 'link_type': None, 'node_type': None, 'node': None}]) company = api.get_company(self._company_id) # company['n'] = self.n - company['link_type'] = self.link_type - self.companies = self.companies.append(pd.json_normalize(company), ignore_index=True) + # company['link_type'] = self.link_type + self.companies = pd.DataFrame(pd.json_normalize(company)) + # self.companies = pd.DataFrame([company]) elif self._address: - self.addresses = self.addresses.append({'address': self._address, 'n':self.n, 'link_type': None, 'node_type': None, 'node': None,}, ignore_index=True) + self.addresses = pd.DataFrame.from_dict([{'address': self._address, 'n':self.n, 'link_type': None, 'node_type': None, 'node': None,}]) else: print("No input provided. Please provide either officer_id, company_id or address value as input.") @@ -93,6 +94,8 @@ class Network: print("add valid company id") else: company_list = self.company_ids['company_id'].unique() + # companies + companies = [] for i, company_id in enumerate(company_list): IPython.display.clear_output(wait=True) if print_progress: @@ -102,18 +105,23 @@ class Network: try: company = company_df[company_df[" CompanyNumber"] == str(company_id)]["CompanyName"].item() if company: - self.companies = self.companies.append(pd.json_normalize(company), ignore_index=True) + # self.companies = self.companies.append(pd.json_normalize(company), ignore_index=True) + companies.append(company) except: try: company = api.get_company(company_id) if company: - self.companies = self.companies.append(pd.json_normalize(company), ignore_index=True) + # self.companies = self.companies.append(pd.json_normalize(company), ignore_index=True) + companies.append(company) except: print(f"Failed to get data for {company_id}") else: company = api.get_company(company_id) if company: - self.companies = self.companies.append(pd.json_normalize(company), ignore_index=True) + # self.companies = self.companies.append(pd.json_normalize(company), ignore_index=True) + companies.append(company) + # add companies to dataframe + self.companies = self.companies.append(companies, ignore_index=True) def run_map_preprocessing(self): self.get_company_from_id() @@ -202,6 +210,7 @@ class Network: return sorted_path def perform_hop(self, hops, company_data=None): + hop_history = [] for hop in range(hops): selected_addresses = self.addresses.loc[self.addresses['n'] == self.n]['address'] selected_companies = self.company_ids.loc[self.company_ids['n'] == self.n]['company_id'] @@ -211,7 +220,8 @@ class Network: break else: self.n += 1 - self.hop_history = self.hop_history.append(self.hop.__dict__, ignore_index=True) + hop_history.append(self.hop.__dict__) + # self.hop_history = self.hop_history.append(self.hop.__dict__, ignore_index=True) for i,address in enumerate(selected_addresses): self.hop.search_address(self, address, company_data) IPython.display.clear_output(wait=True) @@ -230,6 +240,7 @@ class Network: print("Processed " + str(len(selected_addresses)) + "/" + str(len(selected_addresses)) + " addresses.") print("Processed " + str(len(selected_companies)) + "/" + str(len(selected_companies)) + " companies.") print("Processed " + str(k+1) + "/" + str(len(selected_officers)) + " officers.") + self.hop_history = self.hop_history.append(hop_history) class Hop: def __init__(self): @@ -248,6 +259,8 @@ class Network: def search_company_id(self, network, company_id): officers = [] + new_addresses = [] + new_officers = [] if self.get_company_officers: officers = api.get_company_officers(company_id) if officers: @@ -258,10 +271,16 @@ class Network: for officer in officers: if processing.normalise_address(officer['address']) not in network.addresses[network.addresses['n'] < network.n]['address'].unique(): network.link_type = "Officer Corresponance Address" - network.addresses = network.addresses.append({'address': processing.normalise_address(officer['address']), 'n':network.n, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node}, ignore_index=True) + new_address = {'address': processing.normalise_address(officer['address']), 'n':network.n, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node} + if new_address not in new_addresses: + new_addresses.append(new_address) + # network.addresses = network.addresses.append({'address': processing.normalise_address(officer['address']), 'n':network.n, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node}, ignore_index=True) if officer['links']['officer']['appointments'].split('/')[2] not in network.officer_ids[network.officer_ids['n'] < network.n]['officer_id'].unique(): network.link_type = "Officer" - network.officer_ids = network.officer_ids.append({'officer_id': officer['links']['officer']['appointments'].split('/')[2], 'name': processing.normalise_name(officer['name']), 'n':network.n, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node}, ignore_index=True) + new_officer = {'officer_id': str(officer['links']['officer']['appointments'].split('/')[2]), 'name': processing.normalise_name(officer['name']), 'n':network.n, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node} + if new_officer not in new_officers: + new_officers.append(new_officer) + # network.officer_ids = network.officer_ids.append({'officer_id': officer['links']['officer']['appointments'].split('/')[2], 'name': processing.normalise_name(officer['name']), 'n':network.n, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node}, ignore_index=True) if self.get_psc_correspondance_address: psc = api.get_psc(company_id) if psc: @@ -269,19 +288,31 @@ class Network: if "address" in person: network.link_type = "Person of Significant Control Address" if processing.normalise_address(person['address']) not in network.addresses[network.addresses['n'] < network.n]['address'].unique(): - network.addresses = network.addresses.append({'address': processing.normalise_address(person['address']), 'n':network.n, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node}, ignore_index=True) + new_address = {'address': processing.normalise_address(person['address']), 'n':network.n, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node} + if new_address not in new_addresses: + new_addresses.append(new_address) + # network.addresses = network.addresses.append({'address': processing.normalise_address(person['address']), 'n':network.n, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node}, ignore_index=True) if self.get_company_address_history: address_history = processing.build_address_history(company_id) network.address_history = network.address_history.append(address_history, ignore_index=True) for address in address_history: network.link_type = "Historic Address" if address['address'] not in network.addresses[network.addresses['n'] < network.n]['address'].unique(): - network.addresses = network.addresses.append({'address': address['address'], 'n':network.n, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node}, ignore_index=True) - network.address_history = network.address_history.drop_duplicates().reset_index(drop=True) - network.addresses = network.addresses.drop_duplicates().reset_index(drop=True) - network.officer_ids = network.officer_ids.drop_duplicates().reset_index(drop=True) + new_address = {'address': address['address'], 'n':network.n, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node} + if new_address not in new_addresses: + new_addresses.append({'address': address['address'], 'n':network.n, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node}) + # network.addresses = network.addresses.append({'address': address['address'], 'n':network.n, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node}, ignore_index=True) + network.addresses = network.addresses.append(new_addresses, ignore_index=True) + network.officer_ids = network.officer_ids.append(new_officers, ignore_index=True) + # network.addresses = network.addresses.drop_duplicates().reset_index(drop=True) + # network.officer_ids = network.officer_ids.drop_duplicates().reset_index(drop=True) + # network.address_history = network.address_history.drop_duplicates().reset_index(drop=True) + def search_officer_id(self, network, officer_id): + new_addresses = [] + new_companies = [] + new_officers = [] network.node_type = "Person" network.node = officer_id appointments = api.get_appointments(officer_id) @@ -290,10 +321,16 @@ class Network: for appointment in appointments['items']: if processing.normalise_address(appointment['address']) not in network.addresses[network.addresses['n'] < network.n]['address'].unique(): network.link_type = "Appointment Address" - network.addresses = network.addresses.append({'address': processing.normalise_address(appointment['address']), 'n':network.n, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node}, ignore_index=True) + new_address = {'address': processing.normalise_address(appointment['address']), 'n':network.n, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node} + if new_address not in new_addresses: + new_addresses.append(new_address) + # network.addresses = network.addresses.append({'address': processing.normalise_address(appointment['address']), 'n':network.n, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node}, ignore_index=True) if appointment['appointed_to']['company_number'] not in network.company_ids[network.company_ids['n'] < network.n]['company_id'].unique(): network.link_type = "Appointment" - network.company_ids = network.company_ids.append({'company_id': appointment['appointed_to']['company_number'], 'n':network.n, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node}, ignore_index=True) + # network.company_ids = network.company_ids.append({'company_id': appointment['appointed_to']['company_number'], 'n':network.n, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node}, ignore_index=True) + new_company = {'company_id': appointment['appointed_to']['company_number'], 'n':network.n, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node} + if new_company not in new_companies: + new_companies.append(new_company) elif len(appointments['items']) > int(self.officer_appointments_maxsize): network.maxsize_entities = network.maxsize_entities.append({'node':officer_id,'type': 'Officer', 'maxsize_type': 'Appointments', 'size': len(appointments['items'])}, ignore_index=True) if self.get_officer_correspondance_address: @@ -301,7 +338,10 @@ class Network: if correspondance_address: if processing.normalise_address(correspondance_address['items'][0]['address']) not in network.addresses[network.addresses['n'] < network.n]['address'].unique(): network.link_type = "Officer Corresponance Address" - network.addresses = network.addresses.append({'address': processing.normalise_address(correspondance_address['items'][0]['address']), 'n':network.n, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node}, ignore_index=True) + new_address = {'address': processing.normalise_address(correspondance_address['items'][0]['address']), 'n':network.n, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node} + if new_address not in new_addresses: + new_addresses.append(new_address) + # network.addresses = network.addresses.append({'address': processing.normalise_address(correspondance_address['items'][0]['address']), 'n':network.n, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node}, ignore_index=True) if self.get_officer_duplicates: duplicate_officers = api.get_duplicate_officers(officer_id) if duplicate_officers: @@ -309,18 +349,25 @@ class Network: for duplicate in duplicate_officers: network.link_type = "Duplicate Officer" if duplicate['links']['self'].split('/')[2] not in network.officer_ids[network.officer_ids['n'] < network.n]['officer_id'].unique(): - network.officer_ids = network.officer_ids.append({'officer_id': duplicate['links']['self'].split('/')[2], 'name': duplicate['title'], 'n':network.n, 'link_type': network.link_type, 'node_type': network.node_type, 'node': network.node}, ignore_index=True) + new_officer = {'officer_id': duplicate['links']['self'].split('/')[2], 'name': duplicate['title'], 'n':network.n, 'link_type': network.link_type, 'node_type': network.node_type, 'node': network.node} + if new_officer not in new_officers: + new_officers.append(new_officer) + # network.officer_ids = network.officer_ids.append({'officer_id': duplicate['links']['self'].split('/')[2], 'name': duplicate['title'], 'n':network.n, 'link_type': network.link_type, 'node_type': network.node_type, 'node': network.node}, ignore_index=True) elif len(duplicate_officers) > int(self.officer_duplicates_maxsize): network.maxsize_entities = network.maxsize_entities.append({'node':officer_id,'type': 'Officer', 'maxsize_type': 'Duplicates', 'size': len(duplicate_officers)}, ignore_index=True) - network.addresses = network.addresses.drop_duplicates().reset_index(drop=True) - network.officer_ids = network.officer_ids.drop_duplicates().reset_index(drop=True) - network.company_ids = network.company_ids.drop_duplicates().reset_index(drop=True) + network.addresses = network.addresses.append(new_addresses) + network.officer_ids = network.officer_ids.append(new_officers, ignore_index=True) + network.company_ids = network.company_ids.append(new_companies, ignore_index=True) + # network.addresses = network.addresses.drop_duplicates().reset_index(drop=True) + # network.officer_ids = network.officer_ids.drop_duplicates().reset_index(drop=True) + # network.company_ids = network.company_ids.drop_duplicates().reset_index(drop=True) def search_address(self, network, address, company_data): + new_companies = [] + new_officers = [] network.node_type = "Address" network.node = address if self.get_companies_at_address: - # database method here: companies = {} if company_data is not None: companies['items'] = processing.get_companies_from_address_database(address, company_data) @@ -328,15 +375,14 @@ class Network: companies = api.get_companies_at_address(address) if companies: if self.companies_at_address_maxsize == None or len(companies['items']) < int(self.companies_at_address_maxsize or 0): - company_ids = [] for company in companies['items']: network.link_type = "Company at Address" if company['company_number'] not in network.company_ids[network.company_ids['n'] < network.n]['company_id'].unique(): - company_ids.append({'company_id': company['company_number'], 'n':network.n, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node}) - network.company_ids = network.company_ids.append(company_ids, ignore_index=True) + new_company = {'company_id': company['company_number'], 'n':network.n, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node} + if new_company not in new_companies: + new_companies.append(new_company) elif len(companies['items']) > int(self.companies_at_address_maxsize): network.maxsize_entities = network.maxsize_entities.append({'node':address,'type': 'Address', 'maxsize_type': 'Companies', 'size': len(companies['items'])},ignore_index=True) - if self.get_officers_at_address: officers = api.get_officers_at_address(address) if officers: @@ -344,8 +390,13 @@ class Network: for officer in officers: network.link_type = "Officer at Address" if officer['links']['self'].split('/')[2] not in network.officer_ids[network.officer_ids['n'] < network.n]['officer_id'].unique(): - network.officer_ids = network.officer_ids.append({'officer_id': officer['links']['self'].split('/')[2], 'name': officer['title'], 'n':network.n, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node}, ignore_index=True) + new_officer = {'officer_id': officer['links']['self'].split('/')[2], 'name': officer['title'], 'n':network.n, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node} + if new_officer not in new_officers: + new_officers.append(new_officer) + # network.officer_ids = network.officer_ids.append({'officer_id': officer['links']['self'].split('/')[2], 'name': officer['title'], 'n':network.n, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node}, ignore_index=True) elif len(officers) > int(self.officers_at_address_maxsize): network.maxsize_entities = network.maxsize_entities.append({'node':address,'type': 'Address', 'maxsize_type': 'Officers', 'size': len(officers)},ignore_index=True) - network.officer_ids = network.officer_ids.drop_duplicates().reset_index(drop=True) - network.company_ids = network.company_ids.drop_duplicates().reset_index(drop=True) + network.officer_ids = network.officer_ids.append(new_officers, ignore_index=True) + network.company_ids = network.company_ids.append(new_companies, ignore_index=True) + # network.officer_ids = network.officer_ids.drop_duplicates().reset_index(drop=True) + # network.company_ids = network.company_ids.drop_duplicates().reset_index(drop=True) diff --git a/sugartrail/mapview.py b/sugartrail/mapview.py index 1d5c82c..bf38773 100644 --- a/sugartrail/mapview.py +++ b/sugartrail/mapview.py @@ -1,6 +1,5 @@ from ipywidgets import HTML, Widget, Layout, Output, VBox, HBox, Textarea from ipyleaflet import Map, Marker, MarkerCluster, AwesomeIcon, AntPath, Popup -import pandas as pd from datetime import datetime import functools from string import ascii_lowercase as alc diff --git a/sugartrail/processing.py b/sugartrail/processing.py index 6cd2761..ffe4dae 100644 --- a/sugartrail/processing.py +++ b/sugartrail/processing.py @@ -1,6 +1,6 @@ from sugartrail import api import requests -import pandas as pd +# import pandas as pd import random import urllib import regex as re @@ -18,12 +18,12 @@ def get_companies_from_address_database(address, company_data): companies['registered_office_address'] = [{'address_line_1': row['address_line_1'], 'address_line_2': row['address_line_2'], 'locality': row['locality'], 'postal_code': row['postal_code'], 'country': row['country']} for i,row in companies.iterrows()] return companies.to_dict('records') -def load_company_data(company_data_filepath): - try: - company_data = pd.read_csv(company_data_filepath) - return company_data - except: - return +# def load_company_data(company_data_filepath): +# try: +# company_data = pd.read_csv(company_data_filepath) +# return company_data +# except: +# return def get_nearby_postcode(postcode_string): url = "http://api.postcodes.io/postcodes/" + postcode_string[:-1] + "/autocomplete"