sugartrail update

This commit is contained in:
seangreaves
2022-12-27 21:46:30 +00:00
parent 8acac7b40f
commit e115000064
6 changed files with 128 additions and 90 deletions

View File

@@ -10,7 +10,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 15,
"id": "f17ebdd2",
"metadata": {},
"outputs": [],
@@ -29,16 +29,16 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 16,
"id": "4a9639e6",
"metadata": {},
"outputs": [],
"source": [
"# # network build from Domain Foundation, company_id = \"11951034\"\n",
"# import pickle\n",
"import pickle\n",
"\n",
"# with open('assets/networks/domain_corp_network.pickle', 'rb') as handle:\n",
"# network = pickle.load(handle)"
"with open('assets/networks/domain_corp_network.pickle', 'rb') as handle:\n",
" network = pickle.load(handle)"
]
},
{
@@ -118,7 +118,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 17,
"id": "01dca0cf",
"metadata": {
"scrolled": true,
@@ -126,7 +126,23 @@
"7"
]
},
"outputs": [],
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "a3cd4271e1074f26b5b9cc0a8bd8c430",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"VBox(children=(Map(center=[50, 0], controls=(ZoomControl(options=['position', 'zoom_in_text', 'zoom_in_title',…"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"map_data,path_table = mapview.build_map(network) \n",
"hbox = HBox([path_table])\n",

View File

@@ -30,7 +30,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"id": "81c37bf3",
"metadata": {},
"outputs": [],
@@ -537,7 +537,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.15"
"version": "3.9.12"
}
},
"nbformat": 4,

View File

@@ -47,7 +47,7 @@ def get_correspondance_address(officer_id):
def get_appointments(officer_id):
url = "https://api.company-information.service.gov.uk/officers/" + officer_id + "/appointments"
return make_request(url, officer_id, 'officer', 'appointments')['items']
return make_request(url, officer_id, 'officer', 'appointments')
def get_duplicate_officers(officer_id):
url = "https://api.company-information.service.gov.uk/officers/" + officer_id + "/appointments"

View File

@@ -65,7 +65,8 @@ class Network:
self.officer_ids = self.officer_ids.iloc[0:0]
self.addresses = self.addresses.iloc[0:0]
if self._officer_id:
self.officer_ids = self.officer_ids.append({'officer_id': self._officer_id, 'name': api.get_appointments(self._officer_id)[0]['name'], 'n':self.n, 'link_type': None, 'node_type': None, 'node': None}, ignore_index=True)
if api.get_appointments(self._officer_id):
self.officer_ids = self.officer_ids.append({'officer_id': self._officer_id, 'name': api.get_appointments(self._officer_id)['items'][0]['name'], 'n':self.n, 'link_type': None, 'node_type': None, 'node': None}, ignore_index=True)
elif self.company_id:
self.company_ids = self.company_ids.append({'company_id': self._company_id, 'n':self.n, 'link_type': None, 'node_type': None, 'node': None}, ignore_index=True)
company = api.get_company(self._company_id)
@@ -134,7 +135,7 @@ class Network:
path = []
company_info = self.get_company_from_id(company_id=select_company, print_progress=False)
for i, row in network_link_type_rows.iterrows():
path.insert(0, {'n': row['n'], "type": "Company", "id": select_company, "value": self.companies[self.companies["company_number"] == select_company]['company_name'].item(), "link_type": row['link_type'], "link": row['node']})
path.insert(0, {'hop': row['n'], "type": "Company", "id": select_company, "node": self.companies[self.companies["company_number"] == select_company]['company_name'].item(), "node_type": row['link_type'], "link_id": row['node']})
search_terms = [{'n': row['n']-1, 'node_type':row['node_type'], 'node':row['node']}]
for j in range(row['n']-1,-1,-1):
for term in search_terms:
@@ -143,27 +144,26 @@ class Network:
select_rows = self.addresses.loc[(self.addresses['address'] == term['node']) & (self.addresses['n'] == j)]
for k, select_row in select_rows.iterrows():
if select_row['n'] == 0:
origin = {'n': j, "type": "Address", "id": select_row['address'], "value": select_row['address'], "link_type": "", "link": ""}
origin = {'hop': j, "type": "Address", "id": select_row['address'], "node": select_row['address'], "node_type": "", "link_id": ""}
if origin not in path:
path.insert(0, origin)
break
else:
item = {'n': j, "type": "Address", "id": select_row['address'], "value": select_row['address'], "link_type": select_row['link_type'], "link": select_row['node']}
item = {'hop': j, "type": "Address", "id": select_row['address'], "node": select_row['address'], "node_type": select_row['link_type'], "link_id": select_row['node']}
if item not in path:
path.insert(0, item)
search_terms.append({'n': j-1, 'node_type':select_row['node_type'], 'node':select_row['node']})
break
elif term['node_type'] == "Company":
select_rows = self.company_ids.loc[(self.company_ids['company_id'] == term['node']) & (self.company_ids['n'] == j)]
for l, select_row in select_rows.iterrows():
self.get_company_from_id(company_id=select_row['company_id'], print_progress=False)
if select_row['n'] == 0:
origin = {'n': j, "type": "Company", "id": select_row['company_id'], "value": self.companies[self.companies["company_number"] == select_row['company_id']]['company_name'].item(), "link_type": "", "link": ""}
origin = {'hop': j, "type": "Company", "id": select_row['company_id'], "node": self.companies[self.companies["company_number"] == select_row['company_id']]['company_name'].item(), "node_type": "", "link_id": ""}
if origin not in path:
path.insert(0, origin)
break
else:
item = {'n': j, "type": "Company", "id": select_row['company_id'], "value": self.companies[self.companies["company_number"] == select_row['company_id']]['company_name'].item(), "link_type": select_row['link_type'], "link": select_row['node']}
item = {'hop': j, "type": "Company", "id": select_row['company_id'], "node": self.companies[self.companies["company_number"] == select_row['company_id']]['company_name'].item(), "node_type": select_row['link_type'], "link_id": select_row['node']}
if item not in path:
path.insert(0, item)
search_terms.append({'n': j-1, 'node_type':select_row['node_type'], 'node':select_row['node']})
@@ -171,31 +171,30 @@ class Network:
select_rows = self.officer_ids.loc[(self.officer_ids['officer_id'] == term['node']) & (self.officer_ids['n'] == j)]
for m, select_row in select_rows.iterrows():
if select_row['link_type'] == 0:
origin = {'n': j, "type": "Person", "id": select_row["officer_id"], "value": select_row['name'], "link_type": "", "link": ""}
origin = {'hop': j, "type": "Person", "id": select_row["officer_id"], "node": select_row['name'], "node_type": "", "link_id": ""}
if origin not in path:
path.insert(0, origin)
break
else:
item = {'n': j, "type": "Person", "id": select_row["officer_id"], "value": str(select_row['name']), "link_type": str(select_row['link_type']), "link": select_row['node']}
item = {'hop': j, "type": "Person", "id": select_row["officer_id"], "node": str(select_row['name']), "node_type": str(select_row['link_type']), "link_id": select_row['node']}
if item not in path:
path.insert(0, item)
search_terms.append({'n': j-1, 'node_type':select_row['node_type'], 'node':select_row['node']})
break
else:
print(f"{row['node_type']} is invalid node_type")
break
sorted_path = sorted(path, key=lambda d: d['n'])
sorted_path = sorted(path, key=lambda d: d['hop'])
for i in range(len(sorted_path)-1,-1,-1):
search_term = sorted_path[i]['link']
search_term = sorted_path[i]['link_id']
link_indices = []
for j,item in enumerate(sorted_path):
if item['id'] == search_term:
link_indices.append(alc[j].upper())
sorted_path[i]["links_to"] = ','.join(link_indices)
sorted_path[i]["link"] = ','.join(link_indices)
sorted_path[i]["node_index"] = alc[i].upper()
return sorted_path
def perform_hop(self, hops):
def perform_hop(self, hops, company_data=None):
for hop in range(hops):
selected_addresses = self.addresses.loc[self.addresses['n'] == self.n]['address']
selected_companies = self.company_ids.loc[self.company_ids['n'] == self.n]['company_id']
@@ -207,7 +206,7 @@ class Network:
self.n += 1
self.hop_history = self.hop_history.append(self.hop.__dict__, ignore_index=True)
for i,address in enumerate(selected_addresses):
self.hop.search_address(self, address)
self.hop.search_address(self, address, company_data)
IPython.display.clear_output(wait=True)
print("Hop number: " + str(hop+1))
print("Processed " + str(i+1) + "/" + str(len(selected_addresses)) + " addresses.")
@@ -243,8 +242,9 @@ class Network:
def search_company_id(self, network, company_id):
officers = []
if self.get_company_officers:
if api.get_company_officers(company_id):
officers = api.get_company_officers(company_id)['items']
officers = api.get_company_officers(company_id)
if officers:
officers = officers['items']
network.node_type = "Company"
network.node = company_id
if officers:
@@ -277,18 +277,18 @@ class Network:
def search_officer_id(self, network, officer_id):
network.node_type = "Person"
network.node = officer_id
if self.get_officer_appointments:
appointments = api.get_appointments(officer_id)
if self.officer_appointments_maxsize == None or len(appointments) < int(self.officer_appointments_maxsize or 0):
for appointment in appointments:
appointments = api.get_appointments(officer_id)
if appointments:
if self.officer_appointments_maxsize == None or len(appointments['items']) < int(self.officer_appointments_maxsize or 0):
for appointment in appointments['items']:
if processing.normalise_address(appointment['address']) not in network.addresses[network.addresses['n'] < network.n]['address'].unique():
network.link_type = "Appointment Address"
network.addresses = network.addresses.append({'address': processing.normalise_address(appointment['address']), 'n':network.n, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node}, ignore_index=True)
if appointment['appointed_to']['company_number'] not in network.company_ids[network.company_ids['n'] < network.n]['company_id'].unique():
network.link_type = "Appointment"
network.company_ids = network.company_ids.append({'company_id': appointment['appointed_to']['company_number'], 'n':network.n, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node}, ignore_index=True)
elif len(appointments) > int(self.officer_appointments_maxsize):
network.maxsize_entities = network.maxsize_entities.append({'node':officer_id,'type': 'Officer', 'maxsize_type': 'Appointments', 'size': len(appointments)}, ignore_index=True)
elif len(appointments['items']) > int(self.officer_appointments_maxsize):
network.maxsize_entities = network.maxsize_entities.append({'node':officer_id,'type': 'Officer', 'maxsize_type': 'Appointments', 'size': len(appointments['items'])}, ignore_index=True)
if self.get_officer_correspondance_address:
correspondance_address = api.get_correspondance_address(officer_id)
if correspondance_address:
@@ -297,38 +297,48 @@ class Network:
network.addresses = network.addresses.append({'address': processing.normalise_address(correspondance_address['items'][0]['address']), 'n':network.n, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node}, ignore_index=True)
if self.get_officer_duplicates:
duplicate_officers = api.get_duplicate_officers(officer_id)
if self.officer_duplicates_maxsize == None or len(duplicate_officers) < int(self.officer_duplicates_maxsize or 0):
for duplicate in duplicate_officers:
network.link_type = "Duplicate Officer"
if duplicate['links']['self'].split('/')[2] not in network.officer_ids[network.officer_ids['n'] < network.n]['officer_id'].unique():
network.officer_ids = network.officer_ids.append({'officer_id': duplicate['links']['self'].split('/')[2], 'name': duplicate['title'], 'n':network.n, 'link_type': network.link_type, 'node_type': network.node_type, 'node': network.node}, ignore_index=True)
elif len(duplicate_officers) > int(self.officer_duplicates_maxsize):
network.maxsize_entities = network.maxsize_entities.append({'node':officer_id,'type': 'Officer', 'maxsize_type': 'Duplicates', 'size': len(duplicate_officers)}, ignore_index=True)
network.addresses = network.addresses.drop_duplicates().reset_index(drop=True)
if duplicate_officers:
if self.officer_duplicates_maxsize == None or len(duplicate_officers) < int(self.officer_duplicates_maxsize or 0):
for duplicate in duplicate_officers:
network.link_type = "Duplicate Officer"
if duplicate['links']['self'].split('/')[2] not in network.officer_ids[network.officer_ids['n'] < network.n]['officer_id'].unique():
network.officer_ids = network.officer_ids.append({'officer_id': duplicate['links']['self'].split('/')[2], 'name': duplicate['title'], 'n':network.n, 'link_type': network.link_type, 'node_type': network.node_type, 'node': network.node}, ignore_index=True)
elif len(duplicate_officers) > int(self.officer_duplicates_maxsize):
network.maxsize_entities = network.maxsize_entities.append({'node':officer_id,'type': 'Officer', 'maxsize_type': 'Duplicates', 'size': len(duplicate_officers)}, ignore_index=True)
network.addresses = network.addresses.drop_duplicates().reset_index(drop=True)
network.officer_ids = network.officer_ids.drop_duplicates().reset_index(drop=True)
network.company_ids = network.company_ids.drop_duplicates().reset_index(drop=True)
def search_address(self, network, address):
def search_address(self, network, address, company_data):
network.node_type = "Address"
network.node = address
if self.get_companies_at_address:
companies = api.get_companies_at_address(address)
# database method here:
companies = {}
if company_data is not None:
companies['items'] = processing.get_companies_from_address_database(address, company_data)
else:
companies = api.get_companies_at_address(address)
if companies:
if self.companies_at_address_maxsize == None or len(companies['items']) < int(self.companies_at_address_maxsize or 0):
company_ids = []
for company in companies['items']:
network.link_type = "Company at Address"
if company['company_number'] not in network.company_ids[network.company_ids['n'] < network.n]['company_id'].unique():
network.company_ids = network.company_ids.append({'company_id': company['company_number'], 'n':network.n, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node}, ignore_index=True)
company_ids.append({'company_id': company['company_number'], 'n':network.n, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node})
network.company_ids = network.company_ids.append(company_ids, ignore_index=True)
elif len(companies['items']) > int(self.companies_at_address_maxsize):
network.maxsize_entities = network.maxsize_entities.append({'node':address,'type': 'Address', 'maxsize_type': 'Companies', 'size': len(companies['items'])},ignore_index=True)
if self.get_officers_at_address:
officers = api.get_officers_at_address(address)
if self.officers_at_address_maxsize == None or len(officers) < int(self.officers_at_address_maxsize or 0):
for officer in officers:
network.link_type = "Officer at Address"
if officer['links']['self'].split('/')[2] not in network.officer_ids[network.officer_ids['n'] < network.n]['officer_id'].unique():
network.officer_ids = network.officer_ids.append({'officer_id': officer['links']['self'].split('/')[2], 'name': officer['title'], 'n':network.n, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node}, ignore_index=True)
elif len(officers) > int(self.officers_at_address_maxsize):
network.maxsize_entities = network.maxsize_entities.append({'node':address,'type': 'Address', 'maxsize_type': 'Officers', 'size': len(officers)},ignore_index=True)
if officers:
if self.officers_at_address_maxsize == None or len(officers) < int(self.officers_at_address_maxsize or 0):
for officer in officers:
network.link_type = "Officer at Address"
if officer['links']['self'].split('/')[2] not in network.officer_ids[network.officer_ids['n'] < network.n]['officer_id'].unique():
network.officer_ids = network.officer_ids.append({'officer_id': officer['links']['self'].split('/')[2], 'name': officer['title'], 'n':network.n, 'link_type':network.link_type, 'node_type': network.node_type, 'node': network.node}, ignore_index=True)
elif len(officers) > int(self.officers_at_address_maxsize):
network.maxsize_entities = network.maxsize_entities.append({'node':address,'type': 'Address', 'maxsize_type': 'Officers', 'size': len(officers)},ignore_index=True)
network.officer_ids = network.officer_ids.drop_duplicates().reset_index(drop=True)
network.company_ids = network.company_ids.drop_duplicates().reset_index(drop=True)

View File

@@ -26,7 +26,7 @@ def locations_from_origin_path(path, network):
lon = last_company_address_row['lon'].item()
locations.append([float(lat),float(lon)])
elif node['type'] == 'Address':
address_row = network.addresses.loc[network.addresses['address'] == node['value']].iloc[:1]
address_row = network.addresses.loc[network.addresses['address'] == node['node']].iloc[:1]
lat = address_row['lat'].item()
lon = address_row['lon'].item()
locations.append([float(lat),float(lon)])
@@ -47,7 +47,7 @@ def html_table_generator(path):
headers_row += '<th>' + header + '</th>'
nodes = ""
for i, node in enumerate(path):
nodes += '<tr><td>' + node['node_index'] + '</td><td>' + str(node['value']) + '</td><td>' + str(node['n']) + '</td><td>' + str(node['link_type']) + '</td><td>' + str(node['links_to']) + '</td></tr>'
nodes += '<tr><td>' + node['node_index'] + '</td><td>' + str(node['node']) + '</td><td>' + str(node['hop']) + '</td><td>' + str(node['node_type']) + '</td><td>' + str(node['link']) + '</td></tr>'
table_html = table_style + '<table><tr>' + headers_row + '</tr>' + nodes + '</table>'
return table_html
@@ -103,7 +103,7 @@ def get_marker_data(network,address_trail, origin_trail, path_table):
address = row['address']
path = network.find_path(str(row['company_number']))
locations_from_origin = locations_from_origin_path(path, network)
message.value = company_name + "<hr>" + address
message.value = str(company_name) + "<hr>" + str(address)
icon = AwesomeIcon(
marker_color=marker_color
)

View File

@@ -12,6 +12,12 @@ def infer_postcode(address_string):
else:
return
def get_companies_from_address_database(address, company_data):
    """Return the companies in *company_data* registered at *address*.

    A row matches when its postcode appears in *address* AND at least one of
    its first two address lines appears in *address* (case-insensitive
    substring tests).

    Args:
        address: Address string to search for.
        company_data: DataFrame holding the company register. The column
            names used here (including the leading spaces in
            ' CompanyNumber' and ' RegAddress.AddressLine2') presumably
            mirror the header of the CSV it was loaded from -- confirm
            against the data file.

    Returns:
        A list of dicts, one per matching row, with the columns renamed to
        'company_name', 'company_number', etc., plus a nested
        'registered_office_address' dict. Empty list when nothing matches.
    """
    # Upper-case the query once instead of once per cell.
    haystack = address.upper()

    def _appears_in_address(value):
        # A missing or blank cell is never evidence of a match: str(nan) is
        # "nan", which would otherwise match any address containing "NAN",
        # and "" is a substring of every string.
        if pd.isna(value):
            return False
        needle = str(value).upper()
        return bool(needle.strip()) and needle in haystack

    line_1_hit = company_data['RegAddress.AddressLine1'].apply(_appears_in_address).astype(bool)
    line_2_hit = company_data[' RegAddress.AddressLine2'].apply(_appears_in_address).astype(bool)
    postcode_hit = company_data['RegAddress.PostCode'].apply(_appears_in_address).astype(bool)

    # Explicit parentheses: `&` binds tighter than `|`, so the unparenthesised
    # form `line_2 | line_1 & postcode` let a line-2 hit alone (e.g. a town
    # name) select companies at unrelated postcodes.
    companies = company_data[(line_1_hit | line_2_hit) & postcode_hit]

    companies = companies.rename(columns={
        'CompanyName': 'company_name',
        ' CompanyNumber': 'company_number',
        'CompanyStatus': 'company_status',
        'CompanyCategory': 'company_type',
        'RegAddress.AddressLine1': 'address_line_1',
        ' RegAddress.AddressLine2': 'address_line_2',
        'RegAddress.PostCode': 'postal_code',
        'RegAddress.PostTown': 'locality',
        'RegAddress.Country': 'country',
        'IncorporationDate': 'date_of_creation',
        'DissolutionDate': 'date_of_cessation',
    })

    records = companies.to_dict('records')
    for record in records:
        # Nested address dict built from the renamed flat columns.
        record['registered_office_address'] = {
            'address_line_1': record['address_line_1'],
            'address_line_2': record['address_line_2'],
            'locality': record['locality'],
            'postal_code': record['postal_code'],
            'country': record['country'],
        }
    return records
def load_company_data(company_data_filepath):
try:
company_data = pd.read_csv(company_data_filepath)
@@ -73,46 +79,52 @@ def process_address_changes(address_changes):
def build_address_history(company_id):
company_info = api.get_company(company_id)
company_info_subset = {k:company_info[k] for k in ("date_of_creation","date_of_cessation","registered_office_address") if k in company_info}
address_changes = api.get_address_changes(company_id)
address_keys = ('start_date','end_date','address')
if address_changes['items']:
address_changes = process_address_changes(address_changes)
addresses = []
entry = {}
entry["company_number"] = str(company_id)
entry["address"] = str(normalise_address(company_info_subset['registered_office_address']))
entry["start_date"] = str(address_changes['items'][0]['date'])
if 'date_of_cessation' in company_info_subset:
entry["end_date"] = str(company_info_subset['date_of_cessation'])
if company_info:
company_info_subset = {k:company_info[k] for k in ("date_of_creation","date_of_cessation","registered_office_address") if k in company_info}
address_changes = api.get_address_changes(company_id)
address_keys = ('start_date','end_date','address')
if address_changes:
if address_changes['items']:
address_changes = process_address_changes(address_changes)
addresses = []
entry = {}
entry["company_number"] = str(company_id)
entry["address"] = str(normalise_address(company_info_subset['registered_office_address']))
entry["start_date"] = str(address_changes['items'][0]['date'])
if 'date_of_cessation' in company_info_subset:
entry["end_date"] = str(company_info_subset['date_of_cessation'])
else:
entry["end_date"] = None
addresses.append(entry)
for i,change in enumerate(address_changes['items']):
entry = {}
entry["company_number"] = str(company_id)
if 'old_address' in change['description_values']:
entry["address"] = change['description_values']['old_address']
else:
entry["address"] = ""
if i+1 < len(address_changes['items']):
entry["start_date"] = str(address_changes['items'][i+1]['date'])
else:
entry["start_date"] = company_info_subset['date_of_creation']
entry["end_date"] = str(change['date'])
addresses.append(entry)
return addresses
else:
return []
else:
entry["end_date"] = None
addresses.append(entry)
for i,change in enumerate(address_changes['items']):
address_history = []
entry = {}
for k, key in enumerate(["date_of_creation","date_of_cessation","registered_office_address"]):
if key in company_info:
entry[address_keys[k]] = company_info[key]
else:
entry[address_keys[k]] = None
entry["company_number"] = str(company_id)
if 'old_address' in change['description_values']:
entry["address"] = change['description_values']['old_address']
else:
entry["address"] = ""
if i+1 < len(address_changes['items']):
entry["start_date"] = str(address_changes['items'][i+1]['date'])
else:
entry["start_date"] = company_info_subset['date_of_creation']
entry["end_date"] = str(change['date'])
addresses.append(entry)
return addresses
entry['address'] = normalise_address(entry['address'])
return [entry]
else:
address_history = []
entry = {}
for k, key in enumerate(["date_of_creation","date_of_cessation","registered_office_address"]):
if key in company_info:
entry[address_keys[k]] = company_info[key]
else:
entry[address_keys[k]] = None
entry["company_number"] = str(company_id)
entry['address'] = normalise_address(entry['address'])
return [entry]
return []
def normalise_address(address_dict):
address_list = []