Source code for send_the_raven.address

import pydantic
from typing import Optional, Iterable, Mapping, TypedDict
from .utils import generate_id, clean_string
from multiprocessing import Pool
from scourgify import normalize_address_record
from scourgify.exceptions import (
    UnParseableAddressError,
    AmbiguousAddressError,
    AddressNormalizationError,
)
from difflib import SequenceMatcher
from re import compile
from json import load
from importlib.resources import files
from usaddress import tag, RepeatedLabelError

STREET_ADDRESS_NUMBER_REGEX_PATTERN = compile(r"\d+")

DEFAULT_ADDRESS_MAPPING = {
    "full_address": "full_address",
    "street": "street",
    "address_line_2": "address_line_2",
    "city": "city",
    "state": "state",
    "zip_code": "zip_code",
    "id": "id",
}

STREET_KEYS = [
    "AddressNumber",
    "AddressNumberSuffix",
    "StreetNamePreDirectional",
    "StreetName",
    "StreetNamePostType",
]

UNIT_KEYS = ["OccupancyType", "OccupancyIdentifier"]


[docs]class Placemapper(TypedDict): """ Lookup table to perform state, city, or zip code lookups. Examples:: { "states": { "new york: "NY", "washington": "WA" }, "cities": { "boston": ["ma", "02354"], } } """ states: dict[str, str] cities: dict[str, list[str]]
with files("send_the_raven").joinpath( "states, cities, zipcodes.json" ).open() as dict_file: GEOGRAPHY_MAPPER: Placemapper = load(dict_file)
[docs]class Address(pydantic.BaseModel): """ Represents a US address. Args: full_address (str): full address. street (str): The street name and number. address_line_2 (str): The second line of the address. city (str): The city name. state (str): The state or province name. zip_code (str): The ZIP code or postal code. id (str): The address ID. If not present, one will be generated to keep track of the address. Attributes: full_address (str): full address. street (str): The street name and number. address_line_2 (str): The second line of the address. city (str): The city name. state (str): The state or province name. zip_code (str): The ZIP code or postal code. id (str): The address ID. """ full_address: Optional[str] = None street: Optional[str] = None address_line_2: Optional[str] = None city: Optional[str] = None state: Optional[str] = None zip_code: Optional[str] = None id: Optional[str] = generate_id() def __str__(self): """ Generates the full address from the address fields. Omitting empty or None value. """ not_empty_values = [ value for value in [ self.street, self.address_line_2, self.city, self.state, self.zip_code, ] if value is not None and value != "" ] return ", ".join(not_empty_values)
[docs] def normalize(self): """ Normalize address using https://github.com/GreenBuildingRegistry/usaddress-scourgify/ Warning: When address is not parseable, it will be set to None. No error would be thrown. """ normalized = None is_complete = True for field in ["street", "address_line_2", "city", "state", "zip_code"]: value = getattr(self, field) if value is None: is_complete = False continue cleaned_value = clean_string(value) setattr(self, field, cleaned_value) try: if is_complete: normalized = normalize_address_record( { "address_line_1": self.street or "", "address_line_2": self.address_line_2 or "", "city": self.city or "", "state": self.state or "", "postal_code": self.zip_code or "", } ) elif self.full_address is not None: normalized = normalize_address_record(self.full_address) else: normalized = normalize_address_record(self.__str__()) except ( UnParseableAddressError, AmbiguousAddressError, AddressNormalizationError, ): # if len(e.args) == 3: # normalized = e.args[2] # if any( # [ # value is not None and len(value) > 0 # for value in normalized.values() # ] # ): # self.extract_address() # return # else: # self.extract_address() # return self.extract_address() for field in ["street", "address_line_2", "city", "state", "zip_code"]: value = getattr(self, field) if value is None: continue setattr(self, field, value.upper()) if normalized is not None: self.street = normalized["address_line_1"] self.address_line_2 = normalized["address_line_2"] self.city = normalized["city"] self.state = normalized["state"] self.zip_code = normalized["postal_code"]
[docs] def extract_address(self): """ Manually extract address directly using usaddress library. """ try: result = tag( self.full_address if self.full_address is not None else self.__str__() ) except RepeatedLabelError as e: parsed = {} for value, key in e.parsed_string: if key in parsed: continue parsed[key] = value result = [parsed, "Street Address"] # extracting street street_words = [] for key in STREET_KEYS: if key in result[0]: street_words.append(result[0][key]) if len(street_words) > 0: self.street = " ".join(street_words) # extrating unit unit_words = [] for key in UNIT_KEYS: if key in result[0]: unit_words.append(result[0][key]) if len(unit_words) > 0: self.address_line_2 = " ".join(unit_words) # extracting city if "PlaceName" in result[0]: self.city = result[0]["PlaceName"] # extrating zipcode if "ZipCode" in result[0]: self.zip_code = result[0]["ZipCode"][:10] if "StateName" in result[0]: self.state = result[0]["StateName"][:10]
def __eq__(self, b) -> bool: """ Compare two addresses. Args: b (Address): Address object to compare. Example: >>> addr1 = Address(street="12 Main St", city="Boston", state="MA") >>> addr2 = Address(street="12 Main Street", city="Boston", state="MA") >>> assert addr1 == addr2 Returns: bool: True if :py:meth:`~send_the_raven.address.compare` greater than 0.73, False otherwise. """ return compare(self, b) > 0.73
[docs] def fill_in_city(self): """ Try to find the correct city by using difflib SequenceMatcher. If the ratio score is above 0.73, it will replace current city with it. Example: >>> addr = Address(city="San Fran") >>> addr.fill_in_city() >>> print(addr.city) San Francisco Warning: Expensive operation. """ if self.city is None: return highest = 0.0 current_city = "" for city in GEOGRAPHY_MAPPER["cities"]: score = SequenceMatcher(None, self.city.lower(), city).ratio() if score > highest: highest = score current_city = city if highest > 0.73: self.city = current_city return self.city
[docs] def fill_in_zipcode(self): """ Try to find the correct zipcode using city. USPS validation will have more successful rate when zip code is present even though it's the wrong zipcode but the city is the same. Example: >>> addr = Address(city="San Francisco") >>> print(addr.zipcode) None >>> addr.fill_in_zipcode() >>> print(addr.zipcode) 94105 """ if self.zip_code is not None: if len(self.zip_code) == 5: return self.zip_code if len(self.zip_code) == 4: self.zip_code = f"0{self.zip_code}" return self.zip_code if self.city is None: return self.zip_code if self.city.lower() in GEOGRAPHY_MAPPER["cities"]: self.zip_code = GEOGRAPHY_MAPPER["cities"][self.city.lower()][1] return self.zip_code
[docs] def fill_in_state(self): """ Try to convert state into its 2-letter abbreviation. If not found, use SequenceMatcher to find the closest one. USPS needs 2-letter abbreviation. Example: >>> addr = Address(state="California") >>> addr.fill_in_state() >>> print(addr.state) CA """ if self.state is None: return self.state if len(self.state) == 2: return self.state if self.state.lower() in GEOGRAPHY_MAPPER["states"]: self.state = GEOGRAPHY_MAPPER["states"][self.state.lower()] return self.state highest = 0.0 current_state = "" for state in GEOGRAPHY_MAPPER["states"]: score = SequenceMatcher(None, self.state.lower(), state).ratio() if score > highest: highest = score current_state = state if highest > 0.73: self.state = GEOGRAPHY_MAPPER["states"][current_state] return self.state
[docs]class Addresses: """ Represents a list of addresses. Args: addresses (iterable): iterable of any to construct Address object. field_mapping (dict): A dictionary of field names to be used when constructing the Address objects. The default is :py:const:`DEFAULT_ADDRESS_MAPPING`. The dict's keys are :py:class:`Address`'s attributes. The dict's values are the key in the input addresses. Attributes: addresses (list[Address]): A list of Address objects. field_mapping (dict): A dictionary of field names to be used when constructing the Address objects. Example: >>> data = [{'street address': '12 main st', 'city': boston}, {'street address': '14 main st', 'city': boston}] >>> field_mapper = {'street': 'street address'} >>> addresses = Addresses(data, field_mapper) """
[docs] def __init__( self, addresses: Iterable[Mapping[str, str] | Address], field_mapping: dict[str, str] = DEFAULT_ADDRESS_MAPPING, ): self.field_mapping = field_mapping for field in DEFAULT_ADDRESS_MAPPING: if field not in self.field_mapping: self.field_mapping[field] = field self.addresses = [ Address( **{k: address[v] for k, v in self.field_mapping.items() if v in address} ) if not isinstance(address, Address) else address for address in addresses ]
[docs] def normalize_parallel(self, number_of_proccesses: int | None = None): """ Normalize all addresses in parallel. Uses :py:func:`multiprocessing.Pool.map`. Args: number_of_proccesses (int): number of process """ with Pool(number_of_proccesses) as pool: pool.map(lambda x: x.normalize, self.addresses)
def __iter__(self): return iter(self.addresses) def __len__(self): return len(self.addresses) def __add__(self, b): self.addresses = self.addresses + b.addresses return self def __getitem__(self, index): return self.addresses[index] def __contains__(self, item) -> bool: if not isinstance(item, Address): raise TypeError("item must be an Address object") return item in self.addresses
def _process_street(street: str): """ Extract street into components for comparing purposes. Also extracting address number such as 151-153 boston ave into a list of numbers [151, 153] e.g 12 main st -> ([12], 'main st') 12-16 main st -> ([12, 14, 16], 'main st') 12 16 main st -> ([12, 14, 16], 'main st') Args: street (str): street string to extract returns: tuple[list[int], str]: street number and street name """ splitted = street.split(" ") street_number = splitted[0] if len(splitted) > 1 and splitted[1].isdigit(): street_number = f"{splitted[0]} {splitted[1]}" street_name = street.replace(street_number, "") street_numbers: list[str] = STREET_ADDRESS_NUMBER_REGEX_PATTERN.findall( street_number ) if len(street_numbers) == 2: try: already_sorted = sorted(street_numbers) upper = int(already_sorted[1]) lower = int(already_sorted[0]) if upper - lower > 2: for number in range(lower, upper + 2, 2): current_number = f"{number}" if current_number not in street_numbers: street_numbers.append(current_number) except ValueError: pass return (street_numbers, street_name)
[docs]def compare(a: Address, b: Address) -> float: """ This compares two Address objects and returns a similarity score between 0 and 1. The comparison takes into account differences in street number, street name, unit number, etc. Addresses should be :py:meth:`~send_the_raven.address.Address.normalize` prior to comparing for best results. return 0 if either a.street or b.street is None. doesn't check full_address Example: >>> addr1 = Address(street="12 Main St", city="Boston", state="MA") >>> addr2 = Address(street="12 Main Street", city="Boston", state="MA") >>> addr3 = Address(street="13 Main St", city="Boston", state="MA") >>> assert compare(addr1, addr3) > compare(addr2, addr3) The first comparison has a higher score than the second comparison because "12 Main St" is more similar to "12 Main Street" than "13 Main St". Args: a,b (Address): Address to compare returns: float: similarity between 0.0 and 1.0 See Also: :py:meth:`~send_the_raven.address.Address.__eq__` """ if a.street is None or b.street is None: return 0.0 # process street street_number1, street_name1 = _process_street(a.street) street_number2, street_name2 = _process_street(b.street) scores = 0 # compare street number if any([number in street_number2 for number in street_number1]): scores += 1 for current_element1, current_element2 in [ (street_name1, street_name2), (a.address_line_2, b.address_line_2), (a.city, b.city), (a.zip_code, b.zip_code), ]: if not (current_element1 is None or current_element2 is None): scores += SequenceMatcher(None, current_element1, current_element2).ratio() return scores / 5