Skip to content

Commit

Permalink
Merge pull request #48 from uktrade/feature/refactor-sources
Browse files Browse the repository at this point in the history
Feature/refactor sources
  • Loading branch information
leo-mazzone authored Jan 23, 2025
2 parents 15361e1 + 6fd7a94 commit f8b0dcc
Show file tree
Hide file tree
Showing 51 changed files with 1,890 additions and 1,641 deletions.
50 changes: 0 additions & 50 deletions .github/workflows/docs.yml

This file was deleted.

2 changes: 1 addition & 1 deletion justfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ format:

# Scan for secrets
scan:
trufflehog git file://. --only-verified
bash -c "docker run -v "$(pwd):/repo" -i --rm trufflesecurity/trufflehog:latest git file:///repo"

# Run Python tests
test:
Expand Down
53 changes: 0 additions & 53 deletions sample.datasets.toml

This file was deleted.

5 changes: 3 additions & 2 deletions src/matchbox/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
from dotenv import find_dotenv, load_dotenv

from matchbox.client.helpers.cleaner import process
from matchbox.client.helpers.selector import query
from matchbox.client.helpers.index import index
from matchbox.client.helpers.selector import match, query
from matchbox.client.models.models import make_model

__all__ = ("make_model", "to_clusters", "process", "query")
__all__ = ("make_model", "process", "query", "match", "index")

dotenv_path = find_dotenv(usecwd=True)
load_dotenv(dotenv_path)
4 changes: 1 addition & 3 deletions src/matchbox/client/_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,7 @@


def url(path: str) -> str:
"""
Return path prefixed by API root, determined from environment
"""
"""Return path prefixed by API root, determined from environment"""
api_root = getenv("API__ROOT")
if api_root is None:
raise RuntimeError("API__ROOT needs to be defined in the environment")
Expand Down
59 changes: 0 additions & 59 deletions src/matchbox/client/admin.py

This file was deleted.

31 changes: 8 additions & 23 deletions src/matchbox/client/clean/lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,7 @@ def company_name(
column_secondary: str = None,
stopwords: str = cu.STOPWORDS,
) -> DataFrame:
"""
Lower case, remove punctuation & tokenise the company name into an array.
"""Lower case, remove punctuation & tokenise the company name into an array.
Extract tokens into: 'unusual' and 'stopwords'. Dedupe. Sort alphabetically.
Untokenise the unusual words back to a string.
Expand All @@ -26,7 +25,6 @@ def company_name(
Returns:
dataframe: the same as went in, but cleaned
"""

remove_stopwords = partial(steps.remove_stopwords, stopwords=stopwords)

clean_primary = cu.cleaning_function(
Expand All @@ -48,16 +46,14 @@ def company_name(


def company_number(df: DataFrame, column: str) -> DataFrame:
"""
Remove non-numbers, and then leading zeroes.
"""Remove non-numbers, and then leading zeroes.
Args:
df: a dataframe
column: a column containing a company number
Returns:
dataframe: the same as went in, but cleaned
"""

clean_number = cu.cleaning_function(steps.remove_notnumbers_leadingzeroes)

df = clean_number(df, column)
Expand All @@ -66,8 +62,7 @@ def company_number(df: DataFrame, column: str) -> DataFrame:


def postcode(df: DataFrame, column: str) -> DataFrame:
"""
Removes all punctuation, converts to upper, removes all spaces.
"""Removes all punctuation, converts to upper, removes all spaces.
Args:
df: a dataframe
Expand All @@ -76,7 +71,6 @@ def postcode(df: DataFrame, column: str) -> DataFrame:
dataframe: the same as went in, but cleaned
"""

clean_postcode = cu.cleaning_function(
steps.punctuation_to_spaces, steps.to_upper, steps.remove_whitespace
)
Expand All @@ -87,16 +81,14 @@ def postcode(df: DataFrame, column: str) -> DataFrame:


def postcode_to_area(df: DataFrame, column: str) -> DataFrame:
"""
Extracts postcode area from a postcode
"""Extracts postcode area from a postcode
Args:
df: a dataframe
column: a column containing a postcode
Returns:
dataframe: the same as went in, but cleaned
"""

extract_area = cu.cleaning_function(steps.get_postcode_area)

df = extract_area(df, column)
Expand All @@ -107,8 +99,7 @@ def postcode_to_area(df: DataFrame, column: str) -> DataFrame:
def extract_company_number_to_new(
df: DataFrame, column: str, new_column: str
) -> DataFrame:
"""
Detects the Companies House CRN in a column and moves it to a new column.
"""Detects the Companies House CRN in a column and moves it to a new column.
Args:
df: a dataframe
Expand All @@ -117,7 +108,6 @@ def extract_company_number_to_new(
Returns:
dataframe: the same as went in with a new column for CRNs
"""

clean_crn = cu.cleaning_function(
steps.clean_punctuation_except_hyphens,
steps.to_upper,
Expand All @@ -134,8 +124,7 @@ def extract_company_number_to_new(
def extract_duns_number_to_new(
df: DataFrame, column: str, new_column: str
) -> DataFrame:
"""
Detects the Dun & Bradstreet DUNS number in a column and moves it to
"""Detects the Dun & Bradstreet DUNS number in a column and moves it to
a new column.
Args:
Expand All @@ -145,7 +134,6 @@ def extract_duns_number_to_new(
Returns:
dataframe: the same as went in with a new column for DUNS numbers
"""

clean_duns = cu.cleaning_function(
steps.clean_punctuation_except_hyphens, steps.to_upper, steps.filter_duns_number
)
Expand All @@ -160,8 +148,7 @@ def extract_duns_number_to_new(
def extract_cdms_number_to_new(
df: DataFrame, column: str, new_column: str
) -> DataFrame:
"""
Detects the CDMS number in a column and moves it to a new column.
"""Detects the CDMS number in a column and moves it to a new column.
Args:
df: a dataframe
Expand All @@ -170,7 +157,6 @@ def extract_cdms_number_to_new(
Returns:
dataframe: the same as went in with a new column for CDMS numbers
"""

clean_cdms = cu.cleaning_function(
steps.clean_punctuation_except_hyphens, steps.to_upper, steps.filter_cdms_number
)
Expand All @@ -183,8 +169,7 @@ def extract_cdms_number_to_new(


def drop(df: DataFrame, column: str) -> DataFrame:
"""
Drops the column from the dataframe.
"""Drops the column from the dataframe.
Args:
df: a dataframe
Expand Down
Loading

0 comments on commit f8b0dcc

Please sign in to comment.