5
\$\begingroup\$

Problem

The challenge is to create a flexible system for mapping job data from various external sources (such as Workable) into a standardised internal format. The system needs to handle:

  1. Diverse data structures from different job platforms
  2. Inconsistent field naming and value formats
  3. Conversion between different data types (e.g., strings to enums, timestamps to datetime objects)
  4. Mapping of varying terminology to standardized internal enums (e.g., job statuses, contract types)

Solution

I have tried to implement a configurable mapping system using Pydantic models, which allows for:

  1. Definition of source-specific data mappings
  2. Automatic data validation and type conversion
  3. Custom transformation functions for complex field mappings
  4. Extensibility to easily add new data sources or fields

Code

from enum import Enum
from typing import Any
from datetime import datetime
from decimal import Decimal
from pydantic import BaseModel, Field
# Alias for a raw job payload: a dict with string keys and arbitrary values.
RawData = dict[str, Any]
class TransformType(str, Enum):
    """Names of the value conversions a field mapping can request."""

    # Primitive conversions.
    TO_INT = "to_int"
    TO_FLOAT = "to_float"
    TO_STR = "to_str"
    TO_DECIMAL = "to_decimal"
    TO_DATETIME = "to_datetime"

    # Conversions targeting the internal job enums.
    TO_JOB_STATUS = "to_job_status"
    TO_CONTRACT_TYPE = "to_contract_type"
    TO_JOB_SOURCE = "to_job_source"
    TO_EXPERIENCE_LEVEL = "to_experience_level"
    TO_EDUCATION_LEVEL = "to_education_level"
class JobStatus(str, Enum):
    """Internal lifecycle states of a job."""

    SCHEDULED = "SCHEDULED"
    ACTIVE = "ACTIVE"
    CLOSED = "CLOSED"
    ARCHIVED = "ARCHIVED"
class JobContractType(str, Enum):
    """Internal contract types a job can be offered under."""

    FULL_TIME = "FULL_TIME"
    PART_TIME = "PART_TIME"
    CONTRACT = "CONTRACT"
class JobSource(str, Enum):
    """Origins a job record can be attributed to."""

    REGISTERED_ORG = "REGISTERED_ORG"
    MANUALLY_ENTERED = "MANUALLY_ENTERED"
    JOB_IMPORTER = "JOB_IMPORTER"
    TEST_DATA = "TEST_DATA"
    BREEZY_HR = "BREEZY_HR"
    WORKABLE = "WORKABLE"
class JobExperienceLevel(str, Enum):
    """Internal experience levels a job can be tagged with."""

    INTERN = "INTERN"
    JUNIOR = "JUNIOR"
    MID = "MID"
    SENIOR = "SENIOR"
    LEAD = "LEAD"
    HEAD = "HEAD"
    C_SUITE = "C_SUITE"
class JobEducationLevel(str, Enum):
    """Internal education levels a job can require."""

    NO_FORMAL_EDUCATION = "NO_FORMAL_EDUCATION"
    GCSE = "GCSE"
    # NOTE: the member name is singular while its value is plural ("A_LEVELS").
    A_LEVEL = "A_LEVELS"
    FOUNDATION_DEGREE = "FOUNDATION_DEGREE"
    BACHELOR_DEGREE = "BACHELOR_DEGREE"
    POSTGRADUATE_DIPLOMA = "POSTGRADUATE_DIPLOMA"
    MASTER_DEGREE = "MASTER_DEGREE"
    DOCTORATE_DEGREE = "DOCTORATE_DEGREE"
class FieldMapping(BaseModel):
    # Describes how a single target field is obtained from a raw payload.

    # Dotted path into the raw payload (e.g. "salary.salary_to"); optional.
    path: str | None = None

    # Fallback value for the field; optional.
    default: Any = None

    # Conversion to apply to the value; optional.
    transform: TransformType | None = None
class ConnectorMapping(BaseModel):
    # Root model for one connector: target field name -> either a plain
    # string or a full FieldMapping.
    __root__: dict[str, str | FieldMapping]
class MappingConfig(BaseModel):
    # Root model for the whole configuration: string key -> ConnectorMapping.
    __root__: dict[str, ConnectorMapping]
class JobMapping(BaseModel):
    # Declarative field mappings, one attribute per external job source.
    #
    # Every key is the name of a target field. A plain string value is a
    # dotted path into the raw payload; a dict value may carry "path",
    # "default" and "transform" entries.
    WORKABLE: dict[str, Any] = {
        # --- Core details -------------------------------------------------
        "title": "title",
        "description": {
            "path": "description",
            "default": "No description provided",
        },
        # --- Salary -------------------------------------------------------
        "salary_max": {
            "path": "salary.salary_to",
            "transform": TransformType.TO_DECIMAL,
        },
        "salary_min": {
            "path": "salary.salary_from",
            "transform": TransformType.TO_DECIMAL,
        },
        "salary_currency": "salary.salary_currency",
        "contract_type": {
            "path": "location.workplace_type",
            "transform": TransformType.TO_CONTRACT_TYPE,
        },
        # Workable has no direct equivalent, so a fixed default is supplied.
        "salary_negotiable": {
            "default": False,
        },
        # --- Descriptive fields -------------------------------------------
        "responsibilities": "description",
        "department": "department",
        "url": "url",
        "application_url": "application_url",
        "source": {
            "default": "WORKABLE",
            "transform": TransformType.TO_JOB_SOURCE,
        },
        # The department doubles as a stand-in for the sector.
        "sector": {
            "path": "department",
            "transform": TransformType.TO_STR,
        },
        # --- Location -----------------------------------------------------
        "location": "location.location_str",
        "latlng": {
            "path": "location",
            "transform": TransformType.TO_STR,
        },
        # --- Dates --------------------------------------------------------
        "scheduled_on_date": {
            "path": "created_at",
            "transform": TransformType.TO_DATETIME,
        },
        # Not provided directly by Workable.
        "closing_on_date": {
            "default": None,
        },
        # created_at stands in for the published date.
        "published_on_date": {
            "path": "created_at",
            "transform": TransformType.TO_DATETIME,
        },
        "external_reference_id": "id",
    }
class ExternalJob(BaseModel):
 title: str
 description: str | None = None
 salary_max: Decimal
 contract_type: JobContractType
 salary_negotiable: bool = False
 vacancy_count: int = 1
 status: JobStatus = JobStatus.ACTIVE
 responsibilities: str | None = None
 experience_level: JobExperienceLevel | None = None
 salary_min: Decimal | None = None
 education_level: JobEducationLevel | None = None
 department: str | None = None
 url: str | None = None
 source: JobSource
 sector: str | None = None
 location: str | None = None
 latlng: str | None = None
 scheduled_on_date: datetime | None = None
 closing_on_date: datetime | None = None
 published_on_date: datetime | None = None
 external_reference_id: str | None = None
 @classmethod
 def from_raw(cls, raw_data: dict[str, Any], mapping: dict[str, Any]):
 mapped_data = {}
 for field, field_mapping in mapping.items():
 if isinstance(field_mapping, str):
 mapped_data[field] = cls.get_nested_value(raw_data, field_mapping)
 elif isinstance(field_mapping, dict):
 value = (
 cls.get_nested_value(raw_data, field_mapping.get('path'))
 if field_mapping.get('path')
 else field_mapping.get('default')
 )
 if field_mapping.get('transform'):
 value = cls.apply_transform(value, field_mapping['transform'])
 mapped_data[field] = value
 return cls(**mapped_data)
 @staticmethod
 def get_nested_value(data: RawData, path: str | None) -> Any:
 if not path:
 return None
 keys = path.split('.')
 value = data
 for key in keys:
 if isinstance(value, dict):
 value = value.get(key, {})
 else:
 return None
 return value if value != {} else None
 @staticmethod
 def apply_transform(value: Any, transform: TransformType) -> Any | None:
 if value is None:
 return None
 match transform:
 case TransformType.TO_INT:
 return int(value)
 case TransformType.TO_FLOAT:
 return float(value)
 case TransformType.TO_STR:
 if isinstance(value, dict) and 'latitude' in value and 'longitude' in value:
 return f"{value.get('latitude', 0)},{value.get('longitude', 0)}"
 return str(value)
 case TransformType.TO_DECIMAL:
 return Decimal(str(value))
 case TransformType.TO_DATETIME:
 return datetime.fromisoformat(value.rstrip('Z'))
 case TransformType.TO_CONTRACT_TYPE:
 contract_mapping = {
 "full_time": JobContractType.FULL_TIME,
 "part_time": JobContractType.PART_TIME,
 "contract": JobContractType.CONTRACT
 }
 return contract_mapping.get(value.lower(), JobContractType.FULL_TIME)
 case TransformType.TO_JOB_SOURCE:
 return JobSource(value.upper())
 case _:
 return None

Entry Point

if __name__ == "__main__":
    # Sample Workable payload used to demonstrate the mapping end to end.
    workable_data = {
        "id": "1234",
        "title": "Software Engineer",
        "full_title": "Senior Software Engineer - Backend",
        "shortcode": "SE001",
        "state": "published",
        "department": "Engineering",
        "url": "https://example.com/jobs/se001",
        "application_url": "https://example.com/apply/se001",
        "shortlink": "https://wrbl.in/se001",
        "location": {
            "location_str": "New York, NY, United States",
            "country": "United States",
            "country_code": "US",
            "region": "New York",
            "region_code": "NY",
            "city": "New York",
            "zip_code": "10001",
            "telecommuting": False,
            "workplace_type": "on_site",
        },
        "salary": {
            "salary_from": 80000,
            "salary_to": 120000,
            "salary_currency": "USD",
        },
        # Must be ISO-8601: the TO_DATETIME transform feeds this value to
        # datetime.fromisoformat().
        "created_at": "2023-01-01T00:00:00Z",
        "department_hierarchy": [
            {"id": 1, "name": "Technology"},
            {"id": 2, "name": "Engineering"},
        ],
    }

    mapping = JobMapping()
    workable_mapper = mapping.WORKABLE
    job_model = ExternalJob.from_raw(workable_data, workable_mapper)
    print(job_model.json(indent=2))

Result

{
 "title": "Software Engineer",
 "description": null,
 "salary_max": 120000,
 "contract_type": "FULL_TIME",
 "salary_negotiable": false,
 "vacancy_count": 1,
 "status": "ACTIVE",
 "responsibilities": null,
 "experience_level": null,
 "salary_min": 80000,
 "education_level": null,
 "department": "Engineering",
 "url": "https://example.com/jobs/se001",
 "source": "WORKABLE",
 "sector": "Engineering",
 "location": "New York, NY, United States",
 "latlng": "{'location_str': 'New York, NY, United States', 'country': 'United States', 'country_code': 'US', 'region': 'New York', 'region_code': 'NY', 'city': 'New York', 'zip_code': '10001', 'telecommuting': False, 'workplace_type': 'on_site'}",
 "scheduled_on_date": "2023-01-01T00:00:00",
 "closing_on_date": null,
 "published_on_date": "2023-01-01T00:00:00",
 "external_reference_id": "1234"
}

Self Analysis

  • from_raw & get_nested_value feel pretty complex - am I re-inventing the wheel? Surely there are methods I can call into via Pydantic.
  • apply_transform(...) smells. It feels like I should probably have a custom to_dict() instance method or a custom JSON encoder based on json.JSONEncoder.
  • No Doc-strings
  • No error handling. It doesn't adequately deal with scenarios where expected fields are missing or in unexpected formats.

References:

asked Sep 25, 2024 at 5:59
\$\endgroup\$

1 Answer 1

7
\$\begingroup\$

Are you sure it is not easier to just write a parser to parse the raw data? You will need to write some code for each api either way; why choose to write a mapping config and not just normal code?

You also lose the ability to process the data with arbitrary logic, as the example below shows.

class ExternalJob(BaseModel):
    # Standardised internal job record produced by the source parsers.

    # Fields without a default.
    title: str
    source: JobSource
    salary_max: Decimal
    contract_type: JobContractType

    # Fields with a concrete default.
    vacancy_count: int = 1
    salary_negotiable: bool = False
    status: JobStatus = JobStatus.ACTIVE

    # Optional fields, defaulting to None.
    experience_level: Optional[JobExperienceLevel] = None
    salary_min: Optional[Decimal] = None
    education_level: Optional[JobEducationLevel] = None
    description: Optional[str] = None
    responsibilities: Optional[str] = None
    department: Optional[str] = None
    url: Optional[str] = None
    sector: Optional[str] = None
    location: Optional[str] = None
    latlng: Optional[str] = None
    external_reference_id: Optional[str] = None
    scheduled_on_date: Optional[datetime] = None
    closing_on_date: Optional[datetime] = None
    published_on_date: Optional[datetime] = None
class WorkableParser:
    """Converts a raw Workable job payload into an ``ExternalJob``."""

    # Raw value -> internal contract type. parse() falls back to FULL_TIME
    # for values not listed here.
    contract_mapping = {
        "full_time": JobContractType.FULL_TIME,
        "part_time": JobContractType.PART_TIME,
        "contract": JobContractType.CONTRACT,
    }

    # (keyword, level) pairs checked in order against the lower-cased
    # full title; the first keyword found wins.
    _experience_keywords = (
        ("senior", JobExperienceLevel.SENIOR),
        ("junior", JobExperienceLevel.JUNIOR),
        ("intern", JobExperienceLevel.INTERN),
    )

    @staticmethod
    def GBP_to_USD(gbp: float, rate: float = 1.34) -> float:
        """Convert an amount in GBP to USD.

        Args:
            gbp: Amount in pounds sterling.
            rate: USD per GBP; defaults to the fixed illustrative rate 1.34.

        Returns:
            The converted amount.
        """
        return gbp * rate

    @classmethod
    def parse(cls, raw_data: RawData) -> ExternalJob:
        """Build an ``ExternalJob`` from a raw Workable payload.

        Args:
            raw_data: Raw job payload. ``title``, ``salary.salary_to``,
                ``location.workplace_type`` and ``location.country_code``
                are read with direct indexing.

        Returns:
            The populated ``ExternalJob``.

        Raises:
            KeyError: If one of the directly indexed keys is missing.
        """
        job = {}
        job["title"] = raw_data["title"]
        job["source"] = JobSource.WORKABLE
        job["salary_max"] = raw_data["salary"]["salary_to"]
        job["contract_type"] = cls.contract_mapping.get(
            raw_data["location"]["workplace_type"].lower(),
            JobContractType.FULL_TIME,
        )
        # other fields . . .

        # Example of deriving a value from another part of the raw data.
        # "GB" is the ISO 3166-1 alpha-2 code for the United Kingdom; "UK" is
        # still accepted for backward compatibility.
        # NOTE(review): presumably Workable emits ISO codes (the sample uses
        # "US") - confirm against the API.
        if raw_data["location"]["country_code"] in ("GB", "UK"):
            job["salary_max"] = cls.GBP_to_USD(job["salary_max"])

        # Example of more complex extraction: infer the experience level from
        # the full title. It only feeds an optional field, so a missing
        # full_title leaves the level unset instead of failing.
        full_title = (raw_data.get("full_title") or "").lower()
        for keyword, level in cls._experience_keywords:
            if keyword in full_title:
                job["experience_level"] = level
                break
        return ExternalJob(**job)
if __name__ == "__main__":
    # Sample Workable payload used to demonstrate the parser.
    workable_data = {
        "id": "1234",
        "title": "Software Engineer",
        "full_title": "Senior Software Engineer - Backend",
        "shortcode": "SE001",
        "state": "published",
        "department": "Engineering",
        "url": "https://example.com/jobs/se001",
        "application_url": "https://example.com/apply/se001",
        "shortlink": "https://wrbl.in/se001",
        "location": {
            "location_str": "New York, NY, United States",
            "country": "United States",
            "country_code": "US",
            "region": "New York",
            "region_code": "NY",
            "city": "New York",
            "zip_code": "10001",
            "telecommuting": False,
            "workplace_type": "on_site",
        },
        "salary": {
            "salary_from": 80000,
            "salary_to": 120000,
            "salary_currency": "USD",
        },
        # ISO-8601 timestamp.
        "created_at": "2023-01-01T00:00:00Z",
        "department_hierarchy": [
            {"id": 1, "name": "Technology"},
            {"id": 2, "name": "Engineering"},
        ],
    }

    workable_job = WorkableParser.parse(workable_data)
    print(workable_job.model_dump_json(indent=2))
{
 "title": "Software Engineer",
 "source": "WORKABLE",
 "salary_max": "120000",
 "contract_type": "FULL_TIME",
 "vacancy_count": 1,
 "salary_negotiable": false,
 "status": "ACTIVE",
 "experience_level": "SENIOR",
 "salary_min": null,
 "education_level": null,
 "description": null,
 "responsibilities": null,
 "department": null,
 "url": null,
 "sector": null,
 "location": null,
 "latlng": null,
 "external_reference_id": null,
 "scheduled_on_date": null,
 "closing_on_date": null,
 "published_on_date": null
}
toolic
15.2k5 gold badges29 silver badges213 bronze badges
answered Sep 25, 2024 at 10:34
\$\endgroup\$
1
  • 1
    \$\begingroup\$ This is a good point. If I am already going to the trouble of writing a mapper, why not just maintain a parser? Thanks for the new perspective. I'm marking this as the answer as it's what I am going to use. \$\endgroup\$ Commented Sep 26, 2024 at 0:14

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.