Problem
The challenge is to create a flexible system for mapping job data from various external sources (such as Workable) into a standardised internal format. The system needs to handle:
- Diverse data structures from different job platforms
- Inconsistent field naming and value formats
- Conversion between different data types (e.g., strings to enums, timestamps to datetime objects)
- Mapping of varying terminology to standardised internal enums (e.g., job statuses, contract types), as the example below illustrates
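For example (values taken from the Workable sample used later in this post), the raw payload and the internal format disagree on both naming and types:

# Raw values as Workable sends them; internally we want the enum and
# datetime types defined in the Code section below.
raw = {"location": {"workplace_type": "on_site"}, "created_at": "2023年01月01日T00:00:00Z"}
# desired: contract_type -> a JobContractType member
# published_on_date -> a datetime object, not an ISO string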
Solution
I have implemented a configurable mapping system using Pydantic models (see the small example after this list), which allows for:
- Definition of source-specific data mappings
- Automatic data validation and type conversion
- Custom transformation functions for complex field mappings
- Extensibility to easily add new data sources or fields
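As a taste of the configuration style, each target field is declared either as a bare dotted path into the raw payload or as a dict mirroring the `FieldMapping` model defined in the code below (`TransformType` is also defined there):

# Two entry styles taken from the WORKABLE mapping below:
{
 "department": "department", # bare path: copy raw["department"] verbatim
 "salary_max": { # structured mapping with a type transform
 "path": "salary.salary_to",
 "transform": TransformType.TO_DECIMAL,
 },
}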
Code
from enum import Enum
from typing import Any
from datetime import datetime
from decimal import Decimal
from pydantic import BaseModel # Pydantic v1 (this code uses v1's __root__ and .json())
RawData = dict[str, Any] # alias for a raw payload from an external source
class TransformType(str, Enum):
TO_INT = "to_int"
TO_FLOAT = "to_float"
TO_STR = "to_str"
TO_DECIMAL = "to_decimal"
TO_DATETIME = "to_datetime"
TO_JOB_STATUS = "to_job_status"
TO_CONTRACT_TYPE = "to_contract_type"
TO_JOB_SOURCE = "to_job_source"
TO_EXPERIENCE_LEVEL = "to_experience_level"
TO_EDUCATION_LEVEL = "to_education_level"
class JobStatus(str, Enum):
SCHEDULED = "SCHEDULED"
ACTIVE = "ACTIVE"
CLOSED = "CLOSED"
ARCHIVED = "ARCHIVED"
class JobContractType(str, Enum):
FULL_TIME = "FULL_TIME"
PART_TIME = "PART_TIME"
CONTRACT = "CONTRACT"
class JobSource(str, Enum):
REGISTERED_ORG = "REGISTERED_ORG"
MANUALLY_ENTERED = "MANUALLY_ENTERED"
JOB_IMPORTER = "JOB_IMPORTER"
TEST_DATA = "TEST_DATA"
BREEZY_HR = "BREEZY_HR"
WORKABLE = "WORKABLE"
class JobExperienceLevel(str, Enum):
INTERN = "INTERN"
JUNIOR = "JUNIOR"
MID = "MID"
SENIOR = "SENIOR"
LEAD = "LEAD"
HEAD = "HEAD"
C_SUITE = "C_SUITE"
class JobEducationLevel(str, Enum):
NO_FORMAL_EDUCATION = "NO_FORMAL_EDUCATION"
GCSE = "GCSE"
 A_LEVELS = "A_LEVELS"
FOUNDATION_DEGREE = "FOUNDATION_DEGREE"
BACHELOR_DEGREE = "BACHELOR_DEGREE"
POSTGRADUATE_DIPLOMA = "POSTGRADUATE_DIPLOMA"
MASTER_DEGREE = "MASTER_DEGREE"
DOCTORATE_DEGREE = "DOCTORATE_DEGREE"
# Describes how one target field is populated from the raw payload.
class FieldMapping(BaseModel):
 path: str | None = None # dotted path into the raw data, e.g. "salary.salary_to"
 default: Any = None # fallback when no path is given or the value is missing
 transform: TransformType | None = None # optional type/enum conversion
# Pydantic v1 custom root models for validating a whole mapping config.
# (The demo below passes plain dicts around and never instantiates these.)
class ConnectorMapping(BaseModel):
 __root__: dict[str, str | FieldMapping]
class MappingConfig(BaseModel):
 __root__: dict[str, ConnectorMapping]
class JobMapping(BaseModel):
 # Per-connector mapping: keys are ExternalJob field names; values are either
 # a bare path string or a dict mirroring FieldMapping.
 WORKABLE: dict[str, Any] = {
"title": "title",
"description": {
"path": "description",
"default": "No description provided"
},
"salary_max": {
"path": "salary.salary_to",
"transform": TransformType.TO_DECIMAL
},
"salary_min": {
"path": "salary.salary_from",
"transform": TransformType.TO_DECIMAL
},
"salary_currency": "salary.salary_currency",
"contract_type": {
"path": "location.workplace_type",
"transform": TransformType.TO_CONTRACT_TYPE
},
"salary_negotiable": {
# Workable doesn't provide this information directly, so set a default
"default": False
},
"responsibilities": "description",
"department": "department",
"url": "url",
"application_url": "application_url",
"source": {
"default": "WORKABLE",
"transform": TransformType.TO_JOB_SOURCE
},
"sector": {
"path": "department", # Using department as a proxy for sector
"transform": TransformType.TO_STR
},
"location": "location.location_str",
"latlng": {
"path": "location",
"transform": TransformType.TO_STR
},
"scheduled_on_date": {
"path": "created_at",
"transform": TransformType.TO_DATETIME
},
"closing_on_date": {
"default": None # Workable doesn't provide this directly
},
"published_on_date": {
"path": "created_at", # Using created_at as a proxy for published date
"transform": TransformType.TO_DATETIME
},
"external_reference_id": "id",
}
class ExternalJob(BaseModel):
title: str
description: str | None = None
salary_max: Decimal
contract_type: JobContractType
salary_negotiable: bool = False
vacancy_count: int = 1
status: JobStatus = JobStatus.ACTIVE
responsibilities: str | None = None
experience_level: JobExperienceLevel | None = None
salary_min: Decimal | None = None
education_level: JobEducationLevel | None = None
department: str | None = None
url: str | None = None
source: JobSource
sector: str | None = None
location: str | None = None
latlng: str | None = None
scheduled_on_date: datetime | None = None
closing_on_date: datetime | None = None
published_on_date: datetime | None = None
external_reference_id: str | None = None
 @classmethod
 def from_raw(cls, raw_data: RawData, mapping: dict[str, Any]) -> "ExternalJob":
mapped_data = {}
for field, field_mapping in mapping.items():
if isinstance(field_mapping, str):
mapped_data[field] = cls.get_nested_value(raw_data, field_mapping)
elif isinstance(field_mapping, dict):
value = (
cls.get_nested_value(raw_data, field_mapping.get('path'))
if field_mapping.get('path')
else field_mapping.get('default')
)
if field_mapping.get('transform'):
value = cls.apply_transform(value, field_mapping['transform'])
mapped_data[field] = value
return cls(**mapped_data)
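
 # Worked example for one entry (values from the Workable sample below):
 # "salary_max": {"path": "salary.salary_to", "transform": TO_DECIMAL}
 # get_nested_value(raw, "salary.salary_to") -> 120000
 # apply_transform(120000, TransformType.TO_DECIMAL) -> Decimal('120000')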
@staticmethod
def get_nested_value(data: RawData, path: str | None) -> Any:
if not path:
return None
keys = path.split('.')
value = data
for key in keys:
if isinstance(value, dict):
value = value.get(key, {})
else:
return None
 return value if value != {} else None # {} is the sentinel for "path not found"
@staticmethod
def apply_transform(value: Any, transform: TransformType) -> Any | None:
if value is None:
return None
match transform:
case TransformType.TO_INT:
return int(value)
case TransformType.TO_FLOAT:
return float(value)
case TransformType.TO_STR:
if isinstance(value, dict) and 'latitude' in value and 'longitude' in value:
return f"{value.get('latitude', 0)},{value.get('longitude', 0)}"
return str(value)
case TransformType.TO_DECIMAL:
return Decimal(str(value))
 case TransformType.TO_DATETIME:
 # Strip a trailing 'Z' (UTC designator): fromisoformat() on
 # Python < 3.11 cannot parse it. Note this drops the timezone.
 return datetime.fromisoformat(value.rstrip('Z'))
case TransformType.TO_CONTRACT_TYPE:
contract_mapping = {
"full_time": JobContractType.FULL_TIME,
"part_time": JobContractType.PART_TIME,
"contract": JobContractType.CONTRACT
}
 # Unknown values fall back to FULL_TIME. Note Workable's "on_site" is
 # a workplace type, not a contract type, so it takes this fallback too.
 return contract_mapping.get(value.lower(), JobContractType.FULL_TIME)
case TransformType.TO_JOB_SOURCE:
return JobSource(value.upper())
 case _:
 return None # unhandled transforms silently yield None
Entry Point
if __name__ == "__main__":
workable_data = {
"id": "1234",
"title": "Software Engineer",
"full_title": "Senior Software Engineer - Backend",
"shortcode": "SE001",
"state": "published",
"department": "Engineering",
"url": "https://example.com/jobs/se001",
"application_url": "https://example.com/apply/se001",
"shortlink": "https://wrbl.in/se001",
"location": {
"location_str": "New York, NY, United States",
"country": "United States",
"country_code": "US",
"region": "New York",
"region_code": "NY",
"city": "New York",
"zip_code": "10001",
"telecommuting": False,
"workplace_type": "on_site"
},
"salary": {
"salary_from": 80000,
"salary_to": 120000,
"salary_currency": "USD"
},
"created_at": "2023年01月01日T00:00:00Z",
"department_hierarchy": [
{"id": 1, "name": "Technology"},
{"id": 2, "name": "Engineering"}
]
}
mapping = JobMapping()
workable_mapper = mapping.WORKABLE
job_model = ExternalJob.from_raw(workable_data, workable_mapper)
print(job_model.json(indent=2))
Result
{
"title": "Software Engineer",
"description": null,
"salary_max": 120000,
"contract_type": "FULL_TIME",
"salary_negotiable": false,
"vacancy_count": 1,
"status": "ACTIVE",
"responsibilities": null,
"experience_level": null,
"salary_min": 80000,
"education_level": null,
"department": "Engineering",
"url": "https://example.com/jobs/se001",
"source": "WORKABLE",
"sector": "Engineering",
"location": "New York, NY, United States",
"latlng": "{'location_str': 'New York, NY, United States', 'country': 'United States', 'country_code': 'US', 'region': 'New York', 'region_code': 'NY', 'city': 'New York', 'zip_code': '10001', 'telecommuting': False, 'workplace_type': 'on_site'}",
"scheduled_on_date": "2023年01月01日T00:00:00",
"closing_on_date": null,
"published_on_date": "2023年01月01日T00:00:00",
"external_reference_id": "1234"
}
Self Analysis
- `from_raw` and `get_nested_value` feel pretty complex - am I re-inventing the wheel? Surely there are methods I can call into via Pydantic.
- `apply_transform(...)` smells. It feels like I should probably have a custom `to_dict()` instance method, or a custom JSON encoder based on `json.JSONEncoder`.
- No docstrings.
- No error handling. It doesn't adequately deal with scenarios where expected fields are missing or arrive in unexpected formats.
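On the first two points, my suspicion is that Pydantic's own coercion already covers several of the transforms. A minimal sketch, assuming Pydantic v1 to match the `__root__` syntax above (`CoercionSketch` is a throwaway model reusing field types from `ExternalJob`):

# Annotations alone appear to handle TO_DECIMAL, TO_DATETIME and TO_JOB_SOURCE:
class CoercionSketch(BaseModel):
 salary_max: Decimal
 source: JobSource
 scheduled_on_date: datetime | None = None

sketch = CoercionSketch(
 salary_max="120000", # str -> Decimal('120000')
 source="WORKABLE", # str -> JobSource.WORKABLE (exact value match only)
 scheduled_on_date="2023年01月01日T00:00:00Z", # ISO string -> tz-aware datetime
)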
1 Answer
Are you sure it is not easier to just write a parser for the raw data? You will need to write some code for each API either way, so why choose a mapping config over normal code?
With a mapping config you also lose the ability to process the data with arbitrary logic, as the examples in the parser below show.
from typing import Optional # plus the imports, enums and RawData alias from the question

class ExternalJob(BaseModel):
title : str
source : JobSource
salary_max : Decimal
contract_type : JobContractType
vacancy_count : int = 1
salary_negotiable : bool = False
status : JobStatus = JobStatus.ACTIVE
experience_level : Optional[JobExperienceLevel] = None
salary_min : Optional[Decimal] = None
education_level : Optional[JobEducationLevel] = None
description : Optional[str] = None
responsibilities : Optional[str] = None
department : Optional[str] = None
url : Optional[str] = None
sector : Optional[str] = None
location : Optional[str] = None
latlng : Optional[str] = None
external_reference_id : Optional[str] = None
scheduled_on_date : Optional[datetime] = None
closing_on_date : Optional[datetime] = None
published_on_date : Optional[datetime] = None
class WorkableParser:
contract_mapping = {
"full_time" : JobContractType.FULL_TIME,
"part_time" : JobContractType.PART_TIME,
"contract" : JobContractType.CONTRACT
}
 @staticmethod
 def GBP_to_USD(gbp: float) -> float:
 return gbp * 1.34 # fixed illustrative exchange rate
@classmethod
def parse(cls, raw_data: RawData) -> ExternalJob:
job = {}
job["title"] = raw_data["title"]
job["source"] = JobSource.WORKABLE
job["salary_max"] = raw_data["salary"]["salary_to"]
job["contract_type"] = cls.contract_mapping.get(raw_data["location"]["workplace_type"].lower(), JobContractType.FULL_TIME)
# other fields . . .
# .
# .
# .
# What if you want to extract data from another field of the raw data
if raw_data["location"]["country_code"] == "UK":
job["salary_max"] = cls.GBP_to_USD(job["salary_max"])
# What if you want to do some more complex data extraction
full_title = raw_data["full_title"].lower()
if "senior" in full_title:
job["experience_level"] = JobExperienceLevel.SENIOR
elif "junior" in full_title:
job["experience_level"] = JobExperienceLevel.JUNIOR
elif "intern" in full_title:
job["experience_level"] = JobExperienceLevel.INTERN
return ExternalJob(**job)
if __name__ == "__main__":
workable_data = {
"id": "1234",
"title": "Software Engineer",
"full_title": "Senior Software Engineer - Backend",
"shortcode": "SE001",
"state": "published",
"department": "Engineering",
"url": "https://example.com/jobs/se001",
"application_url": "https://example.com/apply/se001",
"shortlink": "https://wrbl.in/se001",
"location": {
"location_str": "New York, NY, United States",
"country": "United States",
"country_code": "US",
"region": "New York",
"region_code": "NY",
"city": "New York",
"zip_code": "10001",
"telecommuting": False,
"workplace_type": "on_site"
},
"salary": {
"salary_from": 80000,
"salary_to": 120000,
"salary_currency": "USD"
},
"created_at": "2023年01月01日T00:00:00Z",
"department_hierarchy": [
{"id": 1, "name": "Technology"},
{"id": 2, "name": "Engineering"}
]
}
workable_job = WorkableParser.parse(workable_data)
 print(workable_job.model_dump_json(indent=2)) # Pydantic v2; the v1 equivalent is .json(indent=2)
Result
{
"title": "Software Engineer",
"source": "WORKABLE",
"salary_max": "120000",
"contract_type": "FULL_TIME",
"vacancy_count": 1,
"salary_negotiable": false,
"status": "ACTIVE",
"experience_level": "SENIOR",
"salary_min": null,
"education_level": null,
"description": null,
"responsibilities": null,
"department": null,
"url": null,
"sector": null,
"location": null,
"latlng": null,
"external_reference_id": null,
"scheduled_on_date": null,
"closing_on_date": null,
"published_on_date": null
}
Comment (Bob, Sep 26, 2024): This is a good point. If I am already going to the trouble of writing a mapper, why not just maintain a parser? Thanks for the new perspective. I'm marking this as the answer as it's what I am going to use.