Problem
The challenge is to create a flexible system for mapping job data from various external sources (such as Workable) into a standardised internal format. The system needs to handle:
- Diverse data structures from different job platforms
- Inconsistent field naming and value formats
- Conversion between different data types (e.g., strings to enums, timestamps to datetime objects)
- Mapping of varying terminology to standardised internal enums (e.g., job statuses, contract types), as the example below illustrates
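For example (values taken from the Workable sample used later in this post), the raw payload and the internal format disagree on both naming and types:

# Raw values as Workable sends them; internally we want the enum and
# datetime types defined in the Code section below.
raw = {"location": {"workplace_type": "on_site"}, "created_at": "2023年01月01日T00:00:00Z"}
# desired: contract_type -> a JobContractType member
# published_on_date -> a datetime object, not an ISO string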
Solution
I have implemented a configurable mapping system using Pydantic models (see the small example after this list), which allows for:
- Definition of source-specific data mappings
- Automatic data validation and type conversion
- Custom transformation functions for complex field mappings
- Extensibility to easily add new data sources or fields
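As a taste of the configuration style, each target field is declared either as a bare dotted path into the raw payload or as a dict mirroring the `FieldMapping` model defined in the code below (`TransformType` is also defined there):

# Two entry styles taken from the WORKABLE mapping below:
{
 "department": "department", # bare path: copy raw["department"] verbatim
 "salary_max": { # structured mapping with a type transform
 "path": "salary.salary_to",
 "transform": TransformType.TO_DECIMAL,
 },
}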
Code
from enum import Enum
from typing import Any
from datetime import datetime
from decimal import Decimal
from pydantic import BaseModel # Pydantic v1 (this code uses v1's __root__ and .json())
RawData = dict[str, Any] # alias for a raw payload from an external source
class TransformType(str, Enum):
TO_INT = "to_int"
TO_FLOAT = "to_float"
TO_STR = "to_str"
TO_DECIMAL = "to_decimal"
TO_DATETIME = "to_datetime"
TO_JOB_STATUS = "to_job_status"
TO_CONTRACT_TYPE = "to_contract_type"
TO_JOB_SOURCE = "to_job_source"
TO_EXPERIENCE_LEVEL = "to_experience_level"
TO_EDUCATION_LEVEL = "to_education_level"
class JobStatus(str, Enum):
SCHEDULED = "SCHEDULED"
ACTIVE = "ACTIVE"
CLOSED = "CLOSED"
ARCHIVED = "ARCHIVED"
class JobContractType(str, Enum):
FULL_TIME = "FULL_TIME"
PART_TIME = "PART_TIME"
CONTRACT = "CONTRACT"
class JobSource(str, Enum):
REGISTERED_ORG = "REGISTERED_ORG"
MANUALLY_ENTERED = "MANUALLY_ENTERED"
JOB_IMPORTER = "JOB_IMPORTER"
TEST_DATA = "TEST_DATA"
BREEZY_HR = "BREEZY_HR"
WORKABLE = "WORKABLE"
class JobExperienceLevel(str, Enum):
INTERN = "INTERN"
JUNIOR = "JUNIOR"
MID = "MID"
SENIOR = "SENIOR"
LEAD = "LEAD"
HEAD = "HEAD"
C_SUITE = "C_SUITE"
class JobEducationLevel(str, Enum):
NO_FORMAL_EDUCATION = "NO_FORMAL_EDUCATION"
GCSE = "GCSE"
 A_LEVELS = "A_LEVELS"
FOUNDATION_DEGREE = "FOUNDATION_DEGREE"
BACHELOR_DEGREE = "BACHELOR_DEGREE"
POSTGRADUATE_DIPLOMA = "POSTGRADUATE_DIPLOMA"
MASTER_DEGREE = "MASTER_DEGREE"
DOCTORATE_DEGREE = "DOCTORATE_DEGREE"
# Describes how one target field is populated from the raw payload.
class FieldMapping(BaseModel):
 path: str | None = None # dotted path into the raw data, e.g. "salary.salary_to"
 default: Any = None # fallback when no path is given or the value is missing
 transform: TransformType | None = None # optional type/enum conversion
# Pydantic v1 custom root models for validating a whole mapping config.
# (The demo below passes plain dicts around and never instantiates these.)
class ConnectorMapping(BaseModel):
 __root__: dict[str, str | FieldMapping]
class MappingConfig(BaseModel):
 __root__: dict[str, ConnectorMapping]
class JobMapping(BaseModel):
 # Per-connector mapping: keys are ExternalJob field names; values are either
 # a bare path string or a dict mirroring FieldMapping.
 WORKABLE: dict[str, Any] = {
"title": "title",
"description": {
"path": "description",
"default": "No description provided"
},
"salary_max": {
"path": "salary.salary_to",
"transform": TransformType.TO_DECIMAL
},
"salary_min": {
"path": "salary.salary_from",
"transform": TransformType.TO_DECIMAL
},
"salary_currency": "salary.salary_currency",
"contract_type": {
"path": "location.workplace_type",
"transform": TransformType.TO_CONTRACT_TYPE
},
"salary_negotiable": {
# Workable doesn't provide this information directly, so set a default
"default": False
},
"responsibilities": "description",
"department": "department",
"url": "url",
"application_url": "application_url",
"source": {
"default": "WORKABLE",
"transform": TransformType.TO_JOB_SOURCE
},
"sector": {
"path": "department", # Using department as a proxy for sector
"transform": TransformType.TO_STR
},
"location": "location.location_str",
"latlng": {
"path": "location",
"transform": TransformType.TO_STR
},
"scheduled_on_date": {
"path": "created_at",
"transform": TransformType.TO_DATETIME
},
"closing_on_date": {
"default": None # Workable doesn't provide this directly
},
"published_on_date": {
"path": "created_at", # Using created_at as a proxy for published date
"transform": TransformType.TO_DATETIME
},
"external_reference_id": "id",
}
class ExternalJob(BaseModel):
title: str
description: str | None = None
salary_max: Decimal
contract_type: JobContractType
salary_negotiable: bool = False
vacancy_count: int = 1
status: JobStatus = JobStatus.ACTIVE
responsibilities: str | None = None
experience_level: JobExperienceLevel | None = None
salary_min: Decimal | None = None
education_level: JobEducationLevel | None = None
department: str | None = None
url: str | None = None
source: JobSource
sector: str | None = None
location: str | None = None
latlng: str | None = None
scheduled_on_date: datetime | None = None
closing_on_date: datetime | None = None
published_on_date: datetime | None = None
external_reference_id: str | None = None
 @classmethod
 def from_raw(cls, raw_data: RawData, mapping: dict[str, Any]) -> "ExternalJob":
mapped_data = {}
for field, field_mapping in mapping.items():
if isinstance(field_mapping, str):
mapped_data[field] = cls.get_nested_value(raw_data, field_mapping)
elif isinstance(field_mapping, dict):
value = (
cls.get_nested_value(raw_data, field_mapping.get('path'))
if field_mapping.get('path')
else field_mapping.get('default')
)
if field_mapping.get('transform'):
value = cls.apply_transform(value, field_mapping['transform'])
mapped_data[field] = value
return cls(**mapped_data)
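
 # Worked example for one entry (values from the Workable sample below):
 # "salary_max": {"path": "salary.salary_to", "transform": TO_DECIMAL}
 # get_nested_value(raw, "salary.salary_to") -> 120000
 # apply_transform(120000, TransformType.TO_DECIMAL) -> Decimal('120000')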
@staticmethod
def get_nested_value(data: RawData, path: str | None) -> Any:
if not path:
return None
keys = path.split('.')
value = data
for key in keys:
if isinstance(value, dict):
value = value.get(key, {})
else:
return None
 return value if value != {} else None # {} is the sentinel for "path not found"
@staticmethod
def apply_transform(value: Any, transform: TransformType) -> Any | None:
if value is None:
return None
match transform:
case TransformType.TO_INT:
return int(value)
case TransformType.TO_FLOAT:
return float(value)
case TransformType.TO_STR:
if isinstance(value, dict) and 'latitude' in value and 'longitude' in value:
return f"{value.get('latitude', 0)},{value.get('longitude', 0)}"
return str(value)
case TransformType.TO_DECIMAL:
return Decimal(str(value))
 case TransformType.TO_DATETIME:
 # Strip a trailing 'Z' (UTC designator): fromisoformat() on
 # Python < 3.11 cannot parse it. Note this drops the timezone.
 return datetime.fromisoformat(value.rstrip('Z'))
case TransformType.TO_CONTRACT_TYPE:
contract_mapping = {
"full_time": JobContractType.FULL_TIME,
"part_time": JobContractType.PART_TIME,
"contract": JobContractType.CONTRACT
}
 # Unknown values fall back to FULL_TIME. Note Workable's "on_site" is
 # a workplace type, not a contract type, so it takes this fallback too.
 return contract_mapping.get(value.lower(), JobContractType.FULL_TIME)
case TransformType.TO_JOB_SOURCE:
return JobSource(value.upper())
 case _:
 return None # unhandled transforms silently yield None
Entry Point
if __name__ == "__main__":
workable_data = {
"id": "1234",
"title": "Software Engineer",
"full_title": "Senior Software Engineer - Backend",
"shortcode": "SE001",
"state": "published",
"department": "Engineering",
"url": "https://example.com/jobs/se001",
"application_url": "https://example.com/apply/se001",
"shortlink": "https://wrbl.in/se001",
"location": {
"location_str": "New York, NY, United States",
"country": "United States",
"country_code": "US",
"region": "New York",
"region_code": "NY",
"city": "New York",
"zip_code": "10001",
"telecommuting": False,
"workplace_type": "on_site"
},
"salary": {
"salary_from": 80000,
"salary_to": 120000,
"salary_currency": "USD"
},
"created_at": "2023年01月01日T00:00:00Z",
"department_hierarchy": [
{"id": 1, "name": "Technology"},
{"id": 2, "name": "Engineering"}
]
}
mapping = JobMapping()
workable_mapper = mapping.WORKABLE
job_model = ExternalJob.from_raw(workable_data, workable_mapper)
print(job_model.json(indent=2))
Result
{
"title": "Software Engineer",
"description": null,
"salary_max": 120000,
"contract_type": "FULL_TIME",
"salary_negotiable": false,
"vacancy_count": 1,
"status": "ACTIVE",
"responsibilities": null,
"experience_level": null,
"salary_min": 80000,
"education_level": null,
"department": "Engineering",
"url": "https://example.com/jobs/se001",
"source": "WORKABLE",
"sector": "Engineering",
"location": "New York, NY, United States",
"latlng": "{'location_str': 'New York, NY, United States', 'country': 'United States', 'country_code': 'US', 'region': 'New York', 'region_code': 'NY', 'city': 'New York', 'zip_code': '10001', 'telecommuting': False, 'workplace_type': 'on_site'}",
"scheduled_on_date": "2023年01月01日T00:00:00",
"closing_on_date": null,
"published_on_date": "2023年01月01日T00:00:00",
"external_reference_id": "1234"
}
Self Analysis
- `from_raw` and `get_nested_value` feel pretty complex - am I re-inventing the wheel? Surely there are methods I can call into via Pydantic.
- `apply_transform(...)` smells. It feels like I should probably have a custom `to_dict()` instance method, or a custom JSON encoder based on `json.JSONEncoder`.
- No docstrings.
- No error handling. It doesn't adequately deal with scenarios where expected fields are missing or arrive in unexpected formats.
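On the first two points, my suspicion is that Pydantic's own coercion already covers several of the transforms. A minimal sketch, assuming Pydantic v1 to match the `__root__` syntax above (`CoercionSketch` is a throwaway model reusing field types from `ExternalJob`):

# Annotations alone appear to handle TO_DECIMAL, TO_DATETIME and TO_JOB_SOURCE:
class CoercionSketch(BaseModel):
 salary_max: Decimal
 source: JobSource
 scheduled_on_date: datetime | None = None

sketch = CoercionSketch(
 salary_max="120000", # str -> Decimal('120000')
 source="WORKABLE", # str -> JobSource.WORKABLE (exact value match only)
 scheduled_on_date="2023年01月01日T00:00:00Z", # ISO string -> tz-aware datetime
)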
1 Answer
Are you sure it is not easier to just write a parser for the raw data? You will need to write some code for each API either way, so why choose a mapping config over normal code?
With a mapping config you also lose the ability to process the data with arbitrary logic, as the examples in the parser below show.
from typing import Optional # plus the imports, enums and RawData alias from the question

class ExternalJob(BaseModel):
title : str
source : JobSource
salary_max : Decimal
contract_type : JobContractType
vacancy_count : int = 1
salary_negotiable : bool = False
status : JobStatus = JobStatus.ACTIVE
experience_level : Optional[JobExperienceLevel] = None
salary_min : Optional[Decimal] = None
education_level : Optional[JobEducationLevel] = None
description : Optional[str] = None
responsibilities : Optional[str] = None
department : Optional[str] = None
url : Optional[str] = None
sector : Optional[str] = None
location : Optional[str] = None
latlng : Optional[str] = None
external_reference_id : Optional[str] = None
scheduled_on_date : Optional[datetime] = None
closing_on_date : Optional[datetime] = None
published_on_date : Optional[datetime] = None
class WorkableParser:
contract_mapping = {
"full_time" : JobContractType.FULL_TIME,
"part_time" : JobContractType.PART_TIME,
"contract" : JobContractType.CONTRACT
}
 @staticmethod
 def GBP_to_USD(gbp: float) -> float:
 return gbp * 1.34 # fixed illustrative exchange rate
@classmethod
def parse(cls, raw_data: RawData) -> ExternalJob:
job = {}
job["title"] = raw_data["title"]
job["source"] = JobSource.WORKABLE
job["salary_max"] = raw_data["salary"]["salary_to"]
job["contract_type"] = cls.contract_mapping.get(raw_data["location"]["workplace_type"].lower(), JobContractType.FULL_TIME)
# other fields . . .
# .
# .
# .
# What if you want to extract data from another field of the raw data
if raw_data["location"]["country_code"] == "UK":
job["salary_max"] = cls.GBP_to_USD(job["salary_max"])
# What if you want to do some more complex data extraction
full_title = raw_data["full_title"].lower()
if "senior" in full_title:
job["experience_level"] = JobExperienceLevel.SENIOR
elif "junior" in full_title:
job["experience_level"] = JobExperienceLevel.JUNIOR
elif "intern" in full_title:
job["experience_level"] = JobExperienceLevel.INTERN
return ExternalJob(**job)
if __name__ == "__main__":
workable_data = {
"id": "1234",
"title": "Software Engineer",
"full_title": "Senior Software Engineer - Backend",
"shortcode": "SE001",
"state": "published",
"department": "Engineering",
"url": "https://example.com/jobs/se001",
"application_url": "https://example.com/apply/se001",
"shortlink": "https://wrbl.in/se001",
"location": {
"location_str": "New York, NY, United States",
"country": "United States",
"country_code": "US",
"region": "New York",
"region_code": "NY",
"city": "New York",
"zip_code": "10001",
"telecommuting": False,
"workplace_type": "on_site"
},
"salary": {
"salary_from": 80000,
"salary_to": 120000,
"salary_currency": "USD"
},
"created_at": "2023年01月01日T00:00:00Z",
"department_hierarchy": [
{"id": 1, "name": "Technology"},
{"id": 2, "name": "Engineering"}
]
}
workable_job = WorkableParser.parse(workable_data)
 print(workable_job.model_dump_json(indent=2)) # Pydantic v2; the v1 equivalent is .json(indent=2)
Result
{
"title": "Software Engineer",
"source": "WORKABLE",
"salary_max": "120000",
"contract_type": "FULL_TIME",
"vacancy_count": 1,
"salary_negotiable": false,
"status": "ACTIVE",
"experience_level": "SENIOR",
"salary_min": null,
"education_level": null,
"description": null,
"responsibilities": null,
"department": null,
"url": null,
"sector": null,
"location": null,
"latlng": null,
"external_reference_id": null,
"scheduled_on_date": null,
"closing_on_date": null,
"published_on_date": null
}
Comment (Bob, Sep 26, 2024): This is a good point. If I am already going to the trouble of writing a mapper, why not just maintain a parser? Thanks for the new perspective. I'm marking this as the answer as it's what I am going to use.