PARTIAL FIX: Improve leading zeros preservation with dtype=str for dict-based dtypes #62242

Original file line number	Diff line number	Diff line change
Expand Up		@@ -987,6 +987,7 @@ I/O
		- Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
		- Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`)
		- Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision (:issue:`56136`)
	- Bug in :meth:`read_csv` with ``engine="pyarrow"`` not preserving leading zeros when ``dtype`` is a dictionary mapping columns to ``str`` (:issue:`57666`)
		- Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
		- Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`)
		- Bug in :meth:`read_json` ignoring the given ``dtype`` when ``engine="pyarrow"`` (:issue:`59516`)
Expand Down

26 changes: 26 additions & 0 deletions pandas/io/parsers/arrow_parser_wrapper.py

Show comments View file Open in desktop

Original file line number	Diff line number	Diff line change
Expand Up		@@ -19,6 +19,8 @@
		)
		from pandas.core.dtypes.inference import is_integer

	from pandas.core.arrays.arrow.array import to_pyarrow_type

		from pandas.io._util import arrow_table_to_pandas
		from pandas.io.parsers.base_parser import ParserBase

Expand Down Expand Up		@@ -139,6 +141,30 @@ def handle_warning(invalid_row) -> str:
		f"f{n}" for n in self.convert_options["include_columns"]
		]

	if self.dtype is not None:
	if isinstance(self.dtype, dict):
	column_types = {}
	for col, col_dtype in self.dtype.items():
	source_dtype = pandas_dtype(col_dtype)

	try:
	target_dtype = to_pyarrow_type(source_dtype.type)
	if target_dtype:
	column_types[col] = target_dtype

	except TypeError:
	# TODO: Unsupported dtypes silently ignored - may cause
	# unexpected behavior when pyarrow applies default inference
	# instead of user's dtype
	pass

	if column_types:
	self.convert_options["column_types"] = column_types
	else:
	# TODO: Global dtypes not supported - may cause inconsistent behavior
	# between engines, especially for leading zero preservation
	pass

		self.read_options = {
		"autogenerate_column_names": self.header is None,
		"skip_rows": self.header
Expand Down

71 changes: 71 additions & 0 deletions pandas/tests/io/parser/test_preserve_leading_zeros.py

Show comments View file Open in desktop

Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,71 @@
	from io import StringIO

	import pytest


	def test_leading_zeros_preserved_with_dtype_str(all_parsers, request):
	    # GH#57666: pyarrow engine strips leading zeros when dtype=str is passed
	    # GH#61618: further discussion on ensuring string dtype preservation
	    # across engines
	    parser = all_parsers

	    if getattr(parser, "engine", "unknown") == "pyarrow":
	        # Temporary workaround for GH#57666; remove once type preservation
	        # is fixed in the pyarrow engine.
	        # The marker is registered before the body of the test runs instead
	        # of from inside an ``except`` handler. It stays non-strict and is
	        # limited to AssertionError, so only failed value checks are
	        # tolerated and a future fix will not turn the test red.
	        request.applymarker(
	            pytest.mark.xfail(
	                reason="pyarrow engine strips leading zeros with dtype=str",
	                raises=AssertionError,
	                strict=False,
	            )
	        )

	    data = """col1,col2,col3,col4
	AB,000388907,abc,0150
	CD,101044572,def,0150
	EF,000023607,ghi,0205
	GH,100102040,jkl,0205"""

	    result = parser.read_csv(
	        StringIO(data),
	        dtype=str,
	    )

	    # The frame layout must be unaffected by the dtype request.
	    assert result.shape == (4, 4)
	    assert list(result.columns) == ["col1", "col2", "col3", "col4"]

	    # Numeric-looking cells must come back as the exact original text.
	    assert result.loc[0, "col2"] == "000388907", "lost zeros in col2 row 0"
	    assert result.loc[2, "col2"] == "000023607", "lost zeros in col2 row 2"
	    assert result.loc[0, "col4"] == "0150", "lost zeros in col4 row 0"
	    assert result.loc[2, "col4"] == "0205", "lost zeros in col4 row 2"


	def test_leading_zeros_preserved_with_dtype_dict(all_parsers):
	    # GH#57666: leading zeros must survive when string columns are requested
	    # through a per-column dtype mapping
	    # GH#61618: further discussion on ensuring string dtype preservation
	    # across engines
	    csv_text = """col1,col2,col3,col4
	AB,000388907,199,0150
	CD,101044572,200,0150
	EF,000023607,201,0205
	GH,100102040,202,0205"""

	    frame = all_parsers.read_csv(
	        StringIO(csv_text),
	        dtype={"col2": str, "col3": int, "col4": str},
	    )

	    assert frame.shape == (4, 4)
	    assert list(frame.columns) == ["col1", "col2", "col3", "col4"]

	    # (row, column, expected text, failure message) for the string columns,
	    # checked in the order listed.
	    string_checks = (
	        (0, "col2", "000388907", "lost zeros in col2 row 0"),
	        (2, "col2", "000023607", "lost zeros in col2 row 2"),
	        (0, "col4", "0150", "lost zeros in col4 row 0"),
	        (2, "col4", "0205", "lost zeros in col4 row 2"),
	    )
	    for row, column, expected, message in string_checks:
	        assert frame.loc[row, column] == expected, message

	    # The column requested as int must still hold integer values.
	    for row, expected in enumerate((199, 200, 201, 202)):
	        assert frame.loc[row, "col3"] == expected

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Uh oh!

PARTIAL FIX: Improve leading zeros preservation with dtype=str for dict-based dtypes #62242

Are you sure you want to change the base?

Uh oh!

PARTIAL FIX: Improve leading zeros preservation with dtype=str for dict-based dtypes #62242

Filter by extension

Uh oh!

Uh oh!

Diff view

Diff view

There are no files selected for viewing

@jbrockmendel jbrockmendel Sep 3, 2025

Choose a reason for hiding this comment

Uh oh!

@dxdc dxdc Sep 3, 2025

Choose a reason for hiding this comment

Uh oh!

@jbrockmendel jbrockmendel Sep 3, 2025

Choose a reason for hiding this comment

Uh oh!

@dxdc dxdc Sep 3, 2025

Choose a reason for hiding this comment

Uh oh!

Uh oh!