diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 5671273..0ad3df6 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -24,7 +24,7 @@ jobs:
     - name: Install package
       run: |
         python -m pip install --upgrade pip
-        pip install -e .
+        pip install -e ".[pandas]"
 
     - name: Run tests
       run: python -m unittest discover -s tests -p 'test*.py' -v
diff --git a/nsv/__init__.py b/nsv/__init__.py
index 46d7124..be6d222 100644
--- a/nsv/__init__.py
+++ b/nsv/__init__.py
@@ -12,18 +12,59 @@ def patch_pandas():
     if 'pandas' not in sys.modules:
         return
     pd = sys.modules['pandas']
+    from pandas.io.parsers.readers import STR_NA_VALUES
 
-    def read_nsv(filepath_or_buffer, **kwargs):
+    bool_values = frozenset({'true', 'false'})
+
+    def _infer_column(col):
+        """Infer a dtype for one column of strings.
+
+        Cells listed in STR_NA_VALUES become NaN; the column is then
+        converted to numeric if every remaining cell parses as a number,
+        to bool if every remaining cell is 'true'/'false' in any letter
+        case, and is otherwise returned with only the NaN substitution.
+        """
+        na_mask = col.isin(STR_NA_VALUES)
+        col_na = col.where(~na_mask)
+
+        converted = pd.to_numeric(col_na, errors='coerce')
+        # Accept only if coercion created no NaN beyond the NA cells.
+        if not (converted.isna() & col_na.notna()).any():
+            return converted
+
+        non_na = col_na.dropna()
+        if len(non_na) > 0 and non_na.str.lower().isin(bool_values).all():
+            as_bool = col_na.map(lambda x: x.lower() == 'true' if pd.notna(x) else x)
+            # Cast to bool only when there is no NA to preserve.
+            return as_bool if na_mask.any() else as_bool.astype(bool)
+
+        return col_na
+
+    def read_nsv(filepath_or_buffer, dtype=None, **kwargs):
+        """Read NSV from a path or file object into a DataFrame.
+
+        With ``dtype`` given, the frame is cast via ``DataFrame.astype``;
+        otherwise each column's dtype is inferred.
+        """
         if isinstance(filepath_or_buffer, str):
             with open(filepath_or_buffer, 'r') as f:
                 data = load(f)
         else:
             data = load(filepath_or_buffer)
-        return pd.DataFrame(data)
+        df = pd.DataFrame(data)
+        if dtype is not None:
+            df = df.astype(dtype)
+        else:
+            for col in df.columns:
+                df[col] = _infer_column(df[col])
+        return df
 
     def to_nsv(self, path_or_buf=None, **kwargs):
-        # TODO: this is naive, pandas can have non-string values
-        data = self.values
+        # Iterate rows with itertuples rather than .values: .values upcasts
+        # the whole frame to one dtype, so an int column next to a float
+        # column would be written as '1.0' and read back as float.
+        rows = self.itertuples(index=False, name=None)
+        data = [['' if pd.isna(v) else str(v) for v in row] for row in rows]
 
         if path_or_buf is None:
             return dumps(data)
diff --git a/nsv/core.py b/nsv/core.py
index 72a4be2..7dc6214 100644
--- a/nsv/core.py
+++ b/nsv/core.py
@@ -30,7 +30,7 @@ def dump(data: Iterable[Iterable[str]], file_obj):
 def dumps(data: Iterable[Iterable[str]]) -> str:
     """Write elements to an NSV string."""
     lines = []
-    for i, row in enumerate(data):
+    for row in data:
         for cell in row:
             lines.append(Writer.escape(cell))
         lines.append('')
diff --git a/nsv/reader.py b/nsv/reader.py
index 173239d..76e0345 100644
--- a/nsv/reader.py
+++ b/nsv/reader.py
@@ -63,7 +63,7 @@ def check(s: str):
         else:
             col += 1
     if escaped:
-        sus.append(len(s) - 1)
+        sus.append((len(s) - 1, line, col))
     for pos, line, col in sus:
         print(f'WARNING: Unescaped backslash at position {pos} ({line}:{col})')
     if s[-1] != '\n':
diff --git a/tests/test_pandas.py b/tests/test_pandas.py
new file mode 100644
index 0000000..ee2c628
--- /dev/null
+++ b/tests/test_pandas.py
@@ -0,0 +1,179 @@
+import unittest
+from io import StringIO
+
+import pandas as pd
+import numpy as np
+
+import nsv
+
+
+def setUpModule():
+    nsv.patch_pandas()
+
+
+class TestReadNsvTypeInference(unittest.TestCase):
+    """read_nsv should infer types the same way read_csv does."""
+
+    def _compare_with_csv(self, rows):
+        """Assert that read_nsv produces the same dtypes and values as read_csv."""
+        nsv_str = nsv.dumps(rows)
+        csv_str = '\n'.join(','.join(row) for row in rows) + '\n'
+
+        nsv_df = pd.read_nsv(StringIO(nsv_str))
+        csv_df = pd.read_csv(StringIO(csv_str), header=None)
+
+        self.assertEqual(list(nsv_df.dtypes), list(csv_df.dtypes),
+                         f"dtype mismatch for rows={rows}")
+        pd.testing.assert_frame_equal(nsv_df, csv_df)
+
+    def test_integers(self):
+        self._compare_with_csv([['1', '2'], ['3', '4']])
+
+    def test_floats(self):
+        self._compare_with_csv([['1.5', '2.5'], ['3.5', '4.5']])
+
+    def test_mixed_int_float(self):
+        self._compare_with_csv([['1', '2.5'], ['3', '4.5']])
+
+    def test_strings(self):
+        self._compare_with_csv([['hello', 'world'], ['foo', 'bar']])
+
+    def test_mixed_numeric_and_string(self):
+        self._compare_with_csv([['123', 'abc'], ['456', 'def']])
+
+    def test_empty_fields_in_numeric_column(self):
+        self._compare_with_csv([['1', 'a'], ['', 'b'], ['3', 'c']])
+
+    def test_scientific_notation(self):
+        self._compare_with_csv([['1.23e5', '4.56e-2'], ['7.89e1', '0.12e3']])
+
+    def test_negative_numbers(self):
+        self._compare_with_csv([['-1', '-2.5'], ['3', '4.5']])
+
+    def test_all_empty(self):
+        self._compare_with_csv([['', ''], ['', '']])
+
+
+class TestReadNsvNullInference(unittest.TestCase):
+    """read_nsv should treat the same strings as NaN that read_csv does."""
+
+    def _compare_with_csv(self, rows):
+        nsv_str = nsv.dumps(rows)
+        csv_str = '\n'.join(','.join(row) for row in rows) + '\n'
+        nsv_df = pd.read_nsv(StringIO(nsv_str))
+        csv_df = pd.read_csv(StringIO(csv_str), header=None)
+        self.assertEqual(list(nsv_df.dtypes), list(csv_df.dtypes),
+                         f"dtype mismatch for rows={rows}")
+        pd.testing.assert_frame_equal(nsv_df, csv_df)
+
+    def test_na_string_in_numeric_column(self):
+        self._compare_with_csv([['NA', '1'], ['2', '3']])
+
+    def test_nan_string_in_numeric_column(self):
+        self._compare_with_csv([['NaN', '1'], ['2', '3']])
+
+    def test_nan_lowercase_in_numeric_column(self):
+        self._compare_with_csv([['nan', '1'], ['2', '3']])
+
+    def test_null_string_in_numeric_column(self):
+        self._compare_with_csv([['null', '1'], ['2', '3']])
+
+    def test_none_string_in_numeric_column(self):
+        self._compare_with_csv([['None', '1'], ['2', '3']])
+
+    def test_na_string_in_string_column(self):
+        self._compare_with_csv([['hello', 'NA'], ['world', 'there']])
+
+    def test_all_na_column(self):
+        self._compare_with_csv([['NA', 'a'], ['NaN', 'b'], ['null', 'c']])
+
+
+class TestReadNsvBoolInference(unittest.TestCase):
+    """read_nsv should infer bool columns the same way read_csv does."""
+
+    def _compare_with_csv(self, rows):
+        nsv_str = nsv.dumps(rows)
+        csv_str = '\n'.join(','.join(row) for row in rows) + '\n'
+        nsv_df = pd.read_nsv(StringIO(nsv_str))
+        csv_df = pd.read_csv(StringIO(csv_str), header=None)
+        self.assertEqual(list(nsv_df.dtypes), list(csv_df.dtypes),
+                         f"dtype mismatch for rows={rows}")
+        pd.testing.assert_frame_equal(nsv_df, csv_df)
+
+    def test_bool_true_false(self):
+        self._compare_with_csv([['True', 'False'], ['True', 'False']])
+
+    def test_bool_lowercase(self):
+        self._compare_with_csv([['true', 'false'], ['true', 'false']])
+
+    def test_bool_uppercase(self):
+        self._compare_with_csv([['TRUE', 'FALSE'], ['TRUE', 'FALSE']])
+
+    def test_bool_mixed_case(self):
+        self._compare_with_csv([['True', 'false'], ['FALSE', 'True']])
+
+    def test_bool_with_na(self):
+        # NA mixed in: read_csv returns object with Python bools and nan
+        self._compare_with_csv([['True', 'a'], ['NA', 'b'], ['False', 'c']])
+
+    def test_not_bool_T_F(self):
+        # 'T'/'F' are NOT inferred as bool by read_csv
+        self._compare_with_csv([['T', 'F'], ['T', 'F']])
+
+
+class TestReadNsvDtype(unittest.TestCase):
+    """read_nsv should support explicit dtype parameter."""
+
+    def test_dtype_str_suppresses_inference(self):
+        data = [['123', '456'], ['789', '012']]
+        nsv_str = nsv.dumps(data)
+        df = pd.read_nsv(StringIO(nsv_str), dtype=str)
+        for col in df.columns:
+            self.assertFalse(pd.api.types.is_numeric_dtype(df[col]))
+        self.assertEqual(df.iloc[0, 0], '123')
+
+    def test_dtype_per_column(self):
+        data = [['123', '4.5'], ['789', '6.7']]
+        nsv_str = nsv.dumps(data)
+        df = pd.read_nsv(StringIO(nsv_str), dtype={0: float, 1: str})
+        self.assertTrue(pd.api.types.is_float_dtype(df[0]))
+        self.assertFalse(pd.api.types.is_numeric_dtype(df[1]))
+
+
+class TestToNsv(unittest.TestCase):
+    """to_nsv should handle non-string types gracefully."""
+
+    def test_roundtrip_integers(self):
+        df = pd.DataFrame({0: [1, 2, 3], 1: [4, 5, 6]})
+        nsv_str = df.to_nsv()
+        self.assertIsInstance(nsv_str, str)
+        df2 = pd.read_nsv(StringIO(nsv_str))
+        pd.testing.assert_frame_equal(df, df2)
+
+    def test_roundtrip_floats(self):
+        df = pd.DataFrame({0: [1.5, 2.5], 1: [3.5, 4.5]})
+        nsv_str = df.to_nsv()
+        df2 = pd.read_nsv(StringIO(nsv_str))
+        pd.testing.assert_frame_equal(df, df2)
+
+    def test_roundtrip_mixed(self):
+        df = pd.DataFrame({0: [1, 2], 1: ['x', 'y']})
+        nsv_str = df.to_nsv()
+        df2 = pd.read_nsv(StringIO(nsv_str))
+        pd.testing.assert_frame_equal(df, df2)
+
+    def test_roundtrip_int_and_float_columns(self):
+        # An int column must stay int even when a float column sits beside it.
+        df = pd.DataFrame({0: [1, 2], 1: [1.5, 2.5]})
+        nsv_str = df.to_nsv()
+        df2 = pd.read_nsv(StringIO(nsv_str))
+        pd.testing.assert_frame_equal(df, df2)
+
+    def test_nan_becomes_empty(self):
+        df = pd.DataFrame({'a': [1.0, float('nan'), 3.0]})
+        nsv_str = df.to_nsv()
+        rows = nsv.loads(nsv_str)
+        self.assertEqual(rows[1], [''])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 0237151..802f130 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -42,7 +42,7 @@ def dump_then_load(data):
 
 
 def load_then_dump(s):
-    return nsv.dumps(*nsv.loads(s))
+    return nsv.dumps(nsv.loads(s))
 
 
 def load_sample(name):