From de8678c98b4a7b165ca2cefc926d932c0e87c48d Mon Sep 17 00:00:00 2001 From: Jan Bielecki Date: Fri, 29 Mar 2024 12:52:53 +0100 Subject: [PATCH 1/6] init --- pdtable/io/parsers/blocks.py | 37 ++++++++++++++++++++++++--- pdtable/test/io/input/only_tables.csv | 13 ++++++++++ pdtable/test/io/test_csv.py | 14 ++++++++++ 3 files changed, 60 insertions(+), 4 deletions(-) create mode 100644 pdtable/test/io/input/only_tables.csv diff --git a/pdtable/io/parsers/blocks.py b/pdtable/io/parsers/blocks.py index b45ca34..70354fd 100644 --- a/pdtable/io/parsers/blocks.py +++ b/pdtable/io/parsers/blocks.py @@ -24,11 +24,9 @@ - The original, raw cell grid, in case the user wants to do some low-level processing. """ -from abc import abstractmethod import itertools import re from typing import Sequence, Optional, Tuple, Any, Iterable, List, Union, Dict -from collections import defaultdict import pandas as pd import warnings @@ -39,7 +37,6 @@ LocationSheet, NullLocationFile, TableOrigin, - InputIssue, InputIssueTracker, NullInputIssueTracker, ) @@ -49,6 +46,37 @@ from ...auxiliary import MetadataBlock, Directive from ...table_metadata import TableMetadata + +class EncodingException(Exception): + pass + + +def check_encoding(cell_rows: Iterable[Sequence]) -> Iterable[Sequence]: + """ + CSV file can have a BOM character at the start. + Reading file with a default encoding does not raise an issue, + but in such a case we ignore the first line + (and the whole table if the file starts with a table block). + This function checks if we loaded the file content with a correct encoding + and raise an EncodingException if not. + """ + first_cell_row = next(cell_rows) + + if first_cell_row is not None and len(first_cell_row) > 0 and len(first_cell_row[0]) > 0: + first_sign = first_cell_row[0][0] + + try: + first_sign.encode("ascii") + except UnicodeEncodeError: + raise EncodingException( + f'File starts with no ascii character "{first_sign}". ' + 'Please verify the file encoding.' + ) + + yield first_cell_row + yield from cell_rows + + # Typing alias: 2D grid of cells with rows and cols. Intended indexing: cell_grid[row][col] CellGrid = Sequence[Sequence] @@ -451,7 +479,8 @@ def block_output(block_type, cell_grid, row: int): state = BlockType.METADATA next_state = None this_block_1st_row = 0 - for row_number_0based, row in enumerate(cell_rows): + + for row_number_0based, row in enumerate(check_encoding(cell_rows)): if row is None or len(row) == 0 or _is_cell_blank(row[0]): if state != BlockType.BLANK: next_state = BlockType.BLANK diff --git a/pdtable/test/io/input/only_tables.csv b/pdtable/test/io/input/only_tables.csv new file mode 100644 index 0000000..1e485e6 --- /dev/null +++ b/pdtable/test/io/input/only_tables.csv @@ -0,0 +1,13 @@ +**generic_inf;;;;;;;;;;;;; +all;;;;;;;;;;;;; +FATIMA_alias;node;constraint_alias;symmetry;sn_curve;sectional_force_modification;pristrco;signco;alpha;cutpoint_tol;file_name;transformation;IO;detail_type +text;text;text;text;text;text;text;-;-;mm;text;text;text;- +C00001;B0C066;C00001;rotate;F3;-;0;3;0.8;2000;..\..\..\inputs\INF\J_tube\CHW2204_INF_Swan_Neck_a30_root_V2.txt;;I;1 +;;;;;;;;;;;;; +;;;;;;;;;;;;; +**generic_inf_constraints;;;;;;;;;;;;; +all;;;;;;;;;;;;; +constraint_alias;element;symmetry;cut_point_name;node;cut_distance;;;;;;;; +text;text;text;text;text;m;;;;;;;; +C00001;C660L;rotate;BRACE1;B0C066;3.091;;;;;;;; +C00001;CJT1V;rotate;BRACE2;B0C066;1.5319;;;;;;;; diff --git a/pdtable/test/io/test_csv.py b/pdtable/test/io/test_csv.py index d0969d8..a2fcc09 100644 --- a/pdtable/test/io/test_csv.py +++ b/pdtable/test/io/test_csv.py @@ -5,10 +5,12 @@ from pytest import fixture, raises import pandas as pd +import pytest import pdtable from pdtable import Table, BlockType, read_csv, write_csv from pdtable.io.csv import _table_to_csv +from pdtable.io.parsers.blocks import EncodingException from pdtable.table_metadata import ColumnFormat @@ -417,3 +419,15 @@ def test__table_is_preserved_when_written_to_and_read_from_csv(): assert table_read.column_names == table_write.column_names assert table_read.units == table_write.units assert table_read.destinations == table_write.destinations + + +def test_read_csv_starting_with_bom(): + only_tables_path = Path(__file__).parent / "input" / "only_tables.csv" + + with pytest.raises(EncodingException): + list(read_csv(source=only_tables_path)) + + source = open(only_tables_path, mode='r', encoding='utf-8-sig') + bls = list(read_csv(source=source)) + tables = [bl for ty, bl in bls if ty == BlockType.TABLE] + assert tables[0].name == "generic_inf" From 2199433c0fbae20f0365a2c2b84c3330b1af06cd Mon Sep 17 00:00:00 2001 From: Jan Bielecki Date: Fri, 29 Mar 2024 13:12:40 +0100 Subject: [PATCH 2/6] continue --- pdtable/io/parsers/blocks.py | 3 +++ pdtable/test/io/input/only_tables_no_bom.csv | 13 +++++++++ ...es.csv => only_tables_starts_with_bom.csv} | 0 pdtable/test/io/test_csv.py | 27 ++++++++++++------- 4 files changed, 34 insertions(+), 9 deletions(-) create mode 100644 pdtable/test/io/input/only_tables_no_bom.csv rename pdtable/test/io/input/{only_tables.csv => only_tables_starts_with_bom.csv} (100%) diff --git a/pdtable/io/parsers/blocks.py b/pdtable/io/parsers/blocks.py index 70354fd..b72d07a 100644 --- a/pdtable/io/parsers/blocks.py +++ b/pdtable/io/parsers/blocks.py @@ -60,6 +60,9 @@ def check_encoding(cell_rows: Iterable[Sequence]) -> Iterable[Sequence]: This function checks if we loaded the file content with a correct encoding and raise an EncodingException if not. """ + if isinstance(cell_rows, list): + cell_rows = iter(cell_rows) + first_cell_row = next(cell_rows) if first_cell_row is not None and len(first_cell_row) > 0 and len(first_cell_row[0]) > 0: diff --git a/pdtable/test/io/input/only_tables_no_bom.csv b/pdtable/test/io/input/only_tables_no_bom.csv new file mode 100644 index 0000000..ba8245d --- /dev/null +++ b/pdtable/test/io/input/only_tables_no_bom.csv @@ -0,0 +1,13 @@ +**generic_inf;;;;;;;;;;;;; +all;;;;;;;;;;;;; +FATIMA_alias;node;constraint_alias;symmetry;sn_curve;sectional_force_modification;pristrco;signco;alpha;cutpoint_tol;file_name;transformation;IO;detail_type +text;text;text;text;text;text;text;-;-;mm;text;text;text;- +C00001;B0C066;C00001;rotate;F3;-;0;3;0.8;2000;..\..\..\inputs\INF\J_tube\CHW2204_INF_Swan_Neck_a30_root_V2.txt;;I;1 +;;;;;;;;;;;;; +;;;;;;;;;;;;; +**generic_inf_constraints;;;;;;;;;;;;; +all;;;;;;;;;;;;; +constraint_alias;element;symmetry;cut_point_name;node;cut_distance;;;;;;;; +text;text;text;text;text;m;;;;;;;; +C00001;C660L;rotate;BRACE1;B0C066;3.091;;;;;;;; +C00001;CJT1V;rotate;BRACE2;B0C066;1.5319;;;;;;;; diff --git a/pdtable/test/io/input/only_tables.csv b/pdtable/test/io/input/only_tables_starts_with_bom.csv similarity index 100% rename from pdtable/test/io/input/only_tables.csv rename to pdtable/test/io/input/only_tables_starts_with_bom.csv diff --git a/pdtable/test/io/test_csv.py b/pdtable/test/io/test_csv.py index a2fcc09..5311706 100644 --- a/pdtable/test/io/test_csv.py +++ b/pdtable/test/io/test_csv.py @@ -335,15 +335,18 @@ def test_read_csv__sep_is_comma(csv_data): assert len(template_rows) == 1 +_input_dir = Path(__file__).parent / "input" + + def test_read_csv__from_stream(): - with open(Path(__file__).parent / "input" / "bundle.csv", "r") as fh: + with open(_input_dir / "bundle.csv", "r") as fh: bls = list(read_csv(fh)) tables = [bl for ty, bl in bls if ty == BlockType.TABLE] assert tables[1].name == "spelling_numbers" # raises exception on common error if not text stream with raises(Exception): - with open(Path(__file__).parent / "input" / "bundle.csv", "rb") as fh: # binary stream! + with open(_input_dir / "bundle.csv", "rb") as fh: # binary stream! bls = list(read_csv(fh)) tables = [bl for ty, bl in bls if ty == BlockType.TABLE] @@ -421,13 +424,19 @@ def test__table_is_preserved_when_written_to_and_read_from_csv(): assert table_read.destinations == table_write.destinations -def test_read_csv_starting_with_bom(): - only_tables_path = Path(__file__).parent / "input" / "only_tables.csv" +def test_read_csv_only_tables_starting_with_bom(): + only_tables_starts_with_bom_path = _input_dir / "only_tables_starts_with_bom.csv" with pytest.raises(EncodingException): - list(read_csv(source=only_tables_path)) + list(read_csv(source=only_tables_starts_with_bom_path)) - source = open(only_tables_path, mode='r', encoding='utf-8-sig') - bls = list(read_csv(source=source)) - tables = [bl for ty, bl in bls if ty == BlockType.TABLE] - assert tables[0].name == "generic_inf" + source = open(only_tables_starts_with_bom_path, mode='r', encoding='utf-8-sig') + tables = list(read_csv(source=source)) + assert tables[0][1].name == "generic_inf" + + +def test_read_csv_only_tables_no_bom(): + only_tables_no_bom_path = _input_dir / "only_tables_no_bom.csv" + source = open(only_tables_no_bom_path, mode='r', encoding='utf-8-sig') + tables = list(read_csv(source=source)) + assert tables[0][1].name == "generic_inf" From 42336b90b902ea8705c4b0ab4549eca1cc7e083c Mon Sep 17 00:00:00 2001 From: Jan Bielecki Date: Fri, 29 Mar 2024 13:55:11 +0100 Subject: [PATCH 3/6] UnknownMethodWarning introduced --- pdtable/frame.py | 7 ++++++- pdtable/test/test_pdtable.py | 22 ++++++++++++++-------- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/pdtable/frame.py b/pdtable/frame.py index edf4632..69876b7 100644 --- a/pdtable/frame.py +++ b/pdtable/frame.py @@ -67,6 +67,10 @@ class InvalidTableCombineError(Exception): pass +class UnknownMethodWarning(Warning): + pass + + def _combine_tables( obj: "TableDataFrame", other, method, **kwargs ) -> Optional[ComplementaryTableInfo]: @@ -97,7 +101,8 @@ def _combine_tables( warnings.warn( f'While combining pdTable metadata an unknown __finalize__ method "{method}" was encountered. ' f"Will try to propagate metadata with generic methods, but please check outcome of this " - f"and notify pdTable maintainers." + f"and notify pdTable maintainers.", + category=UnknownMethodWarning ) data = [d for d in (getattr(s, _TABLE_INFO_FIELD_NAME, None) for s in src) if d is not None] diff --git a/pdtable/test/test_pdtable.py b/pdtable/test/test_pdtable.py index 319694c..1519008 100644 --- a/pdtable/test/test_pdtable.py +++ b/pdtable/test/test_pdtable.py @@ -380,11 +380,17 @@ def table_data_frame() -> frame.TableDataFrame: ) +def unknown_method_warnings(warnings_list: list[Warning]) -> list[frame.UnknownMethodWarning]: + return [ + warning for warning in warnings_list + if issubclass(warning.category, frame.UnknownMethodWarning) + ] + class TestFinalize: def test_replace_ok(self, table_data_frame: frame.TableDataFrame) -> None: with warnings.catch_warnings(record=True) as w: table_data_frame.replace('a', 'z') - assert len(w) == 0 + assert len(unknown_method_warnings(warnings_list=w)) == 0 def test_replace_not_allowed_unit(self, table_data_frame: frame.TableDataFrame) -> None: with pytest.raises(ColumnUnitException): @@ -395,7 +401,7 @@ def test_sort_index_ok(self, table_data_frame: frame.TableDataFrame) -> None: with warnings.catch_warnings(record=True) as w: table_data_frame.sort_index() - assert len(w) == 0 + assert len(unknown_method_warnings(warnings_list=w)) == 0 def test_transpose_ok(self, table_data_frame: frame.TableDataFrame) -> None: """ @@ -411,7 +417,7 @@ def test_astype_ok(self, table_data_frame: frame.TableDataFrame) -> None: with warnings.catch_warnings(record=True) as w: table_data_frame_new_type = table_data_frame.astype({'B': float}) - assert len(w) == 0 + assert len(unknown_method_warnings(warnings_list=w)) == 0 assert isinstance(table_data_frame_new_type['B'].iloc[0], np.float64) @@ -429,7 +435,7 @@ def test_append_with_loc_ok(self, table_data_frame: frame.TableDataFrame) -> Non """ with warnings.catch_warnings(record=True) as w: table_data_frame.loc[999] = {'A': 'y', 'B': 1, 'C': True} - assert len(w) == 0 + assert len(unknown_method_warnings(warnings_list=w)) == 0 assert 6 == table_data_frame.shape[0] @@ -443,7 +449,7 @@ def test_fillna_ok(self, table_data_frame: frame.TableDataFrame) -> None: with warnings.catch_warnings(record=True) as w: table_data_frame_new_type.fillna(123) - assert len(w) == 0 + assert len(unknown_method_warnings(warnings_list=w)) == 0 @pytest.mark.skipif( sys.version_info < (3, 8), @@ -466,7 +472,7 @@ def test_rename_columns(self, table_data_frame: frame.TableDataFrame) -> None: def test_rename_index(self, table_data_frame: frame.TableDataFrame) -> None: with warnings.catch_warnings(record=True) as w: table_data_frame.rename(index={1: 'a', 2: 'b'}) - assert len(w) == 0 + assert len(unknown_method_warnings(warnings_list=w)) == 0 def test_unstack(self, table_data_frame: frame.TableDataFrame) -> None: """ @@ -483,7 +489,7 @@ def test_unstack(self, table_data_frame: frame.TableDataFrame) -> None: with warnings.catch_warnings(record=True) as w: unstacked_table_data_frame = table_data_frame.unstack() - assert len(w) == 0 + assert len(unknown_method_warnings(warnings_list=w)) == 0 unstacked_col_name_to_unit = { name: col.unit for name, col in object.__getattribute__( @@ -510,7 +516,7 @@ def test_melt(self, table_data_frame: frame.TableDataFrame) -> None: """ with warnings.catch_warnings(record=True) as w: melted_table_data_frame = table_data_frame.melt(id_vars=['A'], value_vars=['B', 'C']) - assert len(w) == 0 + assert len(unknown_method_warnings(warnings_list=w)) == 0 melted_col_name_to_unit = { name: col.unit for name, col in object.__getattribute__( From 29f66ea396b6277c937beb4daa8814655409f864 Mon Sep 17 00:00:00 2001 From: Jan Bielecki Date: Fri, 29 Mar 2024 13:58:00 +0100 Subject: [PATCH 4/6] use old fashion typing to be compatible with python3.7 --- pdtable/test/test_pdtable.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/pdtable/test/test_pdtable.py b/pdtable/test/test_pdtable.py index 1519008..b321e8b 100644 --- a/pdtable/test/test_pdtable.py +++ b/pdtable/test/test_pdtable.py @@ -1,5 +1,6 @@ import sys from textwrap import dedent +from typing import List import warnings import pandas as pd @@ -380,7 +381,7 @@ def table_data_frame() -> frame.TableDataFrame: ) -def unknown_method_warnings(warnings_list: list[Warning]) -> list[frame.UnknownMethodWarning]: +def _unknown_method_warnings(warnings_list: List[Warning]) -> List[frame.UnknownMethodWarning]: return [ warning for warning in warnings_list if issubclass(warning.category, frame.UnknownMethodWarning) @@ -390,7 +391,7 @@ class TestFinalize: def test_replace_ok(self, table_data_frame: frame.TableDataFrame) -> None: with warnings.catch_warnings(record=True) as w: table_data_frame.replace('a', 'z') - assert len(unknown_method_warnings(warnings_list=w)) == 0 + assert len(_unknown_method_warnings(warnings_list=w)) == 0 def test_replace_not_allowed_unit(self, table_data_frame: frame.TableDataFrame) -> None: with pytest.raises(ColumnUnitException): @@ -401,7 +402,7 @@ def test_sort_index_ok(self, table_data_frame: frame.TableDataFrame) -> None: with warnings.catch_warnings(record=True) as w: table_data_frame.sort_index() - assert len(unknown_method_warnings(warnings_list=w)) == 0 + assert len(_unknown_method_warnings(warnings_list=w)) == 0 def test_transpose_ok(self, table_data_frame: frame.TableDataFrame) -> None: """ @@ -417,7 +418,7 @@ def test_astype_ok(self, table_data_frame: frame.TableDataFrame) -> None: with warnings.catch_warnings(record=True) as w: table_data_frame_new_type = table_data_frame.astype({'B': float}) - assert len(unknown_method_warnings(warnings_list=w)) == 0 + assert len(_unknown_method_warnings(warnings_list=w)) == 0 assert isinstance(table_data_frame_new_type['B'].iloc[0], np.float64) @@ -435,7 +436,7 @@ def test_append_with_loc_ok(self, table_data_frame: frame.TableDataFrame) -> Non """ with warnings.catch_warnings(record=True) as w: table_data_frame.loc[999] = {'A': 'y', 'B': 1, 'C': True} - assert len(unknown_method_warnings(warnings_list=w)) == 0 + assert len(_unknown_method_warnings(warnings_list=w)) == 0 assert 6 == table_data_frame.shape[0] @@ -449,7 +450,7 @@ def test_fillna_ok(self, table_data_frame: frame.TableDataFrame) -> None: with warnings.catch_warnings(record=True) as w: table_data_frame_new_type.fillna(123) - assert len(unknown_method_warnings(warnings_list=w)) == 0 + assert len(_unknown_method_warnings(warnings_list=w)) == 0 @pytest.mark.skipif( sys.version_info < (3, 8), @@ -472,7 +473,7 @@ def test_rename_columns(self, table_data_frame: frame.TableDataFrame) -> None: def test_rename_index(self, table_data_frame: frame.TableDataFrame) -> None: with warnings.catch_warnings(record=True) as w: table_data_frame.rename(index={1: 'a', 2: 'b'}) - assert len(unknown_method_warnings(warnings_list=w)) == 0 + assert len(_unknown_method_warnings(warnings_list=w)) == 0 def test_unstack(self, table_data_frame: frame.TableDataFrame) -> None: """ @@ -489,7 +490,7 @@ def test_unstack(self, table_data_frame: frame.TableDataFrame) -> None: with warnings.catch_warnings(record=True) as w: unstacked_table_data_frame = table_data_frame.unstack() - assert len(unknown_method_warnings(warnings_list=w)) == 0 + assert len(_unknown_method_warnings(warnings_list=w)) == 0 unstacked_col_name_to_unit = { name: col.unit for name, col in object.__getattribute__( @@ -516,7 +517,7 @@ def test_melt(self, table_data_frame: frame.TableDataFrame) -> None: """ with warnings.catch_warnings(record=True) as w: melted_table_data_frame = table_data_frame.melt(id_vars=['A'], value_vars=['B', 'C']) - assert len(unknown_method_warnings(warnings_list=w)) == 0 + assert len(_unknown_method_warnings(warnings_list=w)) == 0 melted_col_name_to_unit = { name: col.unit for name, col in object.__getattribute__( From 2d5829228d585f85f9d685e35487682aecfe2c96 Mon Sep 17 00:00:00 2001 From: Jan Bielecki Date: Tue, 2 Apr 2024 11:28:53 +0200 Subject: [PATCH 5/6] fix check_encoding --- pdtable/io/parsers/blocks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pdtable/io/parsers/blocks.py b/pdtable/io/parsers/blocks.py index b72d07a..2998dae 100644 --- a/pdtable/io/parsers/blocks.py +++ b/pdtable/io/parsers/blocks.py @@ -65,7 +65,7 @@ def check_encoding(cell_rows: Iterable[Sequence]) -> Iterable[Sequence]: first_cell_row = next(cell_rows) - if first_cell_row is not None and len(first_cell_row) > 0 and len(first_cell_row[0]) > 0: + if first_cell_row and len(first_cell_row) > 0 and first_cell_row[0]: first_sign = first_cell_row[0][0] try: From d5f00bd3c56ccc9aedfb73cc3d707656989ccc3a Mon Sep 17 00:00:00 2001 From: Jan Bielecki Date: Tue, 2 Apr 2024 15:25:23 +0200 Subject: [PATCH 6/6] fix check_encoding --- pdtable/io/parsers/blocks.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pdtable/io/parsers/blocks.py b/pdtable/io/parsers/blocks.py index 2998dae..70e32e0 100644 --- a/pdtable/io/parsers/blocks.py +++ b/pdtable/io/parsers/blocks.py @@ -63,7 +63,10 @@ def check_encoding(cell_rows: Iterable[Sequence]) -> Iterable[Sequence]: if isinstance(cell_rows, list): cell_rows = iter(cell_rows) - first_cell_row = next(cell_rows) + try: + first_cell_row = next(cell_rows) + except StopIteration: + return # generator is empty, do not yield anything if first_cell_row and len(first_cell_row) > 0 and first_cell_row[0]: first_sign = first_cell_row[0][0]