From de8678c98b4a7b165ca2cefc926d932c0e87c48d Mon Sep 17 00:00:00 2001
From: Jan Bielecki <jbiel@orsted.com>
Date: Fri, 29 Mar 2024 12:52:53 +0100
Subject: [PATCH 1/6] Raise EncodingException when input starts with a non-ASCII character (BOM)

---
 pdtable/io/parsers/blocks.py          | 37 ++++++++++++++++++++++++---
 pdtable/test/io/input/only_tables.csv | 13 ++++++++++
 pdtable/test/io/test_csv.py           | 14 ++++++++++
 3 files changed, 60 insertions(+), 4 deletions(-)
 create mode 100644 pdtable/test/io/input/only_tables.csv

diff --git a/pdtable/io/parsers/blocks.py b/pdtable/io/parsers/blocks.py
index b45ca34..70354fd 100644
--- a/pdtable/io/parsers/blocks.py
+++ b/pdtable/io/parsers/blocks.py
@@ -24,11 +24,9 @@
   - The original, raw cell grid, in case the user wants to do some low-level processing.
 
 """
-from abc import abstractmethod
 import itertools
 import re
 from typing import Sequence, Optional, Tuple, Any, Iterable, List, Union, Dict
-from collections import defaultdict
 import pandas as pd
 import warnings
 
@@ -39,7 +37,6 @@
     LocationSheet,
     NullLocationFile,
     TableOrigin,
-    InputIssue,
     InputIssueTracker,
     NullInputIssueTracker,
 )
@@ -49,6 +46,37 @@
 from ...auxiliary import MetadataBlock, Directive
 from ...table_metadata import TableMetadata
 
+
+class EncodingException(Exception):
+    """Raised by check_encoding when the first cell starts with a non-ASCII character."""
+
+
+def check_encoding(cell_rows: Iterable[Sequence]) -> Iterable[Sequence]:
+    """
+    CSV file can have a BOM character at the start.
+    Reading file with a default encoding does not raise an issue, 
+    but in such a case we ignore the first line 
+    (and the whole table if the file starts with a table block).
+    This function checks if we loaded the file content with a correct encoding 
+    and raise an EncodingException if not.
+    """
+    first_cell_row = next(cell_rows)
+
+    if first_cell_row is not None and len(first_cell_row) > 0 and len(first_cell_row[0]) > 0:
+        first_sign = first_cell_row[0][0]
+
+        try:
+            first_sign.encode("ascii")
+        except UnicodeEncodeError:
+            raise EncodingException(
+                f'File starts with non-ASCII character {first_sign!r}. '  # repr makes a BOM visible
+                'Please verify the file encoding.'
+            )
+
+    yield first_cell_row
+    yield from cell_rows
+
+
 # Typing alias: 2D grid of cells with rows and cols. Intended indexing: cell_grid[row][col]
 CellGrid = Sequence[Sequence]
 
@@ -451,7 +479,8 @@ def block_output(block_type, cell_grid, row: int):
     state = BlockType.METADATA
     next_state = None
     this_block_1st_row = 0
-    for row_number_0based, row in enumerate(cell_rows):
+
+    for row_number_0based, row in enumerate(check_encoding(cell_rows)):
         if row is None or len(row) == 0 or _is_cell_blank(row[0]):
             if state != BlockType.BLANK:
                 next_state = BlockType.BLANK
diff --git a/pdtable/test/io/input/only_tables.csv b/pdtable/test/io/input/only_tables.csv
new file mode 100644
index 0000000..1e485e6
--- /dev/null
+++ b/pdtable/test/io/input/only_tables.csv
@@ -0,0 +1,13 @@
+﻿**generic_inf;;;;;;;;;;;;;
+all;;;;;;;;;;;;;
+FATIMA_alias;node;constraint_alias;symmetry;sn_curve;sectional_force_modification;pristrco;signco;alpha;cutpoint_tol;file_name;transformation;IO;detail_type
+text;text;text;text;text;text;text;-;-;mm;text;text;text;-
+C00001;B0C066;C00001;rotate;F3;-;0;3;0.8;2000;..\..\..\inputs\INF\J_tube\CHW2204_INF_Swan_Neck_a30_root_V2.txt;;I;1
+;;;;;;;;;;;;;
+;;;;;;;;;;;;;
+**generic_inf_constraints;;;;;;;;;;;;;
+all;;;;;;;;;;;;;
+constraint_alias;element;symmetry;cut_point_name;node;cut_distance;;;;;;;;
+text;text;text;text;text;m;;;;;;;;
+C00001;C660L;rotate;BRACE1;B0C066;3.091;;;;;;;;
+C00001;CJT1V;rotate;BRACE2;B0C066;1.5319;;;;;;;;
diff --git a/pdtable/test/io/test_csv.py b/pdtable/test/io/test_csv.py
index d0969d8..a2fcc09 100644
--- a/pdtable/test/io/test_csv.py
+++ b/pdtable/test/io/test_csv.py
@@ -5,10 +5,12 @@
 
 from pytest import fixture, raises
 import pandas as pd
+import pytest
 
 import pdtable
 from pdtable import Table, BlockType, read_csv, write_csv
 from pdtable.io.csv import _table_to_csv
+from pdtable.io.parsers.blocks import EncodingException
 from pdtable.table_metadata import ColumnFormat
 
 
@@ -417,3 +419,15 @@ def test__table_is_preserved_when_written_to_and_read_from_csv():
     assert table_read.column_names == table_write.column_names
     assert table_read.units == table_write.units
     assert table_read.destinations == table_write.destinations
+
+
+def test_read_csv_starting_with_bom():
+    only_tables_path = Path(__file__).parent / "input" / "only_tables.csv"
+    
+    with pytest.raises(EncodingException):
+        list(read_csv(source=only_tables_path))
+    
+    source = open(only_tables_path, mode='r', encoding='utf-8-sig')
+    bls = list(read_csv(source=source))
+    tables = [bl for ty, bl in bls if ty == BlockType.TABLE]
+    assert tables[0].name == "generic_inf"

From 2199433c0fbae20f0365a2c2b84c3330b1af06cd Mon Sep 17 00:00:00 2001
From: Jan Bielecki <jbiel@orsted.com>
Date: Fri, 29 Mar 2024 13:12:40 +0100
Subject: [PATCH 2/6] check_encoding: accept list input; add BOM and no-BOM CSV tests

---
 pdtable/io/parsers/blocks.py                  |  3 +++
 pdtable/test/io/input/only_tables_no_bom.csv  | 13 +++++++++
 ...es.csv => only_tables_starts_with_bom.csv} |  0
 pdtable/test/io/test_csv.py                   | 27 ++++++++++++-------
 4 files changed, 34 insertions(+), 9 deletions(-)
 create mode 100644 pdtable/test/io/input/only_tables_no_bom.csv
 rename pdtable/test/io/input/{only_tables.csv => only_tables_starts_with_bom.csv} (100%)

diff --git a/pdtable/io/parsers/blocks.py b/pdtable/io/parsers/blocks.py
index 70354fd..b72d07a 100644
--- a/pdtable/io/parsers/blocks.py
+++ b/pdtable/io/parsers/blocks.py
@@ -60,6 +60,9 @@ def check_encoding(cell_rows: Iterable[Sequence]) -> Iterable[Sequence]:
     This function checks if we loaded the file content with a correct encoding 
     and raise an EncodingException if not.
     """
+    if isinstance(cell_rows, list):
+      cell_rows = iter(cell_rows)
+
     first_cell_row = next(cell_rows)
 
     if first_cell_row is not None and len(first_cell_row) > 0 and len(first_cell_row[0]) > 0:
diff --git a/pdtable/test/io/input/only_tables_no_bom.csv b/pdtable/test/io/input/only_tables_no_bom.csv
new file mode 100644
index 0000000..ba8245d
--- /dev/null
+++ b/pdtable/test/io/input/only_tables_no_bom.csv
@@ -0,0 +1,13 @@
+**generic_inf;;;;;;;;;;;;;
+all;;;;;;;;;;;;;
+FATIMA_alias;node;constraint_alias;symmetry;sn_curve;sectional_force_modification;pristrco;signco;alpha;cutpoint_tol;file_name;transformation;IO;detail_type
+text;text;text;text;text;text;text;-;-;mm;text;text;text;-
+C00001;B0C066;C00001;rotate;F3;-;0;3;0.8;2000;..\..\..\inputs\INF\J_tube\CHW2204_INF_Swan_Neck_a30_root_V2.txt;;I;1
+;;;;;;;;;;;;;
+;;;;;;;;;;;;;
+**generic_inf_constraints;;;;;;;;;;;;;
+all;;;;;;;;;;;;;
+constraint_alias;element;symmetry;cut_point_name;node;cut_distance;;;;;;;;
+text;text;text;text;text;m;;;;;;;;
+C00001;C660L;rotate;BRACE1;B0C066;3.091;;;;;;;;
+C00001;CJT1V;rotate;BRACE2;B0C066;1.5319;;;;;;;;
diff --git a/pdtable/test/io/input/only_tables.csv b/pdtable/test/io/input/only_tables_starts_with_bom.csv
similarity index 100%
rename from pdtable/test/io/input/only_tables.csv
rename to pdtable/test/io/input/only_tables_starts_with_bom.csv
diff --git a/pdtable/test/io/test_csv.py b/pdtable/test/io/test_csv.py
index a2fcc09..5311706 100644
--- a/pdtable/test/io/test_csv.py
+++ b/pdtable/test/io/test_csv.py
@@ -335,15 +335,18 @@ def test_read_csv__sep_is_comma(csv_data):
     assert len(template_rows) == 1
 
 
+_input_dir = Path(__file__).parent / "input"
+
+
 def test_read_csv__from_stream():
-    with open(Path(__file__).parent / "input" / "bundle.csv", "r") as fh:
+    with open(_input_dir / "bundle.csv", "r") as fh:
         bls = list(read_csv(fh))
         tables = [bl for ty, bl in bls if ty == BlockType.TABLE]
         assert tables[1].name == "spelling_numbers"
 
     # raises exception on common error if not text stream
     with raises(Exception):
-        with open(Path(__file__).parent / "input" / "bundle.csv", "rb") as fh:  # binary stream!
+        with open(_input_dir / "bundle.csv", "rb") as fh:  # binary stream!
             bls = list(read_csv(fh))
             tables = [bl for ty, bl in bls if ty == BlockType.TABLE]
 
@@ -421,13 +424,19 @@ def test__table_is_preserved_when_written_to_and_read_from_csv():
     assert table_read.destinations == table_write.destinations
 
 
-def test_read_csv_starting_with_bom():
-    only_tables_path = Path(__file__).parent / "input" / "only_tables.csv"
+def test_read_csv_only_tables_starting_with_bom():
+    only_tables_starts_with_bom_path = _input_dir / "only_tables_starts_with_bom.csv"
     
     with pytest.raises(EncodingException):
-        list(read_csv(source=only_tables_path))
+        list(read_csv(source=only_tables_starts_with_bom_path))
     
-    source = open(only_tables_path, mode='r', encoding='utf-8-sig')
-    bls = list(read_csv(source=source))
-    tables = [bl for ty, bl in bls if ty == BlockType.TABLE]
-    assert tables[0].name == "generic_inf"
+    with open(only_tables_starts_with_bom_path, mode='r', encoding='utf-8-sig') as source:
+        tables = list(read_csv(source=source))  # consume while the handle is still open
+    assert tables[0][1].name == "generic_inf"
+
+
+def test_read_csv_only_tables_no_bom():  # a BOM-less file read as utf-8-sig must still parse
+    only_tables_no_bom_path = _input_dir / "only_tables_no_bom.csv"
+    with open(only_tables_no_bom_path, mode='r', encoding='utf-8-sig') as source:
+        tables = list(read_csv(source=source))  # consume while the handle is still open
+    assert tables[0][1].name == "generic_inf"

From 42336b90b902ea8705c4b0ab4549eca1cc7e083c Mon Sep 17 00:00:00 2001
From: Jan Bielecki <jbiel@orsted.com>
Date: Fri, 29 Mar 2024 13:55:11 +0100
Subject: [PATCH 3/6] UnknownMethodWarning introduced

---
 pdtable/frame.py             |  7 ++++++-
 pdtable/test/test_pdtable.py | 22 ++++++++++++++--------
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/pdtable/frame.py b/pdtable/frame.py
index edf4632..69876b7 100644
--- a/pdtable/frame.py
+++ b/pdtable/frame.py
@@ -67,6 +67,10 @@ class InvalidTableCombineError(Exception):
     pass
 
 
+class UnknownMethodWarning(Warning):
+    """Warning category used when _combine_tables meets an unknown __finalize__ method."""
+
+
 def _combine_tables(
     obj: "TableDataFrame", other, method, **kwargs
 ) -> Optional[ComplementaryTableInfo]:
@@ -97,7 +101,8 @@ def _combine_tables(
         warnings.warn(
             f'While combining pdTable metadata an unknown __finalize__ method "{method}" was encountered. '
             f"Will try to propagate metadata with generic methods, but please check outcome of this "
-            f"and notify pdTable maintainers."
+            f"and notify pdTable maintainers.",
+            category=UnknownMethodWarning
         )
 
     data = [d for d in (getattr(s, _TABLE_INFO_FIELD_NAME, None) for s in src) if d is not None]
diff --git a/pdtable/test/test_pdtable.py b/pdtable/test/test_pdtable.py
index 319694c..1519008 100644
--- a/pdtable/test/test_pdtable.py
+++ b/pdtable/test/test_pdtable.py
@@ -380,11 +380,17 @@ def table_data_frame() -> frame.TableDataFrame:
     )
 
 
+def unknown_method_warnings(warnings_list: list[Warning]) -> list[frame.UnknownMethodWarning]:
+    return [
+        warning for warning in warnings_list
+        if issubclass(warning.category, frame.UnknownMethodWarning)
+    ]
+
 class TestFinalize:
     def test_replace_ok(self, table_data_frame: frame.TableDataFrame) -> None:
         with warnings.catch_warnings(record=True) as w:
             table_data_frame.replace('a', 'z')
-            assert len(w) == 0
+            assert len(unknown_method_warnings(warnings_list=w)) == 0
 
     def test_replace_not_allowed_unit(self, table_data_frame: frame.TableDataFrame) -> None:
         with pytest.raises(ColumnUnitException):
@@ -395,7 +401,7 @@ def test_sort_index_ok(self, table_data_frame: frame.TableDataFrame) -> None:
 
         with warnings.catch_warnings(record=True) as w:
             table_data_frame.sort_index()
-            assert len(w) == 0
+            assert len(unknown_method_warnings(warnings_list=w)) == 0
 
     def test_transpose_ok(self, table_data_frame: frame.TableDataFrame) -> None:
         """
@@ -411,7 +417,7 @@ def test_astype_ok(self, table_data_frame: frame.TableDataFrame) -> None:
         
         with warnings.catch_warnings(record=True) as w:
             table_data_frame_new_type = table_data_frame.astype({'B': float})
-            assert len(w) == 0
+            assert len(unknown_method_warnings(warnings_list=w)) == 0
 
         assert isinstance(table_data_frame_new_type['B'].iloc[0], np.float64)
 
@@ -429,7 +435,7 @@ def test_append_with_loc_ok(self, table_data_frame: frame.TableDataFrame) -> Non
         """
         with warnings.catch_warnings(record=True) as w:
             table_data_frame.loc[999] = {'A': 'y', 'B': 1, 'C': True}
-            assert len(w) == 0
+            assert len(unknown_method_warnings(warnings_list=w)) == 0
         
         assert 6 == table_data_frame.shape[0]
 
@@ -443,7 +449,7 @@ def test_fillna_ok(self, table_data_frame: frame.TableDataFrame) -> None:
         
         with warnings.catch_warnings(record=True) as w:
             table_data_frame_new_type.fillna(123)
-            assert len(w) == 0
+            assert len(unknown_method_warnings(warnings_list=w)) == 0
 
     @pytest.mark.skipif(
         sys.version_info < (3, 8),
@@ -466,7 +472,7 @@ def test_rename_columns(self, table_data_frame: frame.TableDataFrame) -> None:
     def test_rename_index(self, table_data_frame: frame.TableDataFrame) -> None:
         with warnings.catch_warnings(record=True) as w:
             table_data_frame.rename(index={1: 'a', 2: 'b'})
-            assert len(w) == 0
+            assert len(unknown_method_warnings(warnings_list=w)) == 0
 
     def test_unstack(self, table_data_frame: frame.TableDataFrame) -> None:
         """
@@ -483,7 +489,7 @@ def test_unstack(self, table_data_frame: frame.TableDataFrame) -> None:
         
         with warnings.catch_warnings(record=True) as w:
             unstacked_table_data_frame = table_data_frame.unstack()
-            assert len(w) == 0
+            assert len(unknown_method_warnings(warnings_list=w)) == 0
 
         unstacked_col_name_to_unit = {
             name: col.unit for name, col in object.__getattribute__(
@@ -510,7 +516,7 @@ def test_melt(self, table_data_frame: frame.TableDataFrame) -> None:
         """
         with warnings.catch_warnings(record=True) as w:
             melted_table_data_frame = table_data_frame.melt(id_vars=['A'], value_vars=['B', 'C'])
-            assert len(w) == 0
+            assert len(unknown_method_warnings(warnings_list=w)) == 0
         
         melted_col_name_to_unit = {
             name: col.unit for name, col in object.__getattribute__(

From 29f66ea396b6277c937beb4daa8814655409f864 Mon Sep 17 00:00:00 2001
From: Jan Bielecki <jbiel@orsted.com>
Date: Fri, 29 Mar 2024 13:58:00 +0100
Subject: [PATCH 4/6] use old fashion typing to be compatible with python3.7

---
 pdtable/test/test_pdtable.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/pdtable/test/test_pdtable.py b/pdtable/test/test_pdtable.py
index 1519008..b321e8b 100644
--- a/pdtable/test/test_pdtable.py
+++ b/pdtable/test/test_pdtable.py
@@ -1,5 +1,6 @@
 import sys
 from textwrap import dedent
+from typing import List
 import warnings
 
 import pandas as pd
@@ -380,7 +381,7 @@ def table_data_frame() -> frame.TableDataFrame:
     )
 
 
-def unknown_method_warnings(warnings_list: list[Warning]) -> list[frame.UnknownMethodWarning]:
+def _unknown_method_warnings(warnings_list: List[warnings.WarningMessage]) -> List[warnings.WarningMessage]:
     return [
         warning for warning in warnings_list
         if issubclass(warning.category, frame.UnknownMethodWarning)
@@ -390,7 +391,7 @@ class TestFinalize:
     def test_replace_ok(self, table_data_frame: frame.TableDataFrame) -> None:
         with warnings.catch_warnings(record=True) as w:
             table_data_frame.replace('a', 'z')
-            assert len(unknown_method_warnings(warnings_list=w)) == 0
+            assert len(_unknown_method_warnings(warnings_list=w)) == 0
 
     def test_replace_not_allowed_unit(self, table_data_frame: frame.TableDataFrame) -> None:
         with pytest.raises(ColumnUnitException):
@@ -401,7 +402,7 @@ def test_sort_index_ok(self, table_data_frame: frame.TableDataFrame) -> None:
 
         with warnings.catch_warnings(record=True) as w:
             table_data_frame.sort_index()
-            assert len(unknown_method_warnings(warnings_list=w)) == 0
+            assert len(_unknown_method_warnings(warnings_list=w)) == 0
 
     def test_transpose_ok(self, table_data_frame: frame.TableDataFrame) -> None:
         """
@@ -417,7 +418,7 @@ def test_astype_ok(self, table_data_frame: frame.TableDataFrame) -> None:
         
         with warnings.catch_warnings(record=True) as w:
             table_data_frame_new_type = table_data_frame.astype({'B': float})
-            assert len(unknown_method_warnings(warnings_list=w)) == 0
+            assert len(_unknown_method_warnings(warnings_list=w)) == 0
 
         assert isinstance(table_data_frame_new_type['B'].iloc[0], np.float64)
 
@@ -435,7 +436,7 @@ def test_append_with_loc_ok(self, table_data_frame: frame.TableDataFrame) -> Non
         """
         with warnings.catch_warnings(record=True) as w:
             table_data_frame.loc[999] = {'A': 'y', 'B': 1, 'C': True}
-            assert len(unknown_method_warnings(warnings_list=w)) == 0
+            assert len(_unknown_method_warnings(warnings_list=w)) == 0
         
         assert 6 == table_data_frame.shape[0]
 
@@ -449,7 +450,7 @@ def test_fillna_ok(self, table_data_frame: frame.TableDataFrame) -> None:
         
         with warnings.catch_warnings(record=True) as w:
             table_data_frame_new_type.fillna(123)
-            assert len(unknown_method_warnings(warnings_list=w)) == 0
+            assert len(_unknown_method_warnings(warnings_list=w)) == 0
 
     @pytest.mark.skipif(
         sys.version_info < (3, 8),
@@ -472,7 +473,7 @@ def test_rename_columns(self, table_data_frame: frame.TableDataFrame) -> None:
     def test_rename_index(self, table_data_frame: frame.TableDataFrame) -> None:
         with warnings.catch_warnings(record=True) as w:
             table_data_frame.rename(index={1: 'a', 2: 'b'})
-            assert len(unknown_method_warnings(warnings_list=w)) == 0
+            assert len(_unknown_method_warnings(warnings_list=w)) == 0
 
     def test_unstack(self, table_data_frame: frame.TableDataFrame) -> None:
         """
@@ -489,7 +490,7 @@ def test_unstack(self, table_data_frame: frame.TableDataFrame) -> None:
         
         with warnings.catch_warnings(record=True) as w:
             unstacked_table_data_frame = table_data_frame.unstack()
-            assert len(unknown_method_warnings(warnings_list=w)) == 0
+            assert len(_unknown_method_warnings(warnings_list=w)) == 0
 
         unstacked_col_name_to_unit = {
             name: col.unit for name, col in object.__getattribute__(
@@ -516,7 +517,7 @@ def test_melt(self, table_data_frame: frame.TableDataFrame) -> None:
         """
         with warnings.catch_warnings(record=True) as w:
             melted_table_data_frame = table_data_frame.melt(id_vars=['A'], value_vars=['B', 'C'])
-            assert len(unknown_method_warnings(warnings_list=w)) == 0
+            assert len(_unknown_method_warnings(warnings_list=w)) == 0
         
         melted_col_name_to_unit = {
             name: col.unit for name, col in object.__getattribute__(

From 2d5829228d585f85f9d685e35487682aecfe2c96 Mon Sep 17 00:00:00 2001
From: Jan Bielecki <jbiel@orsted.com>
Date: Tue, 2 Apr 2024 11:28:53 +0200
Subject: [PATCH 5/6] check_encoding: use truthiness instead of len() on the first cell

---
 pdtable/io/parsers/blocks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pdtable/io/parsers/blocks.py b/pdtable/io/parsers/blocks.py
index b72d07a..2998dae 100644
--- a/pdtable/io/parsers/blocks.py
+++ b/pdtable/io/parsers/blocks.py
@@ -65,7 +65,7 @@ def check_encoding(cell_rows: Iterable[Sequence]) -> Iterable[Sequence]:
 
     first_cell_row = next(cell_rows)
 
-    if first_cell_row is not None and len(first_cell_row) > 0 and len(first_cell_row[0]) > 0:
+    if first_cell_row and len(first_cell_row) > 0 and first_cell_row[0]:
         first_sign = first_cell_row[0][0]
 
         try:

From d5f00bd3c56ccc9aedfb73cc3d707656989ccc3a Mon Sep 17 00:00:00 2001
From: Jan Bielecki <jbiel@orsted.com>
Date: Tue, 2 Apr 2024 15:25:23 +0200
Subject: [PATCH 6/6] check_encoding: return early on empty input

---
 pdtable/io/parsers/blocks.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/pdtable/io/parsers/blocks.py b/pdtable/io/parsers/blocks.py
index 2998dae..70e32e0 100644
--- a/pdtable/io/parsers/blocks.py
+++ b/pdtable/io/parsers/blocks.py
@@ -62,11 +62,18 @@ def check_encoding(cell_rows: Iterable[Sequence]) -> Iterable[Sequence]:
     """
-    if isinstance(cell_rows, list):
-      cell_rows = iter(cell_rows)
+    # next() needs an iterator, but callers may pass any iterable (list, tuple, ...).
+    # iter() hands an existing iterator back unchanged, so no type check is needed.
+    cell_rows = iter(cell_rows)
 
-    first_cell_row = next(cell_rows)
+    try:
+        first_cell_row = next(cell_rows)
+    except StopIteration:
+        return  # generator is empty, do not yield anything
 
-    if first_cell_row and len(first_cell_row) > 0 and first_cell_row[0]:
-        first_sign = first_cell_row[0][0]
+    # Only a non-empty text cell can start with a BOM; None or numeric cells are skipped.
+    row_is_empty = first_cell_row is None or len(first_cell_row) == 0
+    first_cell = None if row_is_empty else first_cell_row[0]
+    if isinstance(first_cell, str) and first_cell:
+        first_sign = first_cell[0]
 
         try:
             first_sign.encode("ascii")