From 3b3024e67715d9c657f9d5f51fb3c3069b8aeeff Mon Sep 17 00:00:00 2001 From: Jamie Stooke Date: Fri, 7 Feb 2025 10:41:39 +0000 Subject: [PATCH 01/17] shifted to modern toml approach --- README.md | 12 +++++++++++- pyproject.toml | 24 ++++++++++++++++++++++++ setup.py | 26 -------------------------- 3 files changed, 35 insertions(+), 27 deletions(-) create mode 100644 pyproject.toml delete mode 100644 setup.py diff --git a/README.md b/README.md index 678a7c1..e9e8b1c 100644 --- a/README.md +++ b/README.md @@ -6,9 +6,19 @@ Use pip: `pip install access-parser` Or install manually: ```bash +# Clone the repository git clone https://github.com/ClarotyICS/access_parser.git cd access_parser -python3 setup.py install + +# Create a virtual environment (recommended) +python3 -m venv .venv # On Windows use: py -m venv .venv +source .venv/bin/activate # On Windows use: .venv\Scripts\activate + +# Install using pip (modern approach) +pip install . + +# Verify installation +python -c "import access_parser; print('Installed successfully')" ``` # Demo diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..c2f43ff --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,24 @@ +[project] +name = "access_parser" +version = "0.0.6" +description = "Access database (*.mdb, *.accdb) parser" +readme = "README.md" +authors = [{ name = "Uri Katz", email = "uri.k@claroty.com" }] +license = { text = "Apache Software License" } +requires-python = ">=3.6" +dependencies = [ + "construct", + "tabulate" +] +classifiers = [ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent" +] + +[project.urls] +Homepage = "https://github.com/ClarotyICS/access_parser" + +[build-system] +requires = ["setuptools", "wheel"] +build-backend = "setuptools.build_meta" diff --git a/setup.py b/setup.py deleted file mode 100644 index f5c0879..0000000 --- a/setup.py +++ /dev/null @@ -1,26 +0,0 @@ -import setuptools - -with open("README.md", "r") as f: - long_description = f.read() - -setuptools.setup( - name="access_parser", - version="0.0.6", - author="Uri Katz", - author_email="uri.k@claroty.com", - description="Access database (*.mdb, *.accdb) parser", - long_description=long_description, - long_description_content_type="text/markdown", - url="https://github.com/ClarotyICS/access_parser", - packages=setuptools.find_packages(), - classifiers=[ - "Programming Language :: Python :: 3", - "License :: OSI Approved :: Apache Software License", - "Operating System :: OS Independent", - ], - python_requires='>=3.6', - install_requires=[ - 'construct', - 'tabulate', - ], -) From a491c7385b64fd7e563eaecb700e4993ec862548 Mon Sep 17 00:00:00 2001 From: Jamie Stooke Date: Fri, 7 Feb 2025 10:56:40 +0000 Subject: [PATCH 02/17] overhauled text decoding to handle unicode compression, and version based charsets --- access_parser/utils.py | 110 +++++++++++++++++++++++++++++++++-------- 1 file changed, 89 insertions(+), 21 deletions(-) diff --git a/access_parser/utils.py b/access_parser/utils.py index 0153be2..ee15030 100644 --- a/access_parser/utils.py +++ b/access_parser/utils.py @@ -55,6 +55,12 @@ FORMAT_SCIENTIFIC: SCIENTIFIC_DEFAULT } +# Character Encodings for Different Jet Versions +ENCODING_MAP = { + 3: "cp1252", # Jet 3.x (Access 97 and earlier) + 4: "utf-16-le" # Jet 4.x+ (Access 2000 and newer) +} +TEXT_COMPRESSION_HEADER = b'\xff\xfe' # https://stackoverflow.com/questions/45560782 def mdb_date_to_readable(double_time): @@ -86,16 +92,90 @@ def 
numeric_to_string(bytes_num, scale=6): numeric_string += full_number return numeric_string +###Text type decoding functions +def decodeTextValue(data: bytes, version: int): + """Decodes a compressed or uncompressed text value.""" + + # Jet 3 does not support Unicode compression; decode directly + if version == 3: + return decodeUncompressedText(data, 0, len(data), version) + + # Check for Unicode compression header (Jet 4+ only) + isCompressed = len(data) > 1 and data.startswith(TEXT_COMPRESSION_HEADER) + + if isCompressed: + textBuf = '' + dataStart = len(TEXT_COMPRESSION_HEADER) + dataEnd = dataStart + inCompressedMode = True + + # Process each segment in the compressed data + while dataEnd < len(data): + if data[dataEnd:dataEnd+1] == b'\x00': # End of segment + # Decode the current segment and toggle compression mode + textBuf += decodeTextSegment(data, dataStart, dataEnd, inCompressedMode, version) + inCompressedMode = not inCompressedMode + dataStart = dataEnd + 1 + dataEnd += 1 + + # Handle the last segment + textBuf += decodeTextSegment(data, dataStart, dataEnd, inCompressedMode, version) + return textBuf + + return decodeUncompressedText(data, 0, len(data), version) + + +def decodeTextSegment(data: bytes, dataStart: int, dataEnd: int, inCompressedMode: bool,version: int): + """ + Decodes a segment of a text value into the given buffer according to the + given status of the segment (compressed/uncompressed). + """ + if dataEnd <= dataStart: + return '' # No data in the segment + + if inCompressedMode: + # Extract the relevant segment. + segment = data[dataStart:dataEnd] + # Create a new bytearray twice as long as the segment. + expanded = bytearray(len(segment) * 2) + # Using slice assignment: assign the original bytes to every even index. + # The odd indices will remain 0, which is exactly the padding needed. + expanded[::2] = segment + # Convert the bytearray back to an immutable bytes object. + data = bytes(expanded) + dataStart = 0 + dataEnd = len(data) -def get_decoded_text(bytes_data): + return decodeUncompressedText(data, dataStart, dataEnd, version) + + +def decodeUncompressedText(textBytes: bytes, dataStart: int, dataEnd: int, version: int, strict: bool = False) -> str: + """ + Decodes uncompressed text based on database version. + + :param textBytes: The raw bytes of text. + :param dataStart: Start index of the text segment. + :param dataEnd: End index of the text segment. + :param version: The database version to determine encoding. + :param strict: Whether to raise an error on decoding failure. If False, + decoding errors are logged and replacement characters are used. + :return: Decoded text string. + """ + encoding = ENCODING_MAP.get(version, "utf-16-le") # Default to utf-16-le for unknown versions + bytesToDecode = textBytes[dataStart:dataEnd] + try: - decoded = bytes_data.decode('utf-8') - except UnicodeDecodeError: - try: - decoded = bytes_data.decode('latin1') - except UnicodeDecodeError: - decoded = bytes_data.decode('utf-8', errors='ignore') - return decoded + return bytesToDecode.decode(encoding) + except UnicodeDecodeError as e: + message = (f"Decoding error: Data could not be decoded using {encoding}. 
" + f"Possible corruption or unexpected encoding in the data segment " + f"from {dataStart} to {dataEnd}.") + if strict: + raise ValueError(message) from e + else: + LOGGER.warning(message) + # Return a best-effort result using replacement characters for undecodable bytes + return bytesToDecode.decode(encoding, errors="replace") def parse_money_type(parsed, prop_format): @@ -178,19 +258,7 @@ def parse_type(data_type, buffer, length=None, version=3, props=None): elif data_type == TYPE_96_bit_17_BYTES: parsed = buffer[:17] elif data_type == TYPE_TEXT: - if version > 3: - # Looks like if BOM is present text is already decoded - if buffer.startswith(b"\xfe\xff") or buffer.startswith(b"\xff\xfe"): - buff = buffer[2:] - parsed = get_decoded_text(buff) - else: - parsed = buffer.decode("utf-16", errors='ignore') - else: - parsed = get_decoded_text(buffer) - - if "\x00" in parsed: - LOGGER.debug(f"Parsed string contains NUL (0x00) characters: {parsed}") - parsed = parsed.replace("\x00", "") + parsed = decodeTextValue(buffer,version) else: LOGGER.debug(f"parse_type - unsupported data type: {data_type}") return parsed From 1c93e09c55454d08be8e88918dfb9c6395efb027 Mon Sep 17 00:00:00 2001 From: Jamie Stooke Date: Fri, 7 Feb 2025 11:17:13 +0000 Subject: [PATCH 03/17] Fixed handling of dates before access_epoch the value is signed so changed from Q to q --- access_parser/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/access_parser/utils.py b/access_parser/utils.py index ee15030..92d9388 100644 --- a/access_parser/utils.py +++ b/access_parser/utils.py @@ -65,7 +65,7 @@ # https://stackoverflow.com/questions/45560782 def mdb_date_to_readable(double_time): try: - dtime_bytes = struct.pack("Q", double_time) + dtime_bytes = struct.pack("q", double_time) dtime_double = struct.unpack(' Date: Fri, 7 Feb 2025 11:18:17 +0000 Subject: [PATCH 04/17] fixed small number to text padding. --- access_parser/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/access_parser/utils.py b/access_parser/utils.py index 92d9388..f86f26b 100644 --- a/access_parser/utils.py +++ b/access_parser/utils.py @@ -88,6 +88,9 @@ def numeric_to_string(bytes_num, scale=6): if len(full_number) > scale: dot_len = len(full_number) - scale full_number = full_number[:dot_len] + "." + full_number[dot_len:] + # if number is smaller than scale then pad the number with relevant leading zeros. + if len(full_number) <= scale: + full_number = '0.' 
+ ('0'*scale + full_number)[-scale:] numeric_string = "-" if neg else "" numeric_string += full_number return numeric_string From fb1326c7c9a19f36cde6caa32ea7205ddcddbf46 Mon Sep 17 00:00:00 2001 From: Jamie Stooke Date: Fri, 7 Feb 2025 11:24:53 +0000 Subject: [PATCH 05/17] minor formatting standardisation --- access_parser/access_parser.py | 4 ++-- access_parser/utils.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/access_parser/access_parser.py b/access_parser/access_parser.py index 2cdfc09..487eecd 100644 --- a/access_parser/access_parser.py +++ b/access_parser/access_parser.py @@ -8,7 +8,7 @@ from .parsing_primitives import parse_relative_object_metadata_struct, parse_table_head, parse_data_page_header, \ ACCESSHEADER, MEMO, parse_table_data, TDEF_HEADER, LVPROP from .utils import categorize_pages, parse_type, TYPE_MEMO, TYPE_TEXT, TYPE_BOOLEAN, read_db_file, numeric_to_string, \ - TYPE_96_bit_17_BYTES, TYPE_OLE + TYPE_96_BIT_17_BYTES, TYPE_OLE # Page sizes PAGE_SIZE_V3 = 0x800 @@ -435,7 +435,7 @@ def _parse_dynamic_length_data(self, original_record, relative_record_metadata, except ConstructError: LOGGER.warning("Failed to parse OLE field. Using data as bytes") parsed_type = relative_obj_data - elif column.type == TYPE_96_bit_17_BYTES: + elif column.type == TYPE_96_BIT_17_BYTES: if len(relative_obj_data) != 17: LOGGER.warning(f"Relative numeric field has invalid length {len(relative_obj_data)}, expected 17") parsed_type = relative_obj_data diff --git a/access_parser/utils.py b/access_parser/utils.py index f86f26b..2c48a4a 100644 --- a/access_parser/utils.py +++ b/access_parser/utils.py @@ -21,7 +21,7 @@ TYPE_OLE = 11 TYPE_MEMO = 12 TYPE_GUID = 15 -TYPE_96_bit_17_BYTES = 16 +TYPE_96_BIT_17_BYTES = 16 TYPE_COMPLEX = 18 TABLE_PAGE_MAGIC = b"\x02\x01" @@ -258,7 +258,7 @@ def parse_type(data_type, buffer, length=None, version=3, props=None): parsed = buffer[:16] guid = uuid.UUID(parsed.hex()) parsed = str(guid) - elif data_type == TYPE_96_bit_17_BYTES: + elif data_type == TYPE_96_BIT_17_BYTES: parsed = buffer[:17] elif data_type == TYPE_TEXT: parsed = decodeTextValue(buffer,version) From 7ffc86bca58a9336d9ef4e8eb187e6912d9fbc47 Mon Sep 17 00:00:00 2001 From: Jamie Stooke Date: Fri, 7 Feb 2025 12:21:01 +0000 Subject: [PATCH 06/17] add ability to pass db object as bytes instead of reading from file location. --- access_parser/access_parser.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/access_parser/access_parser.py b/access_parser/access_parser.py index 487eecd..6e251fb 100644 --- a/access_parser/access_parser.py +++ b/access_parser/access_parser.py @@ -37,7 +37,10 @@ def __init__(self, offset, val): class AccessParser(object): def __init__(self, db_path): - self.db_data = read_db_file(db_path) + if isinstance(db_path, bytes): # allow to pass bytes object e.g. downloaded from cloud storage + self.db_data = db_path + else: + self.db_data = read_db_file(db_path) self._parse_file_header(self.db_data) self._table_defs, self._data_pages, self._all_pages = categorize_pages(self.db_data, self.page_size) self._tables_with_data = self._link_tables_to_data() From e319e64d1d0ed40f3d1de0c8781ac0825f7a5fa4 Mon Sep 17 00:00:00 2001 From: Jamie Stooke Date: Fri, 7 Feb 2025 12:22:28 +0000 Subject: [PATCH 07/17] the functions that could call this, expect an array rather than an empty string. 
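
For context, a minimal sketch of the shape callers expect from a parsed table (the column names and data here are illustrative only, not the real schema): every column maps to a list of row values, so an empty table should map each column to an empty list rather than an empty string.

```python
# Illustrative only: the shape a parsed table is expected to have.
empty_table = {"ID": [], "Name": []}                   # no records
populated_table = {"ID": [1, 2], "Name": ["a", "b"]}   # with records

# Consumers typically iterate the per-column value lists; an empty list keeps
# this loop valid, whereas "" would iterate over individual characters.
for column, values in empty_table.items():
    for row_index, value in enumerate(values):
        print(column, row_index, value)
```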
--- access_parser/access_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/access_parser/access_parser.py b/access_parser/access_parser.py index 6e251fb..3561dfe 100644 --- a/access_parser/access_parser.py +++ b/access_parser/access_parser.py @@ -217,7 +217,7 @@ def create_empty_table(self): parsed_table = defaultdict(list) columns, *_ = self._get_table_columns() for i, column in columns.items(): - parsed_table[column.col_name_str] = "" + parsed_table[column.col_name_str] = [] #changed to blank array to align to expected type if data was present. return parsed_table def parse(self): From dd1c94fc7c3f81b7de41f7494fe6599627b52d2b Mon Sep 17 00:00:00 2001 From: Jamie Stooke Date: Fri, 7 Feb 2025 12:45:59 +0000 Subject: [PATCH 08/17] Table page usage map parsing. If the db is acted on to delete records it can lead to out of date "table_linked_pages", There is a separate pointer in the header for each table to a page usage map. These usage maps need to be parsed to identify all current related data/free space pages various tweaks required to ensure the page lists are availble to the relevant functions. It is these owned pages that must be parsed to ensure a full result is generated. also the check at the end of the get_overflow_record was stopping the real overflwo record ends being identified returning the rest of the page rather than just the end of the record. if updated to identify end correctly. --- access_parser/access_parser.py | 88 ++++++++++++++++++++++++++--- access_parser/parsing_primitives.py | 28 ++++++++- 2 files changed, 106 insertions(+), 10 deletions(-) diff --git a/access_parser/access_parser.py b/access_parser/access_parser.py index 3561dfe..3bdb60f 100644 --- a/access_parser/access_parser.py +++ b/access_parser/access_parser.py @@ -6,7 +6,7 @@ from tabulate import tabulate from .parsing_primitives import parse_relative_object_metadata_struct, parse_table_head, parse_data_page_header, \ - ACCESSHEADER, MEMO, parse_table_data, TDEF_HEADER, LVPROP + ACCESSHEADER, MEMO, parse_table_data, TDEF_HEADER, LVPROP, parse_buffer_custom from .utils import categorize_pages, parse_type, TYPE_MEMO, TYPE_TEXT, TYPE_BOOLEAN, read_db_file, numeric_to_string, \ TYPE_96_BIT_17_BYTES, TYPE_OLE @@ -33,6 +33,8 @@ def __init__(self, offset, val): self.value = val self.offset = offset self.linked_pages = [] + self.owned_pages = [] + self.free_space_pages = [] class AccessParser(object): @@ -113,7 +115,7 @@ def _parse_catalog(self): :return: dict {table : offset} """ catalog_page = self._tables_with_data[2 * self.page_size] - access_table = AccessTable(catalog_page, self.version, self.page_size, self._data_pages, self._table_defs) + access_table = AccessTable(catalog_page, self.version, self.page_size, self._data_pages, self._table_defs, self._all_pages) catalog = access_table.parse() tables_mapping = {} for i, table_name in enumerate(catalog['Name']): @@ -151,7 +153,7 @@ def get_table(self, table_name): if table_name != "MSysObjects" and table_name in self.extra_props: props = self.extra_props[table_name] - return AccessTable(table, self.version, self.page_size, self._data_pages, self._table_defs, props) + return AccessTable(table, self.version, self.page_size, self._data_pages, self._table_defs, self._all_pages, props) def parse_lvprop(self, lvprop_raw): try: @@ -203,12 +205,13 @@ def print_database(self): class AccessTable(object): - def __init__(self, table, version, page_size, data_pages, table_defs, props=None): + def __init__(self, table, version, page_size, 
data_pages, table_defs, all_pages, props=None): self.version = version self.props = props self.page_size = page_size self._data_pages = data_pages self._table_defs = table_defs + self._all_pages = all_pages self.table = table self.parsed_table = defaultdict(list) self.columns, self.primary_keys, self.table_header = self._get_table_columns() @@ -226,9 +229,9 @@ def parse(self): data page to rows(records) and parse each record. :return defaultdict(list) with the parsed data -- table[column][row_index] """ - if not self.table.linked_pages: + if not self.table.owned_pages: return self.create_empty_table() - for data_chunk in self.table.linked_pages: + for data_chunk in self.table.owned_pages: original_data = data_chunk parsed_data = parse_data_page_header(original_data, version=self.version) @@ -450,6 +453,68 @@ def _parse_dynamic_length_data(self, original_record, relative_record_metadata, parsed_type = parse_type(column.type, relative_obj_data, len(relative_obj_data), version=self.version) self.parsed_table[col_name].append(parsed_type) + + def _get_usage_map(self,page_num,row_num): + + ##Need to define a version config + OFFSET_ROW_START = 10 if self.version == 3 else 14 + SIZE_ROW_LOCATION = 2 + OFFSET_MASK = 0x1FFF + OFFSET_USAGE_MAP_START = 5 + INVALID_PAGE_NUMBER = -1 + + #get page containing usage map info + table_buffer = self._data_pages[page_num*self.page_size] + + #prepare offsets to pick relevant info from table buffer + row_start_offset = OFFSET_ROW_START + (SIZE_ROW_LOCATION * row_num) + row_end_offset = OFFSET_ROW_START + (SIZE_ROW_LOCATION * (row_num - 1)) + + #find row start + row_start = parse_buffer_custom(table_buffer,row_start_offset,'Int16ul') & OFFSET_MASK + + #find row end + row_end = self.page_size if row_num == 0 else parse_buffer_custom(table_buffer,row_end_offset,'Int16ul') & OFFSET_MASK + + #limit buffer + table_buffer = table_buffer[:row_end] + + #map type + map_type = parse_buffer_custom(table_buffer,row_start,'Int8ul') + + #offset start + um_start_offset = row_start + OFFSET_USAGE_MAP_START + + ##inline handler processing + + max_inline_pages = (row_end - um_start_offset) * 8 + start_page = parse_buffer_custom(table_buffer,row_start+1,'Int32ul') + end_page = start_page + max_inline_pages + + ##process page array + filtered_buffer = table_buffer[um_start_offset:] + filtered_buffer_size = len(filtered_buffer) + page_numbers = [] + byteCount = 0 + + while byteCount < filtered_buffer_size: + b = filtered_buffer[byteCount:byteCount+1] + if b != b'\x00': + for i in range(8): + if ((int.from_bytes(b,'big') & (1 << i)) != 0): + pageNumberOffset = (byteCount * 8 + i) + pageNumber = (start_page + pageNumberOffset) if (pageNumberOffset >= 0) else INVALID_PAGE_NUMBER + if pageNumber < start_page or pageNumber > end_page: + #invalid page number + break + page_numbers.append(pageNumber) + byteCount += 1 + + return page_numbers + + + + def _get_table_columns(self): """ Parse columns for a specific table @@ -468,7 +533,15 @@ def _get_table_columns(self): version=self.version, ) + + #add usage maps from table referenced by table head + #The catalog level linked pages array can be out of date following deletes. so use table header info to find accurate usage maps. 
+ self.table.owned_pages = [self._all_pages[pn * self.page_size] for pn in self._get_usage_map(table_header.row_page_map_page_number,table_header.row_page_map_row_number)] + self.table.free_space_pages = [self._all_pages[pn * self.page_size] for pn in self._get_usage_map(table_header.free_space_page_map_page_number,table_header.free_space_page_map_row_number)] + + # Merge Data back to table_header + table_header['index'] = parsed_data['real_index'] table_header['column'] = parsed_data['column'] table_header['column_names'] = parsed_data['column_names'] table_header['real_index_2'] = parsed_data['real_index_2'] @@ -588,7 +661,8 @@ def _get_overflow_record(self, record_pointer): record = record_page[start:] else: end = parsed_data.record_offsets[record_offset - 1] - if end & 0x8000 and (end & 0xff != 0): + + if end & 0x8000:# and (end & 0xff != 0): ##last byte check removed. stops valid end offsets from being parsed. end = end & 0xfff record = record_page[start: end] return record diff --git a/access_parser/parsing_primitives.py b/access_parser/parsing_primitives.py index 9feb9ac..1c31a9b 100644 --- a/access_parser/parsing_primitives.py +++ b/access_parser/parsing_primitives.py @@ -1,5 +1,5 @@ from construct import * - +from construct import Int32ul,Int16ul,Int8ul,Int16ub,Int8ub,Int24ul,Int32sl #explicit imports to help intellisense def version_specific(version, v3_subcon, v4_subcon): """ @@ -125,8 +125,12 @@ def parse_table_head(buffer, version=3): "column_count" / Int16ul, "index_count" / Int32ul, "real_index_count" / Int32ul, - "row_page_map" / Int32ul, - "free_space_page_map" / Int32ul, + "row_page_map_row_number" / Int8ul, + "row_page_map_page_number" / Int24ul, + #"row_page_map" / Int32ul, + "free_space_page_map_row_number" / Int8ul, + "free_space_page_map_page_number" / Int24ul, + #"free_space_page_map" / Int32ul, "tdef_header_end" / Tell).parse(buffer) @@ -259,3 +263,21 @@ def parse_relative_object_metadata_struct(buffer, variable_jump_tables_cnt=0, ve Int16ub)), "var_len_count" / version_specific(version, Int8ub, Int16ub), "relative_metadata_end" / Tell).parse(buffer) + +#helper unpacking function +def parse_buffer_custom(buffer,position,type): + '''Custom function to parse buffers using differet construct types + ------ + Jackcess Mapping for type variable + - get = 'Int8ul' + - getShort = 'Int16ul' + - getInt = 'Int32ul' + - get3ByteInt = 'Int24ul' + #Will add to this list as they come up + ''' + type = globals()[type] + + parser = Struct("value" / type) + buffer = buffer[position:] + result = parser.parse(buffer) + return result.value \ No newline at end of file From 29be08a288df7d1e6e4e188c894bf6882a6de40f Mon Sep 17 00:00:00 2001 From: Jamie Stooke Date: Fri, 7 Feb 2025 12:52:09 +0000 Subject: [PATCH 09/17] Record field counts, and null table manamagent tweaks. it's possible for null tables, records, and table headers to dissagree on column count. This can be down to columns being added to a table with existing data but where the records themselves are not updated. It can be down to a column being deleted from the table, in these cases there is a still a reference on the records that were present before the removal. etc. Needed to confirm confirm field count from record unpack instead of relying on header. Needed to tweak how has_value is set for null table column count mismatches. 
--- access_parser/access_parser.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/access_parser/access_parser.py b/access_parser/access_parser.py index 3bdb60f..5ce89b6 100644 --- a/access_parser/access_parser.py +++ b/access_parser/access_parser.py @@ -272,21 +272,23 @@ def _parse_row(self, record): """ original_record = record reverse_record = record[::-1] + + if self.version > 3: + field_count = struct.unpack_from("h", record)[0] + record = record[2:] + else: + field_count = struct.unpack_from("b", record)[0] + record = record[1:] # Records contain null bitmaps for columns. The number of bitmaps is the number of columns / 8 rounded up - null_table_len = (self.table_header.column_count + 7) // 8 + + null_table_len = (field_count + 7) // 8 if null_table_len and null_table_len < len(original_record): null_table = record[-null_table_len:] # Turn bitmap to a list of True False values null_table = [((null_table[i // 8]) & (1 << (i % 8))) != 0 for i in range(len(null_table) * 8)] else: - LOGGER.error(f"Failed to parse null table column count {self.table_header.column_count}") + LOGGER.error(f"Failed to parse null table column count {field_count}") return - if self.version > 3: - field_count = struct.unpack_from("h", record)[0] - record = record[2:] - else: - field_count = struct.unpack_from("b", record)[0] - record = record[1:] relative_records_column_map = {} # Iterate columns @@ -319,7 +321,8 @@ def _parse_fixed_length_data(self, original_record, column, null_table): # The only exception is BOOL fields which are encoded in the null table has_value = True if column.column_id > len(null_table): - LOGGER.warning("Invalid null table. Bool values may be wrong, deleted values may be shown in the db.") + #new column added after row creation, not covered by null mask, in this case has_value = false + has_value = False if column.type == TYPE_BOOLEAN: has_value = None else: @@ -405,7 +408,8 @@ def _parse_dynamic_length_data(self, original_record, relative_record_metadata, col_name = column.col_name_str has_value = True if column.column_id > len(null_table): - LOGGER.warning("Invalid null table. null values may be shown in the db.") + #New column with no data so map to false + has_value = False else: has_value = null_table[column.column_id] if not has_value: From cc5b3d6461c0f8eba85a9e414e18d0a8668d9639 Mon Sep 17 00:00:00 2001 From: Jamie Stooke Date: Fri, 7 Feb 2025 12:54:21 +0000 Subject: [PATCH 10/17] variable column location fix for tables that are changed a lot, the variable column location doesnt always match the index location in he column_map. the parsed column metadata does include the variable_column_number to be used though. 
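
Roughly, the offset lookup now works like the sketch below (the helper name and arguments are illustrative, not the parser's API):

```python
def var_column_bounds(relative_offsets, var_len_count, variable_column_number):
    """Pick the data slice for a variable-length column by its
    variable_column_number rather than its position in the column map."""
    start = relative_offsets[variable_column_number]
    if variable_column_number + 1 == len(relative_offsets):
        end = var_len_count                 # last slot runs to the end marker
    else:
        end = relative_offsets[variable_column_number + 1]
    return start, end                       # start == end means an empty slot
```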
--- access_parser/access_parser.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/access_parser/access_parser.py b/access_parser/access_parser.py index 5ce89b6..004d08a 100644 --- a/access_parser/access_parser.py +++ b/access_parser/access_parser.py @@ -417,14 +417,14 @@ def _parse_dynamic_length_data(self, original_record, relative_record_metadata, continue if self.version == 3: - if i in relative_record_metadata.variable_length_jump_table: + if column.variable_column_number in relative_record_metadata.variable_length_jump_table: jump_table_addition += 0x100 - rel_start = relative_offsets[i] + rel_start = relative_offsets[column.variable_column_number] # If this is the last one use var_len_count as end offset - if i + 1 == len(relative_offsets): + if column.variable_column_number + 1 == len(relative_offsets): rel_end = relative_record_metadata.var_len_count else: - rel_end = relative_offsets[i + 1] + rel_end = relative_offsets[column.variable_column_number + 1] # if rel_start and rel_end are the same there is no data in this slot if rel_start == rel_end: From 9eb3885a5e63be0855a4a61c2e6431920d487340 Mon Sep 17 00:00:00 2001 From: Jamie Stooke Date: Fri, 7 Feb 2025 12:57:10 +0000 Subject: [PATCH 11/17] column output order fix. Because fixed and variable columns are parsed in groups. the final output table matched the internal storage order rather than the presented order when viewed in access. minor change added to match expected field order. --- access_parser/access_parser.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/access_parser/access_parser.py b/access_parser/access_parser.py index 004d08a..1fbe4e6 100644 --- a/access_parser/access_parser.py +++ b/access_parser/access_parser.py @@ -1,6 +1,6 @@ import logging import struct -from collections import defaultdict +from collections import defaultdict, OrderedDict from construct import ConstructError from tabulate import tabulate @@ -262,6 +262,11 @@ def parse(self): last_offset = rec_offset if record: self._parse_row(record) + + ## fix final output order + columns_sorted = OrderedDict(sorted(self.columns.items())) + reordered_parsed_table = OrderedDict([(column.col_name_str,self.parsed_table[column.col_name_str]) for i, column in columns_sorted.items()]) + self.parsed_table = reordered_parsed_table return self.parsed_table def _parse_row(self, record): From fc151fae0d035c3b0b4f3717e8adba5d0ee3c07a Mon Sep 17 00:00:00 2001 From: Jamie Stooke Date: Fri, 7 Feb 2025 12:59:26 +0000 Subject: [PATCH 12/17] parsing empty tables following delete because of historical delete references, the parse function would still run against an empty table, to avoid column ordering issues, call create_empty_table instead. 
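
From a caller's point of view the intended behaviour is roughly this (table and column data are made up for illustration, and the parse_table() entry point is assumed):

```python
# A table whose records have all been deleted still lists every column,
# each mapped to an empty list, in the expected column order.
table = db.parse_table("Orders")
assert all(rows == [] for rows in table.values())   # columns present, no rows
```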
--- access_parser/access_parser.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/access_parser/access_parser.py b/access_parser/access_parser.py index 1fbe4e6..180f753 100644 --- a/access_parser/access_parser.py +++ b/access_parser/access_parser.py @@ -262,7 +262,10 @@ def parse(self): last_offset = rec_offset if record: self._parse_row(record) - + + if len(self.parsed_table) == 0: ##All records deleted + return self.create_empty_table() + ## fix final output order columns_sorted = OrderedDict(sorted(self.columns.items())) reordered_parsed_table = OrderedDict([(column.col_name_str,self.parsed_table[column.col_name_str]) for i, column in columns_sorted.items()]) From 5f074636c880ab5dc227fd38ca82272fef15c3f5 Mon Sep 17 00:00:00 2001 From: Jamie Stooke Date: Tue, 13 May 2025 09:36:09 +0100 Subject: [PATCH 13/17] Introduction of global version constants mapping. expanded usage map functionality to handle reference maps streamlined db header parsing outstanding issues with example db for LVAL type 2. --- access_parser/access_parser.py | 322 +++++++++++++--- access_parser/jetformat.py | 671 +++++++++++++++++++++++++++++++++ access_parser/utils.py | 9 +- 3 files changed, 936 insertions(+), 66 deletions(-) create mode 100644 access_parser/jetformat.py diff --git a/access_parser/access_parser.py b/access_parser/access_parser.py index 180f753..8dec102 100644 --- a/access_parser/access_parser.py +++ b/access_parser/access_parser.py @@ -8,7 +8,8 @@ from .parsing_primitives import parse_relative_object_metadata_struct, parse_table_head, parse_data_page_header, \ ACCESSHEADER, MEMO, parse_table_data, TDEF_HEADER, LVPROP, parse_buffer_custom from .utils import categorize_pages, parse_type, TYPE_MEMO, TYPE_TEXT, TYPE_BOOLEAN, read_db_file, numeric_to_string, \ - TYPE_96_BIT_17_BYTES, TYPE_OLE + TYPE_96_BIT_17_BYTES, TYPE_OLE, SimpleVarLenMetadata +from .jetformat import BaseFormat, Jet3Format, PageTypes # Page sizes PAGE_SIZE_V3 = 0x800 @@ -25,6 +26,12 @@ SYSTEM_TABLE_FLAGS = [-0x80000000, -0x00000002, 0x80000000, 0x00000002] +# top‐2‐bit mask and length mask (30 bits) +LONG_VALUE_TYPE_MASK = 0xC0000000 +LONG_VALUE_LENGTH_MASK = ~LONG_VALUE_TYPE_MASK & 0xFFFFFFFF + +MAX_BYTE = 256 + LOGGER = logging.getLogger("access_parser") @@ -60,32 +67,28 @@ def parse_msys_table(self): msys_table['LvProp']) if value} return table_to_lval_memo - def _parse_file_header(self, db_data): + def _parse_file_header(self, db_data: bytes) -> None: """ - Parse the basic file header and determine the Access DB version based on the parsing results. - :param db_data: db file data + Inspect the first HEADER_LENGTH bytes of db_data, + detect the correct Jet/ACE format, and set: + - self.version : full format helper + - self.page_size : pulled from the format """ + # grab exactly the bytes we need + header_buf = db_data[:BaseFormat.HEADER_LENGTH] + + # 1) figure out which Format subclass applies try: - head = ACCESSHEADER.parse(db_data) - except ConstructError: - # This is a very minimal parsing of the header. If we fail this probable is not a valid mdb file - raise ValueError("Failed to parse DB file header. 
Check it is a valid access database") - version = head.jet_version - if version in NEW_VERSIONS: - if version == VERSION_4: - self.version = ALL_VERSIONS[VERSION_4] - elif version == VERSION_5: - self.version = ALL_VERSIONS[VERSION_5] - elif version == VERSION_2010: - self.version = ALL_VERSIONS[VERSION_2010] - self.page_size = PAGE_SIZE_V4 + fmt = BaseFormat.get_format_from_header(header_buf) + except ValueError as ve: + LOGGER.error(f"{ve}; defaulting to Jet3Format") + fmt = Jet3Format() - else: - if not version == VERSION_3: - LOGGER.error(f"Unknown database version {version} Trying to parse database as version 3") - self.version = ALL_VERSIONS[VERSION_3] - self.page_size = PAGE_SIZE_V3 - LOGGER.info(f"DataBase version {version}") + # 2) stash it on self for everything else to use + self.version = fmt + self.page_size = fmt.page_size + + LOGGER.info(f"Detected Access format: Jet{self.version}, page_size={self.page_size}") def _link_tables_to_data(self): """ @@ -469,18 +472,17 @@ def _parse_dynamic_length_data(self, original_record, relative_record_metadata, def _get_usage_map(self,page_num,row_num): ##Need to define a version config - OFFSET_ROW_START = 10 if self.version == 3 else 14 - SIZE_ROW_LOCATION = 2 OFFSET_MASK = 0x1FFF - OFFSET_USAGE_MAP_START = 5 INVALID_PAGE_NUMBER = -1 + MAP_TYPE_INLINE = 0 + MAP_TYPE_REFERENCE = 1 #get page containing usage map info - table_buffer = self._data_pages[page_num*self.page_size] + table_buffer = self._all_pages[page_num*self.page_size] #prepare offsets to pick relevant info from table buffer - row_start_offset = OFFSET_ROW_START + (SIZE_ROW_LOCATION * row_num) - row_end_offset = OFFSET_ROW_START + (SIZE_ROW_LOCATION * (row_num - 1)) + row_start_offset = self.version.OFFSET_ROW_START + (self.version.SIZE_ROW_LOCATION * row_num) + row_end_offset = self.version.OFFSET_ROW_START + (self.version.SIZE_ROW_LOCATION * (row_num - 1)) #find row start row_start = parse_buffer_custom(table_buffer,row_start_offset,'Int16ul') & OFFSET_MASK @@ -495,36 +497,92 @@ def _get_usage_map(self,page_num,row_num): map_type = parse_buffer_custom(table_buffer,row_start,'Int8ul') #offset start - um_start_offset = row_start + OFFSET_USAGE_MAP_START - - ##inline handler processing - - max_inline_pages = (row_end - um_start_offset) * 8 - start_page = parse_buffer_custom(table_buffer,row_start+1,'Int32ul') - end_page = start_page + max_inline_pages - - ##process page array - filtered_buffer = table_buffer[um_start_offset:] - filtered_buffer_size = len(filtered_buffer) - page_numbers = [] - byteCount = 0 + um_start_offset = row_start + self.version.OFFSET_USAGE_MAP_START + + if map_type == MAP_TYPE_INLINE: + ## Usage map whose map is written inline in the same page. For Jet4, this + ## type of map can usually contains a maximum of 512 pages. Free space maps + ## are always inline, used space maps may be inline or reference. It has a + ## start page, which all page numbers in its map are calculated as starting + ## from. 
+ + ##inline handler processing + max_inline_pages = (row_end - um_start_offset) * 8 + start_page = parse_buffer_custom(table_buffer,row_start+1,'Int32ul') + end_page = start_page + max_inline_pages + + ##process page array + filtered_buffer = table_buffer[um_start_offset:] + filtered_buffer_size = len(filtered_buffer) + page_numbers = [] + byteCount = 0 + + while byteCount < filtered_buffer_size: + b = filtered_buffer[byteCount:byteCount+1] + if b != b'\x00': + for i in range(8): + if ((int.from_bytes(b,'big') & (1 << i)) != 0): + pageNumberOffset = (byteCount * 8 + i) + pageNumber = (start_page + pageNumberOffset) if (pageNumberOffset >= 0) else INVALID_PAGE_NUMBER + if pageNumber < start_page or pageNumber > end_page: + #invalid page number + break + page_numbers.append(pageNumber) + byteCount += 1 + + return page_numbers - while byteCount < filtered_buffer_size: - b = filtered_buffer[byteCount:byteCount+1] - if b != b'\x00': - for i in range(8): - if ((int.from_bytes(b,'big') & (1 << i)) != 0): - pageNumberOffset = (byteCount * 8 + i) - pageNumber = (start_page + pageNumberOffset) if (pageNumberOffset >= 0) else INVALID_PAGE_NUMBER - if pageNumber < start_page or pageNumber > end_page: - #invalid page number - break - page_numbers.append(pageNumber) - byteCount += 1 - - return page_numbers - - + elif map_type == MAP_TYPE_REFERENCE: + ## Usage map whose map is written across one or more entire separate pages + ## of page type USAGE_MAP. For Jet4, this type of map can contain 32736 + ## pages per reference page, and a maximum of 17 reference map pages for a + ## total maximum of 556512 pages (2 GB). + + ##reference handler processing + + max_pages_per_usage_map_page = ((self.version.page_size - self.version.OFFSET_USAGE_MAP_PAGE_DATA) * 8) + num_usage_pages = int((row_end - row_start - 1) / 4) + um_start_offset = self.version.OFFSET_USAGE_MAP_START + + start_page = 0 + end_page = (num_usage_pages * max_pages_per_usage_map_page) + + # there is no "start page" for a reference usage map, so we get an + # extra page reference on top of the number of page references that fit + # in the table + page_numbers = [] + for i in range(num_usage_pages): + map_page_pointer_offset = row_start + self.version.OFFSET_REFERENCE_MAP_PAGE_NUMBERS + (i * 4) + map_page_num = parse_buffer_custom(table_buffer,map_page_pointer_offset,'Int32ul') + if map_page_num > 0: + map_page_buffer = self._all_pages[map_page_num*self.version.page_size] + page_type = map_page_buffer[0] + if page_type != PageTypes.USAGE_MAP: + LOGGER.error(f"Looking for usage map at page {map_page_num}, but page type is {page_type}") + return + filtered_buffer = map_page_buffer[self.version.OFFSET_USAGE_MAP_PAGE_DATA:] + + #Process map + buffer_start_page = (max_pages_per_usage_map_page * i) + + filtered_buffer_size = len(filtered_buffer) + + byteCount = 0 + + while byteCount < filtered_buffer_size: + b = filtered_buffer[byteCount:byteCount+1] + if b != b'\x00': + for i in range(8): + if ((int.from_bytes(b,'big') & (1 << i)) != 0): + pageNumberOffset = (byteCount * 8 + i) + buffer_start_page + pageNumber = (start_page + pageNumberOffset) if (pageNumberOffset >= 0) else INVALID_PAGE_NUMBER + if pageNumber < start_page or pageNumber > end_page: + #invalid page number + break + page_numbers.append(pageNumber) + byteCount += 1 + + return page_numbers def _get_table_columns(self): @@ -548,8 +606,11 @@ def _get_table_columns(self): #add usage maps from table referenced by table head #The catalog level linked pages array can be out of date following 
deletes. so use table header info to find accurate usage maps. - self.table.owned_pages = [self._all_pages[pn * self.page_size] for pn in self._get_usage_map(table_header.row_page_map_page_number,table_header.row_page_map_row_number)] - self.table.free_space_pages = [self._all_pages[pn * self.page_size] for pn in self._get_usage_map(table_header.free_space_page_map_page_number,table_header.free_space_page_map_row_number)] + owned_pages_map = self._get_usage_map(table_header.row_page_map_page_number,table_header.row_page_map_row_number) + self.table.owned_pages = [self._all_pages[pn * self.page_size] for pn in owned_pages_map] + + free_space_pages_map = self._get_usage_map(table_header.free_space_page_map_page_number,table_header.free_space_page_map_row_number) + self.table.free_space_pages = [self._all_pages[pn * self.page_size] for pn in free_space_pages_map] # Merge Data back to table_header @@ -577,11 +638,14 @@ def _get_table_columns(self): # create a dict of index to column to make it easier to access. offset is used to make this zero based offset = min(x.column_index for x in columns) column_dict = {x.column_index - offset: x for x in columns} + # If column index is not unique try best effort if len(column_dict) != len(columns): # create a dict of id to column to make it easier to access column_dict = {x.column_id: x for x in columns} + column_dict = OrderedDict(sorted(column_dict.items())) + # Add the extra properties relevant for the column if self.props: for i, col in column_dict.items(): @@ -632,6 +696,8 @@ def _parse_memo(self, relative_obj_data, return_raw=False): memo_data = self._get_overflow_record(parsed_memo.record_pointer) else: LOGGER.debug("LVAL type 2") + if relative_obj_data == b':\x00:\x00:\x00.\x00.\x00.\x00': ###need to review process for LVAL type 2. sometimes works but this example has a record pointer greater than number of records on target page. + print('problem lval') rec_data = self._get_overflow_record(parsed_memo.record_pointer) next_page = struct.unpack("I", rec_data[:4])[0] # LVAL2 has data over multiple pages. The first 4 bytes of the page are the next record, then that data. @@ -656,7 +722,7 @@ def _get_overflow_record(self, record_pointer): """ record_offset = record_pointer & 0xff page_num = record_pointer >> 8 - record_page = self._data_pages.get(page_num * self.page_size) + record_page = self._all_pages.get(page_num * self.page_size) if not record_page: LOGGER.warning(f"Could not find overflow record data page overflow pointer: {record_pointer}") return @@ -678,3 +744,137 @@ def _get_overflow_record(self, record_pointer): end = end & 0xfff record = record_page[start: end] return record + + + + # starting point for a iterating parser to enable outputs to be streamed to avoid memory overflows. + def _new_parse_row(self, record): + """ + Reads the row data from the given row buffer. Leaves limit unchanged. + :param record: the current row data + :return: + """ + original_record = record + + # Records contain null bitmaps for columns. 
The number of bitmaps is the number of columns / 8 rounded up + null_table_len = (self.table_header.column_count + 7) // 8 + if null_table_len and null_table_len < len(original_record): + null_table = record[-null_table_len:] + # Turn bitmap to a list of True False values + null_table = [((null_table[i // 8]) & (1 << (i % 8))) != 0 for i in range(len(null_table) * 8)] + else: + LOGGER.error(f"Failed to parse null table column count {self.table_header.column_count}") + return + + if self.version.SIZE_ROW_VAR_COL_OFFSET != 2: + + jumpColOffsets = self._readJumpTableVarColOffsets(original_record,0,null_table_len) + + + for i, column in self.columns.items(): + + #get column name + column_name = column.col_name_str + + #Check nullmask + isNull = True if column.column_id >= self.table_header.column_count else null_table[column.column_id] + + # Boolean fields are encoded in the null table + if column.type == TYPE_BOOLEAN: + self.parsed_table[column_name].append(isNull) + continue + + # remaining columns marked as null in nullmask are recorded as None + if isNull: + self.parsed_table[column_name].append(None) + continue + + # prep variables for column parsing + rowStart = 0 + colDataPos = 0 + colDataLen = 0 + colDataType = column.type + + #if fixed length + if column.column_flags.fixed_length: + + #identify fixed length variables + dataStart = rowStart + self.version.OFFSET_COLUMN_FIXED_DATA_ROW_OFFSET + colDataPos = dataStart + column.fixed_offset + colDataLen = column.length + + #if variable length + else: + + varDataStart = None + varDataEnd = None + + if self.version.SIZE_ROW_VAR_COL_OFFSET == 2: + + #read simple var length value + varColumnsOffsetPos = (len(original_record) - null_table_len - 4) - (column.variable_column_number * 2) + + varDataStart = parse_buffer_custom(original_record,varColumnsOffsetPos,'Int16ul') + varDataEnd = parse_buffer_custom(original_record,varColumnsOffsetPos,'Int16ul') + + else: + + #read jump-table based var length values + varDataStart = jumpColOffsets[column.variable_column_number] + varDataEnd = jumpColOffsets[column.variable_column_number + 1] + + #prepare variable length get + colDataPos = rowStart + varDataStart + colDataLen = varDataEnd - varDataStart + + if colDataLen <= 0: + # empty string/zero‑length + self.parsed_table[column_name].append("" if colDataType==TYPE_TEXT else b"") + continue + + data = original_record[colDataPos:colDataPos+colDataLen] + + # dispatch on your column.type + if colDataType in (TYPE_MEMO,TYPE_OLE): + value = self._parse_memo(data, return_raw=(colDataType == TYPE_OLE)) + elif colDataType == TYPE_96_BIT_17_BYTES: + scale = column.extra_props.get("scale", 6) + value = numeric_to_string(data, scale) + else: + # fallback to gerneral parse_type + value = parse_type(colDataType, data, version=self.version, props=column.extra_props or None) + + self.parsed_table[column_name].append(value) + + + + def _readJumpTableVarColOffsets(self, buffer, rowStart, nullMaskSize): + + # calculate offsets using jump-table info + rowEnd = rowStart + len(buffer) -1 + numVarCols = buffer[rowEnd - nullMaskSize] + + varColOffsets = [0] * (numVarCols + 1) + + rowLen = rowEnd - rowStart + 1 + numJumps = (rowLen - 1) // MAX_BYTE + colOffset = rowEnd - nullMaskSize - numJumps - 1 + + # if last jump is a dummy value, ignore it + if ((colOffset - rowStart - numVarCols) // MAX_BYTE) < numJumps: + numJumps -= 1 + + jumpsUsed = 0 + # Fill in each of the varColOffsets entries + for i in range(numVarCols + 1): + # Skip ahead in the jump table as long as the next jump 
byte equals i + while (jumpsUsed < numJumps and + buffer[rowEnd - nullMaskSize - jumpsUsed - 1] == i): + jumps_used += 1 + + # The low‐order part of the offset is at col_offset - i + low = buffer[colOffset - i] + # The high‐order is jumpsUsed * MAX_BYTE + varColOffsets[i] = low + (jumpsUsed * MAX_BYTE) + + return varColOffsets \ No newline at end of file diff --git a/access_parser/jetformat.py b/access_parser/jetformat.py new file mode 100644 index 0000000..ab0d30e --- /dev/null +++ b/access_parser/jetformat.py @@ -0,0 +1,671 @@ + + +import enum +import sys +import locale +from typing import ClassVar, Optional, Set, Type + + +class CodecType(enum.Enum): + NONE = enum.auto() + JET = enum.auto() + MSISAM = enum.auto() + OFFICE = enum.auto() + + +class DataType(enum.Enum): + BOOLEAN = enum.auto() + BYTE = enum.auto() + INT = enum.auto() + LONG = enum.auto() + FLOAT = enum.auto() + DOUBLE = enum.auto() + GUID = enum.auto() + SHORT_DATE_TIME = enum.auto() + MONEY = enum.auto() + NUMERIC = enum.auto() + TEXT = enum.auto() + MEMO = enum.auto() + BIG_INT = enum.auto() + EXT_DATE_TIME = enum.auto() + COMPLEX_TYPE = enum.auto() + +class PageTypes: + INVALID = 0 + DATA = 1 + TABLE_DEF = 2 + INDEX_NODE = 3 + INDEX_LEAF = 4 + USAGE_MAP = 5 + + +class BaseFormat: + """ + Base class declaring every Jet-format constant as a class attribute. + Subclasses simply override the ones that change. + """ + + # — Static JetFormat constants — + MAX_RECORD_SIZE: ClassVar[int] = 1900 + TEXT_FIELD_UNIT_SIZE: ClassVar[int] = 2 + TEXT_FIELD_MAX_LENGTH:ClassVar[int] = 255 * TEXT_FIELD_UNIT_SIZE + + PROPERTY_MAP_TYPES: ClassVar[list[bytes]] = [ + b"MR2\x00", # access 2000+ + b"KKD\x00" # access 97 + ] + + # the raw version byte in the header + VERSION_CODE: ClassVar[Optional[int]] = None + # numeric mapping to enable compatibility + VERSION_NUMBER: ClassVar[Optional[int]] = None + + # — Identity & capabilities — + name: ClassVar[str] = "UNKNOWN" + read_only: ClassVar[bool] = False + indexes_supported: ClassVar[bool] = False + codec_type: ClassVar[CodecType] = CodecType.NONE + page_size: ClassVar[int] = 0 + max_database_size: ClassVar[int] = 0 + + unsupported_data_types: ClassVar[Set[DataType]] = set() + unsupported_calc_types: ClassVar[Set[DataType]] = set() + + # — Header parsing — + OFFSET_VERSION: ClassVar[int] = 20 + HEADER_LENGTH: ClassVar[int] = 21 + OFFSET_ENGINE_NAME: ClassVar[int] = 0x04 + LENGTH_ENGINE_NAME: ClassVar[int] = 0x0F + MSISAM_ENGINE: ClassVar[bytes] = b"MSISAM Database" + + BASE_HEADER_MASK: ClassVar[bytes] = bytes([ + 0xB5,0x6F,0x03,0x62,0x61,0x08,0xC2,0x55, 0xEB,0xA9,0x67,0x72,0x43,0x3F,0x00,0x9C, + 0x7A,0x9F,0x90,0xFF,0x80,0x9A,0x31,0xC5, 0x79,0xBA,0xED,0x30,0xBC,0xDF,0xCC,0x9D, + 0x63,0xD9,0xE4,0xC3,0x7B,0x42,0xFB,0x8A, 0xBC,0x4E,0x86,0xFB,0xEC,0x37,0x5D,0x44, + 0x9C,0xFA,0xC6,0x5E,0x28,0xE6,0x13,0xB6, 0x8A,0x60,0x54,0x94,0x7B,0x36,0xF5,0x72, + 0xDF,0xB1,0x77,0xF4,0x13,0x43,0xCF,0xAF, 0xB1,0x33,0x34,0x61,0x79,0x5B,0x92,0xB5, + 0x7C,0x2A,0x05,0xF1,0x7C,0x99,0x01,0x1B, 0x98,0xFD,0x12,0x4F,0x4A,0x94,0x6C,0x3E, + 0x60,0x26,0x5F,0x95,0xF8,0xD0,0x89,0x24, 0x85,0x67,0xC6,0x1F,0x27,0x44,0xD2,0xEE, + 0xCF,0x65,0xED,0xFF,0x07,0xC7,0x46,0xA1, 0x78,0x16,0x0C,0xED,0xE9,0x2D,0x62,0xD4 + ]) + + # — All possible header‐level offsets & sizes (defaults) — + OFFSET_MASKED_HEADER: ClassVar[Optional[int]] = None + HEADER_MASK: ClassVar[Optional[bytes]] = None + OFFSET_HEADER_DATE: ClassVar[Optional[int]] = None + OFFSET_PASSWORD: ClassVar[Optional[int]] = None + SIZE_PASSWORD: ClassVar[Optional[int]] = None + 
OFFSET_SORT_ORDER: ClassVar[Optional[int]] = None + SIZE_SORT_ORDER: ClassVar[Optional[int]] = None + OFFSET_CODE_PAGE: ClassVar[Optional[int]] = None + OFFSET_ENCODING_KEY: ClassVar[Optional[int]] = None + + # — All possible data‐page / table / index constants (defaults) — + MAX_ROW_SIZE: ClassVar[Optional[int]] = None + DATA_PAGE_INITIAL_FREE_SPACE: ClassVar[Optional[int]] = None + + OFFSET_NEXT_TABLE_DEF_PAGE: ClassVar[Optional[int]] = None + OFFSET_NUM_ROWS: ClassVar[Optional[int]] = None + OFFSET_NEXT_AUTO_NUMBER: ClassVar[Optional[int]] = None + OFFSET_NEXT_COMPLEX_AUTO_NUMBER: ClassVar[Optional[int]] = None + + OFFSET_TABLE_TYPE: ClassVar[Optional[int]] = None + OFFSET_MAX_COLS: ClassVar[Optional[int]] = None + OFFSET_NUM_VAR_COLS: ClassVar[Optional[int]] = None + OFFSET_NUM_COLS: ClassVar[Optional[int]] = None + + OFFSET_NUM_INDEX_SLOTS: ClassVar[Optional[int]] = None + OFFSET_NUM_INDEXES: ClassVar[Optional[int]] = None + OFFSET_OWNED_PAGES: ClassVar[Optional[int]] = None + OFFSET_FREE_SPACE_PAGES: ClassVar[Optional[int]] = None + OFFSET_INDEX_DEF_BLOCK: ClassVar[Optional[int]] = None + + SIZE_INDEX_COLUMN_BLOCK: ClassVar[Optional[int]] = None + SIZE_INDEX_INFO_BLOCK: ClassVar[Optional[int]] = None + + OFFSET_COLUMN_TYPE: ClassVar[Optional[int]] = None + OFFSET_COLUMN_NUMBER: ClassVar[Optional[int]] = None + OFFSET_COLUMN_PRECISION: ClassVar[Optional[int]] = None + OFFSET_COLUMN_SCALE: ClassVar[Optional[int]] = None + OFFSET_COLUMN_SORT_ORDER: ClassVar[Optional[int]] = None + OFFSET_COLUMN_CODE_PAGE: ClassVar[Optional[int]] = None + OFFSET_COLUMN_COMPLEX_ID: ClassVar[Optional[int]] = None + OFFSET_COLUMN_FLAGS: ClassVar[Optional[int]] = None + OFFSET_COLUMN_EXT_FLAGS: ClassVar[Optional[int]] = None + OFFSET_COLUMN_LENGTH: ClassVar[Optional[int]] = None + OFFSET_COLUMN_VARIABLE_TABLE_INDEX: ClassVar[Optional[int]] = None + OFFSET_COLUMN_FIXED_DATA_OFFSET: ClassVar[Optional[int]] = None + OFFSET_COLUMN_FIXED_DATA_ROW_OFFSET: ClassVar[Optional[int]] = None + + OFFSET_TABLE_DEF_LOCATION: ClassVar[Optional[int]] = None + OFFSET_ROW_START: ClassVar[Optional[int]] = None + OFFSET_USAGE_MAP_START: ClassVar[Optional[int]] = None + OFFSET_USAGE_MAP_PAGE_DATA: ClassVar[Optional[int]] = None + OFFSET_REFERENCE_MAP_PAGE_NUMBERS: ClassVar[Optional[int]] = None + + OFFSET_FREE_SPACE: ClassVar[Optional[int]] = None + OFFSET_NUM_ROWS_ON_DATA_PAGE: ClassVar[Optional[int]] = None + MAX_NUM_ROWS_ON_DATA_PAGE: ClassVar[Optional[int]] = None + + OFFSET_INDEX_COMPRESSED_BYTE_COUNT: ClassVar[Optional[int]] = None + OFFSET_INDEX_ENTRY_MASK: ClassVar[Optional[int]] = None + OFFSET_PREV_INDEX_PAGE: ClassVar[Optional[int]] = None + OFFSET_NEXT_INDEX_PAGE: ClassVar[Optional[int]] = None + OFFSET_CHILD_TAIL_INDEX_PAGE: ClassVar[Optional[int]] = None + + SIZE_INDEX_DEFINITION: ClassVar[Optional[int]] = None + SIZE_COLUMN_HEADER: ClassVar[Optional[int]] = None + SIZE_ROW_LOCATION: ClassVar[Optional[int]] = None + SIZE_LONG_VALUE_DEF: ClassVar[Optional[int]] = None + + MAX_INLINE_LONG_VALUE_SIZE: ClassVar[Optional[int]] = None + MAX_LONG_VALUE_ROW_SIZE: ClassVar[Optional[int]] = None + MAX_COMPRESSED_UNICODE_SIZE: ClassVar[Optional[int]] = None + + SIZE_TDEF_HEADER: ClassVar[Optional[int]] = None + SIZE_TDEF_TRAILER: ClassVar[Optional[int]] = None + SIZE_COLUMN_DEF_BLOCK: ClassVar[Optional[int]] = None + SIZE_INDEX_ENTRY_MASK: ClassVar[Optional[int]] = None + + SKIP_BEFORE_INDEX_FLAGS: ClassVar[Optional[int]] = None + SKIP_AFTER_INDEX_FLAGS: ClassVar[Optional[int]] = None + SKIP_BEFORE_INDEX_SLOT: 
ClassVar[Optional[int]] = None + SKIP_AFTER_INDEX_SLOT: ClassVar[Optional[int]] = None + SKIP_BEFORE_INDEX: ClassVar[Optional[int]] = None + + SIZE_NAME_LENGTH: ClassVar[Optional[int]] = None + SIZE_ROW_COLUMN_COUNT: ClassVar[Optional[int]] = None + SIZE_ROW_VAR_COL_OFFSET: ClassVar[Optional[int]] = None + + USAGE_MAP_TABLE_BYTE_LENGTH: ClassVar[Optional[int]] = None + + MAX_COLUMNS_PER_TABLE: ClassVar[Optional[int]] = None + MAX_INDEXES_PER_TABLE: ClassVar[Optional[int]] = None + MAX_TABLE_NAME_LENGTH: ClassVar[Optional[int]] = None + MAX_COLUMN_NAME_LENGTH: ClassVar[Optional[int]] = None + MAX_INDEX_NAME_LENGTH: ClassVar[Optional[int]] = None + + LEGACY_NUMERIC_INDEXES: ClassVar[Optional[bool]] = None + CHARSET: ClassVar[Optional[str]] = None + DEFAULT_SORT_ORDER: ClassVar[Optional[str]] = None + PROPERTY_MAP_TYPE: ClassVar[Optional[bytes]] = None + SIZE_TEXT_FIELD_UNIT: ClassVar[Optional[int]] = None + + ## compatibility functions to enable "version" to continue to be used as it was before: + def __str__(self): + """Return the VERSION_NUMBER when the object is converted to a string.""" + return str(self.VERSION_NUMBER) + + def __repr__(self): + """Return a more detailed representation for debugging.""" + return f"{self.__class__.__name__}(version={self.VERSION_NUMBER})" + + def __eq__(self, other): + """Enable direct comparison with numbers and other BaseFormat objects.""" + if isinstance(other, (int, float)): + return self.VERSION_NUMBER == other + elif isinstance(other, BaseFormat): + return self.VERSION_NUMBER == other.VERSION_NUMBER + return NotImplemented + + def __int__(self): + """Allow conversion to integer.""" + return self.VERSION_NUMBER + + # Additional comparison methods for completeness + def __lt__(self, other): + if isinstance(other, (int, float)): + return self.VERSION_NUMBER < other + elif isinstance(other, BaseFormat): + return self.VERSION_NUMBER < other.VERSION_NUMBER + return NotImplemented + + def __gt__(self, other): + if isinstance(other, (int, float)): + return self.VERSION_NUMBER > other + elif isinstance(other, BaseFormat): + return self.VERSION_NUMBER > other.VERSION_NUMBER + return NotImplemented + + def __le__(self, other): + if isinstance(other, (int, float)): + return self.VERSION_NUMBER <= other + elif isinstance(other, BaseFormat): + return self.VERSION_NUMBER <= other.VERSION_NUMBER + return NotImplemented + + def __ge__(self, other): + if isinstance(other, (int, float)): + return self.VERSION_NUMBER >= other + elif isinstance(other, BaseFormat): + return self.VERSION_NUMBER >= other.VERSION_NUMBER + return NotImplemented + + @classmethod + def is_supported_data_type(cls, dt: DataType) -> bool: + return dt not in cls.unsupported_data_types + + @classmethod + def is_supported_calc_type(cls, dt: DataType) -> bool: + return dt not in cls.unsupported_calc_types + + @classmethod + def _all_subclasses(cls): + """ + Recursively yield all subclasses of this class. 
+ """ + for sub in cls.__subclasses__(): + yield sub + yield from sub._all_subclasses() + + @classmethod + def get_format(cls, path: str) -> "BaseFormat": + hdr = open(path, "rb").read(cls.HEADER_LENGTH) + if len(hdr) < cls.HEADER_LENGTH: + raise IOError(f"Not a Jet database: {path!r}") + + # 1) try the raw version byte first (Jet4+) + raw_ver = hdr[cls.OFFSET_VERSION] + for sub in cls._all_subclasses(): + if getattr(sub, "VERSION_CODE", None) == raw_ver: + return sub() + + # 2) attempt unmask for Jet3 only + masked_ver = raw_ver ^ cls.BASE_HEADER_MASK[cls.OFFSET_VERSION] + from .jetformat import Jet3Format + if masked_ver == Jet3Format.VERSION_CODE: + return Jet3Format() + + # 3) fallback MSISAM by engine‐name + eng = hdr[cls.OFFSET_ENGINE_NAME: + cls.OFFSET_ENGINE_NAME + cls.LENGTH_ENGINE_NAME] + for sub in cls._all_subclasses(): + prefix = getattr(sub, "ENGINE_NAME_PREFIX", None) + if prefix and eng.startswith(prefix): + return sub() + + raise IOError(f"Unknown Jet version byte: raw=0x{raw_ver:02X}, masked=0x{masked_ver:02X}") + + + @classmethod + def get_format_from_header(cls, buf: bytes) -> "BaseFormat": + if len(buf) < cls.HEADER_LENGTH: + raise ValueError(f"Header buffer too small ({len(buf)} < {cls.HEADER_LENGTH})") + + raw_ver = buf[cls.OFFSET_VERSION] + + # 1) check raw byte → Jet4, Jet12, Jet14, Jet16, Jet17 + for sub in cls._all_subclasses(): + if getattr(sub, "VERSION_CODE", None) == raw_ver: + return sub() + + # 2) if raw byte wasn’t a match, unmask *just* for Jet3 detection + # (only Jet3 files use this mask) + masked_ver = raw_ver ^ cls.BASE_HEADER_MASK[cls.OFFSET_VERSION] + from .jetformat import Jet3Format + if masked_ver == Jet3Format.VERSION_CODE: + return Jet3Format() + + # 3) fallback: MSISAM + eng = buf[cls.OFFSET_ENGINE_NAME:cls.OFFSET_ENGINE_NAME + cls.LENGTH_ENGINE_NAME] + for sub in cls._all_subclasses(): + prefix = getattr(sub, "ENGINE_NAME_PREFIX", None) + if prefix and eng.startswith(prefix): + return sub() + + raise ValueError(f"Unknown Jet version byte: raw=0x{raw_ver:02X}, masked=0x{masked_ver:02X}") + + +# ---------------------------------------------------------------------- +# Jet3Format subclass: overrides *only* those attrs that Jet 3 needs +# ---------------------------------------------------------------------- +class Jet3Format(BaseFormat): + VERSION_CODE = 0x00 + VERSION_NUMBER = 3 + ENGINE_NAME_PREFIX = None + + # identity & capabilities + name = "3" + read_only = True + indexes_supported = True + codec_type = CodecType.JET + page_size = 2048 + max_database_size = 1 * 1024**3 + + unsupported_data_types = {DataType.COMPLEX_TYPE} + unsupported_calc_types = set() + + # header‐level + OFFSET_MASKED_HEADER = 24 + HEADER_MASK = BaseFormat.BASE_HEADER_MASK[:-2] + OFFSET_HEADER_DATE = -1 + OFFSET_PASSWORD = 66 + SIZE_PASSWORD = 20 + OFFSET_SORT_ORDER = 58 + SIZE_SORT_ORDER = 2 + OFFSET_CODE_PAGE = 60 + OFFSET_ENCODING_KEY = 62 + + # page/table/index + MAX_ROW_SIZE = 2012 + DATA_PAGE_INITIAL_FREE_SPACE = page_size - 14 + + OFFSET_NEXT_TABLE_DEF_PAGE = 4 + OFFSET_NUM_ROWS = 12 + OFFSET_NEXT_AUTO_NUMBER = 20 + OFFSET_NEXT_COMPLEX_AUTO_NUMBER = -1 + + OFFSET_TABLE_TYPE = 20 + OFFSET_MAX_COLS = 21 + OFFSET_NUM_VAR_COLS = 23 + OFFSET_NUM_COLS = 25 + + OFFSET_NUM_INDEX_SLOTS = 27 + OFFSET_NUM_INDEXES = 31 + OFFSET_OWNED_PAGES = 35 + OFFSET_FREE_SPACE_PAGES = 39 + OFFSET_INDEX_DEF_BLOCK = 43 + + SIZE_INDEX_COLUMN_BLOCK = 39 + SIZE_INDEX_INFO_BLOCK = 20 + + OFFSET_COLUMN_TYPE = 0 + OFFSET_COLUMN_NUMBER = 1 + OFFSET_COLUMN_PRECISION = 11 + OFFSET_COLUMN_SCALE = 12 
+ OFFSET_COLUMN_SORT_ORDER = 9 + OFFSET_COLUMN_CODE_PAGE = 11 + OFFSET_COLUMN_COMPLEX_ID = -1 + OFFSET_COLUMN_FLAGS = 13 + OFFSET_COLUMN_EXT_FLAGS = -1 + OFFSET_COLUMN_LENGTH = 16 + OFFSET_COLUMN_VARIABLE_TABLE_INDEX = 3 + OFFSET_COLUMN_FIXED_DATA_OFFSET = 14 + OFFSET_COLUMN_FIXED_DATA_ROW_OFFSET= 1 + + OFFSET_TABLE_DEF_LOCATION = 4 + OFFSET_ROW_START = 10 + OFFSET_USAGE_MAP_START = 5 + OFFSET_USAGE_MAP_PAGE_DATA = 4 + OFFSET_REFERENCE_MAP_PAGE_NUMBERS = 1 + + OFFSET_FREE_SPACE = 2 + OFFSET_NUM_ROWS_ON_DATA_PAGE = 8 + MAX_NUM_ROWS_ON_DATA_PAGE = 255 + + OFFSET_INDEX_COMPRESSED_BYTE_COUNT = 20 + OFFSET_INDEX_ENTRY_MASK = 22 + OFFSET_PREV_INDEX_PAGE = 8 + OFFSET_NEXT_INDEX_PAGE = 12 + OFFSET_CHILD_TAIL_INDEX_PAGE = 16 + + SIZE_INDEX_DEFINITION = 8 + SIZE_COLUMN_HEADER = 18 + SIZE_ROW_LOCATION = 2 + SIZE_LONG_VALUE_DEF = 12 + MAX_INLINE_LONG_VALUE_SIZE = 64 + MAX_LONG_VALUE_ROW_SIZE = 2032 + MAX_COMPRESSED_UNICODE_SIZE = 1024 + + SIZE_TDEF_HEADER = 43 + SIZE_TDEF_TRAILER = 2 + SIZE_COLUMN_DEF_BLOCK = 25 + SIZE_INDEX_ENTRY_MASK = 226 + + SKIP_BEFORE_INDEX_FLAGS = 0 + SKIP_AFTER_INDEX_FLAGS = 0 + SKIP_BEFORE_INDEX_SLOT = 0 + SKIP_AFTER_INDEX_SLOT = 0 + SKIP_BEFORE_INDEX = 0 + + SIZE_NAME_LENGTH = 1 + SIZE_ROW_COLUMN_COUNT = 1 + SIZE_ROW_VAR_COL_OFFSET = 1 + + USAGE_MAP_TABLE_BYTE_LENGTH = 128 + + MAX_COLUMNS_PER_TABLE = 255 + MAX_INDEXES_PER_TABLE = 32 + MAX_TABLE_NAME_LENGTH = 64 + MAX_COLUMN_NAME_LENGTH = 64 + MAX_INDEX_NAME_LENGTH = 64 + + LEGACY_NUMERIC_INDEXES = True + CHARSET = 'cp1252' + DEFAULT_SORT_ORDER = None + PROPERTY_MAP_TYPE = BaseFormat.PROPERTY_MAP_TYPES[1] + SIZE_TEXT_FIELD_UNIT = 1 + + + + +class SortOrder(enum.Enum): + """Placeholder for ColumnImpl.SortOrder""" + GENERAL_SORT_ORDER = enum.auto() + GENERAL_97_SORT_ORDER = enum.auto() + GENERAL_LEGACY_SORT_ORDER = enum.auto() + + +# ---------------------------------------------------------------------- +# Jet 4 (Access 2000/02/03 – Jet 4) +# ---------------------------------------------------------------------- +class Jet4Format(BaseFormat): + VERSION_CODE = 0x01 + VERSION_NUMBER = 4 + + name = "4" + read_only = False + indexes_supported = True + codec_type = CodecType.JET + page_size = 4096 + max_database_size = 2 * 1024**3 # 2 GB + + MAX_ROW_SIZE = 4060 + DATA_PAGE_INITIAL_FREE_SPACE = page_size - 14 + + OFFSET_MASKED_HEADER = 24 + HEADER_MASK = BaseFormat.BASE_HEADER_MASK + OFFSET_HEADER_DATE = 114 + OFFSET_PASSWORD = 66 + SIZE_PASSWORD = 40 + OFFSET_SORT_ORDER = 110 + SIZE_SORT_ORDER = 4 + OFFSET_CODE_PAGE = 60 + OFFSET_ENCODING_KEY = 62 + + OFFSET_NEXT_TABLE_DEF_PAGE = 4 + OFFSET_NUM_ROWS = 16 + OFFSET_NEXT_AUTO_NUMBER = 20 + OFFSET_NEXT_COMPLEX_AUTO_NUMBER = -1 + + OFFSET_TABLE_TYPE = 40 + OFFSET_MAX_COLS = 41 + OFFSET_NUM_VAR_COLS = 43 + OFFSET_NUM_COLS = 45 + + OFFSET_NUM_INDEX_SLOTS = 47 + OFFSET_NUM_INDEXES = 51 + OFFSET_OWNED_PAGES = 55 + OFFSET_FREE_SPACE_PAGES = 59 + OFFSET_INDEX_DEF_BLOCK = 63 + + SIZE_INDEX_COLUMN_BLOCK = 52 + SIZE_INDEX_INFO_BLOCK = 28 + + OFFSET_COLUMN_TYPE = 0 + OFFSET_COLUMN_NUMBER = 5 + OFFSET_COLUMN_PRECISION = 11 + OFFSET_COLUMN_SCALE = 12 + OFFSET_COLUMN_SORT_ORDER = 11 + OFFSET_COLUMN_CODE_PAGE = -1 + OFFSET_COLUMN_COMPLEX_ID = -1 + OFFSET_COLUMN_FLAGS = 15 + OFFSET_COLUMN_EXT_FLAGS = 16 + OFFSET_COLUMN_LENGTH = 23 + OFFSET_COLUMN_VARIABLE_TABLE_INDEX = 7 + OFFSET_COLUMN_FIXED_DATA_OFFSET = 21 + OFFSET_COLUMN_FIXED_DATA_ROW_OFFSET = 2 + + OFFSET_TABLE_DEF_LOCATION = 4 + OFFSET_ROW_START = 14 + OFFSET_USAGE_MAP_START = 5 + OFFSET_USAGE_MAP_PAGE_DATA = 4 + 
OFFSET_REFERENCE_MAP_PAGE_NUMBERS = 1
+
+    OFFSET_FREE_SPACE = 2
+    OFFSET_NUM_ROWS_ON_DATA_PAGE = 12
+    MAX_NUM_ROWS_ON_DATA_PAGE = 255
+
+    OFFSET_INDEX_COMPRESSED_BYTE_COUNT = 24
+    OFFSET_INDEX_ENTRY_MASK = 27
+    OFFSET_PREV_INDEX_PAGE = 12
+    OFFSET_NEXT_INDEX_PAGE = 16
+    OFFSET_CHILD_TAIL_INDEX_PAGE = 20
+
+    SIZE_INDEX_DEFINITION = 12
+    SIZE_COLUMN_HEADER = 25
+    SIZE_ROW_LOCATION = 2
+    SIZE_LONG_VALUE_DEF = 12
+    MAX_INLINE_LONG_VALUE_SIZE = 64
+    MAX_LONG_VALUE_ROW_SIZE = 4076
+    MAX_COMPRESSED_UNICODE_SIZE = 1024
+
+    SIZE_TDEF_HEADER = 63
+    SIZE_TDEF_TRAILER = 2
+    SIZE_COLUMN_DEF_BLOCK = 25
+    SIZE_INDEX_ENTRY_MASK = 453
+
+    SKIP_BEFORE_INDEX_FLAGS = 4
+    SKIP_AFTER_INDEX_FLAGS = 5
+    SKIP_BEFORE_INDEX_SLOT = 4
+    SKIP_AFTER_INDEX_SLOT = 4
+    SKIP_BEFORE_INDEX = 4
+
+    SIZE_NAME_LENGTH = 2
+    SIZE_ROW_COLUMN_COUNT = 2
+    SIZE_ROW_VAR_COL_OFFSET = 2
+
+    USAGE_MAP_TABLE_BYTE_LENGTH = 64
+
+    MAX_COLUMNS_PER_TABLE = 255
+    MAX_INDEXES_PER_TABLE = 32
+    MAX_TABLE_NAME_LENGTH = 64
+    MAX_COLUMN_NAME_LENGTH = 64
+    MAX_INDEX_NAME_LENGTH = 64
+
+    LEGACY_NUMERIC_INDEXES = True
+    CHARSET = "utf-16le"
+    DEFAULT_SORT_ORDER = SortOrder.GENERAL_97_SORT_ORDER
+    PROPERTY_MAP_TYPE = BaseFormat.PROPERTY_MAP_TYPES[1]
+    SIZE_TEXT_FIELD_UNIT = 1
+
+    # from V3_UNSUPP_TYPES: {COMPLEX_TYPE, BIG_INT, EXT_DATE_TIME}
+    unsupported_data_types = {
+        DataType.COMPLEX_TYPE,
+        DataType.BIG_INT,
+        DataType.EXT_DATE_TIME
+    }
+    unsupported_calc_types = set()  # no calculated types
+
+
+# ----------------------------------------------------------------------
+# Jet 12 (Access 2007 – ACE12) builds on Jet4
+# ----------------------------------------------------------------------
+class Jet12Format(Jet4Format):
+    VERSION_CODE = 0x02
+    VERSION_NUMBER = 5
+    name = "12"
+
+    codec_type = CodecType.OFFICE
+    legacy_numeric_indexes = False
+
+    # from V12_UNSUPP_TYPES = {BIG_INT, EXT_DATE_TIME}
+    unsupported_data_types = {
+        DataType.BIG_INT,
+        DataType.EXT_DATE_TIME
+    }
+
+    # only these two offsets changed:
+    offset_next_complex_auto_number = 28
+    offset_column_complex_id = 11
+
+    # ACE 12 still doesn’t support complex type or calc types
+    unsupported_calc_types = {
+        dt for dt in DataType
+    }  # no calculated types
+
+
+# ----------------------------------------------------------------------
+# Jet 14 (Access 2010 – ACE14) inherits from Jet12
+# ----------------------------------------------------------------------
+class Jet14Format(Jet12Format):
+    VERSION_CODE = 0x03
+    VERSION_NUMBER = 2010
+    name = "14"
+
+    DEFAULT_SORT_ORDER = SortOrder.GENERAL_SORT_ORDER
+    PROPERTY_MAP_TYPE = BaseFormat.PROPERTY_MAP_TYPES[0]
+
+    # ACE 14 supports the V14_CALC_TYPES:
+    _V14_CALC = {
+        DataType.BOOLEAN, DataType.BYTE, DataType.INT, DataType.LONG,
+        DataType.FLOAT, DataType.DOUBLE, DataType.GUID,
+        DataType.SHORT_DATE_TIME, DataType.MONEY, DataType.NUMERIC,
+        DataType.TEXT, DataType.MEMO
+    }
+    unsupported_calc_types = set(DataType) - _V14_CALC
+
+
+# ----------------------------------------------------------------------
+# Jet 16 (Access 2013 – ACE16) inherits from Jet14
+# ----------------------------------------------------------------------
+class Jet16Format(Jet14Format):
+    VERSION_CODE = 0x05
+    VERSION_NUMBER = 2013
+    name = "16"
+
+    # from V16_UNSUPP_TYPES = {EXT_DATE_TIME}
+    unsupported_data_types = {
+        DataType.EXT_DATE_TIME
+    }
+
+    # ACE 16 adds BIG_INT calc support:
+    _V16_CALC = Jet14Format._V14_CALC.union({ DataType.BIG_INT })
+    unsupported_calc_types = set(DataType) -
_V16_CALC + + +# ---------------------------------------------------------------------- +# Jet 17 (Access 2016 – ACE17) inherits from Jet16 +# ---------------------------------------------------------------------- +class Jet17Format(Jet16Format): + VERSION_CODE = 0x06 + VERSION_NUMBER = 2016 + name = "17" + + # now supports everything + unsupported_data_types = set() + unsupported_calc_types = set() + + CHARSET = "utf-16le" # StandardCharsets.UTF_16LE + DEFAULT_SORT_ORDER = SortOrder.GENERAL_LEGACY_SORT_ORDER + PROPERTY_MAP_TYPE = BaseFormat.PROPERTY_MAP_TYPES[0] + SIZE_TEXT_FIELD_UNIT = BaseFormat.TEXT_FIELD_UNIT_SIZE + + +# ---------------------------------------------------------------------- +# MSISAM (Access 95) reuses Jet4 layout but overrides codec & engine‐name +# ---------------------------------------------------------------------- +class MsisamFormat(Jet4Format): + VERSION_CODE = None + ENGINE_NAME_PREFIX = BaseFormat.MSISAM_ENGINE + + name = "MSISAM" + read_only = True + indexes_supported = False + codec_type = CodecType.MSISAM + page_size = 512 + max_database_size = 1 * 1024**2 # 1 MB + + # nothing at all is supported + unsupported_data_types = set(DataType) + unsupported_calc_types = set(DataType) \ No newline at end of file diff --git a/access_parser/utils.py b/access_parser/utils.py index 2c48a4a..3c8de34 100644 --- a/access_parser/utils.py +++ b/access_parser/utils.py @@ -4,6 +4,8 @@ import uuid import math from datetime import datetime, timedelta +from dataclasses import dataclass +from typing import List LOGGER = logging.getLogger("access_parser.utils") @@ -56,10 +58,7 @@ } # Character Encodings for Different Jet Versions -ENCODING_MAP = { - 3: "cp1252", # Jet 3.x (Access 97 and earlier) - 4: "utf-16-le" # Jet 4.x+ (Access 2000 and newer) -} + TEXT_COMPRESSION_HEADER = b'\xff\xfe' # https://stackoverflow.com/questions/45560782 @@ -164,7 +163,7 @@ def decodeUncompressedText(textBytes: bytes, dataStart: int, dataEnd: int, versi decoding errors are logged and replacement characters are used. :return: Decoded text string. """ - encoding = ENCODING_MAP.get(version, "utf-16-le") # Default to utf-16-le for unknown versions + encoding = version.CHARSET bytesToDecode = textBytes[dataStart:dataEnd] try: From a8b1a003b495dc7a43035c6993792f5640098642 Mon Sep 17 00:00:00 2001 From: Jamie Stooke Date: Fri, 16 May 2025 15:24:01 +0100 Subject: [PATCH 14/17] overhauled row parsing logic to be version aware. fine tuned table parser with relevant masks. fixed old reference to invalid class. --- access_parser/access_parser.py | 353 ++++++++------------------------- 1 file changed, 81 insertions(+), 272 deletions(-) diff --git a/access_parser/access_parser.py b/access_parser/access_parser.py index 8dec102..63dc8a7 100644 --- a/access_parser/access_parser.py +++ b/access_parser/access_parser.py @@ -8,7 +8,7 @@ from .parsing_primitives import parse_relative_object_metadata_struct, parse_table_head, parse_data_page_header, \ ACCESSHEADER, MEMO, parse_table_data, TDEF_HEADER, LVPROP, parse_buffer_custom from .utils import categorize_pages, parse_type, TYPE_MEMO, TYPE_TEXT, TYPE_BOOLEAN, read_db_file, numeric_to_string, \ - TYPE_96_BIT_17_BYTES, TYPE_OLE, SimpleVarLenMetadata + TYPE_96_BIT_17_BYTES, TYPE_OLE from .jetformat import BaseFormat, Jet3Format, PageTypes # Page sizes @@ -226,247 +226,64 @@ def create_empty_table(self): parsed_table[column.col_name_str] = [] #changed to blank array to align to expected type if data was present. 
return parsed_table + def _clean_loc(self, x: int) -> int: + """ + Strip off the high-bit flags (0x8000 = deleted, 0x4000 = overflow) + to get the true 12-bit page offset. + """ + return x & 0x0FFF + + def parse(self): """ - This is the main table parsing function. We go through all of the data pages linked to the table, separate each - data page to rows(records) and parse each record. - :return defaultdict(list) with the parsed data -- table[column][row_index] + Main table parsing function. Iterates data pages, splits into rows, and streams parsed rows. + :return: OrderedDict of parsed columns """ if not self.table.owned_pages: return self.create_empty_table() - for data_chunk in self.table.owned_pages: - original_data = data_chunk - parsed_data = parse_data_page_header(original_data, version=self.version) - - last_offset = None - for rec_offset in parsed_data.record_offsets: - # Deleted row - Just skip it - if rec_offset & 0x8000: - last_offset = rec_offset & 0xfff + + for page_data in self.table.owned_pages: + parsed_page = parse_data_page_header(page_data, version=self.version) + # iterate each slot entry + for row_num, raw_loc in enumerate(parsed_page.record_offsets): + # skip deleted rows + if raw_loc & 0x8000: continue - # Overflow page - if rec_offset & 0x4000: - # overflow ptr is 4 bits flags, 12 bits ptr - rec_ptr_offset = rec_offset & 0xfff - # update last pointer to pointer without flags - last_offset = rec_ptr_offset - # The ptr is the offset in the current data page. we get a 4 byte record_pointer from that - overflow_rec_ptr = original_data[rec_ptr_offset:rec_ptr_offset + 4] - overflow_rec_ptr = struct.unpack(" 3: - field_count = struct.unpack_from("h", record)[0] - record = record[2:] - else: - field_count = struct.unpack_from("b", record)[0] - record = record[1:] - # Records contain null bitmaps for columns. The number of bitmaps is the number of columns / 8 rounded up - - null_table_len = (field_count + 7) // 8 - if null_table_len and null_table_len < len(original_record): - null_table = record[-null_table_len:] - # Turn bitmap to a list of True False values - null_table = [((null_table[i // 8]) & (1 << (i % 8))) != 0 for i in range(len(null_table) * 8)] - else: - LOGGER.error(f"Failed to parse null table column count {field_count}") - return - - relative_records_column_map = {} - # Iterate columns - for i, column in self.columns.items(): - # Fixed length columns are handled before variable length. If this is a variable length column add it to - # mapping and continue - if not column.column_flags.fixed_length: - relative_records_column_map[i] = column - continue - - self._parse_fixed_length_data(record, column, null_table) - if relative_records_column_map: - relative_records_column_map = dict(sorted(relative_records_column_map.items())) - metadata = self._parse_dynamic_length_records_metadata(reverse_record, original_record, - null_table_len) - if not metadata: - return - if metadata.variable_length_field_offsets: - self._parse_dynamic_length_data(original_record, metadata, relative_records_column_map, null_table) - - def _parse_fixed_length_data(self, original_record, column, null_table): - """ - Parse fixed-length data from record - :param original_record: unmodified record - :param column: column this data belongs to - :param null_table: null table of the row - """ - column_name = column.col_name_str - # The null table indicates null values in the row. 
- # The only exception is BOOL fields which are encoded in the null table - has_value = True - if column.column_id > len(null_table): - #new column added after row creation, not covered by null mask, in this case has_value = false - has_value = False - if column.type == TYPE_BOOLEAN: - has_value = None - else: - has_value = null_table[column.column_id] - # Boolean fields are encoded in the null table - if column.type == TYPE_BOOLEAN: - parsed_type = has_value - else: - if column.fixed_offset > len(original_record): - LOGGER.error(f"Column offset is bigger than the length of the record {column.fixed_offset}") - return - record = original_record[column.fixed_offset:] - parsed_type = parse_type(column.type, record, version=self.version, props=column.extra_props or None) - if not has_value: - self.parsed_table[column_name].append(None) - return - self.parsed_table[column_name].append(parsed_type) - def _parse_dynamic_length_records_metadata(self, reverse_record, original_record, null_table_length): - """ - parse the metadata of relative records. The metadata used to parse relative records is found at the end of the - record so reverse_record is used for parsing from the bottom up. - :param reverse_record: original record in reverse - :param original_record: unmodified record - :param null_table_length: - :return: parsed relative record metadata - """ - if self.version > 3: - reverse_record = reverse_record[null_table_length:] - return parse_relative_object_metadata_struct(reverse_record, version=self.version) - # Parse relative metadata. - # Metadata is from the end of the record(reverse_record is used here) - variable_length_jump_table_cnt = (len(original_record) - 1) // 256 - reverse_record = reverse_record[null_table_length:] - try: - relative_record_metadata = parse_relative_object_metadata_struct(reverse_record, - variable_length_jump_table_cnt, - self.version) - # relative_record_metadata = RELATIVE_OBJS.parse(reverse_record) - # we use this offset in original_record so we have to update the length with the null_tables - relative_record_metadata.relative_metadata_end = relative_record_metadata.relative_metadata_end + null_table_length - except ConstructError: - relative_record_metadata = None - LOGGER.error("Failed parsing record") - - if relative_record_metadata and \ - relative_record_metadata.variable_length_field_count != self.table_header.variable_columns: - - # best effort - try to find variable column count in the record and parse from there - # this is limited to the 10 first bytes to reduce false positives. - # most of the time iv'e seen this there was an extra DWORD before the actual metadata - metadata_start = reverse_record.find(bytes([self.table_header.variable_columns])) - if metadata_start != -1 and metadata_start < 10: - reverse_record = reverse_record[metadata_start:] - try: - relative_record_metadata = parse_relative_object_metadata_struct(reverse_record, - variable_length_jump_table_cnt, - self.version) - except ConstructError: - LOGGER.error(f"Failed to parse record metadata: {original_record}") - relative_record_metadata.relative_metadata_end = relative_record_metadata.relative_metadata_end + \ - metadata_start - else: - LOGGER.warning( - f"Record did not parse correctly. 
Number of columns: {self.table_header.variable_columns}" - f" number of parsed columns: {relative_record_metadata.variable_length_field_count}") - return None - return relative_record_metadata - - def _parse_dynamic_length_data(self, original_record, relative_record_metadata, - relative_records_column_map, null_table): - """ - Parse dynamic (non fixed length) records from row - :param original_record: full unmodified record - :param relative_record_metadata: parsed record metadata - :param relative_records_column_map: relative records colum mapping {index: column} - :param null_table: list indicating which columns have null value - """ - relative_offsets = relative_record_metadata.variable_length_field_offsets - jump_table_addition = 0 - for i, column_index in enumerate(relative_records_column_map): - column = relative_records_column_map[column_index] - col_name = column.col_name_str - has_value = True - if column.column_id > len(null_table): - #New column with no data so map to false - has_value = False - else: - has_value = null_table[column.column_id] - if not has_value: - self.parsed_table[col_name].append(None) - continue - - if self.version == 3: - if column.variable_column_number in relative_record_metadata.variable_length_jump_table: - jump_table_addition += 0x100 - rel_start = relative_offsets[column.variable_column_number] - # If this is the last one use var_len_count as end offset - if column.variable_column_number + 1 == len(relative_offsets): - rel_end = relative_record_metadata.var_len_count - else: - rel_end = relative_offsets[column.variable_column_number + 1] + # if all rows deleted + if not self.parsed_table: + return self.create_empty_table() - # if rel_start and rel_end are the same there is no data in this slot - if rel_start == rel_end: - self.parsed_table[col_name].append("") - continue + # reorder columns in output + columns_sorted = OrderedDict(sorted(self.columns.items(), key=lambda t: t[0])) + reordered = OrderedDict( + (col.col_name_str, self.parsed_table[col.col_name_str]) + for _, col in columns_sorted.items() + ) + self.parsed_table = reordered + return self.parsed_table - relative_obj_data = original_record[rel_start + jump_table_addition: rel_end + jump_table_addition] - # Parse types that require column data here, call parse_type on all other types - if column.type == TYPE_MEMO: - try: - parsed_type = self._parse_memo(relative_obj_data) - except ConstructError: - LOGGER.warning("Failed to parse memo field. Using data as bytes") - parsed_type = relative_obj_data - elif column.type == TYPE_OLE: - try: - parsed_type = self._parse_memo(relative_obj_data, return_raw=True) - except ConstructError: - LOGGER.warning("Failed to parse OLE field. 
Using data as bytes") - parsed_type = relative_obj_data - elif column.type == TYPE_96_BIT_17_BYTES: - if len(relative_obj_data) != 17: - LOGGER.warning(f"Relative numeric field has invalid length {len(relative_obj_data)}, expected 17") - parsed_type = relative_obj_data - else: - # Get scale or None - scale = column.get('various', {}).get('scale', 6) - parsed_type = numeric_to_string(relative_obj_data, scale) - else: - parsed_type = parse_type(column.type, relative_obj_data, len(relative_obj_data), version=self.version) - self.parsed_table[col_name].append(parsed_type) def _get_usage_map(self,page_num,row_num): @@ -696,64 +513,55 @@ def _parse_memo(self, relative_obj_data, return_raw=False): memo_data = self._get_overflow_record(parsed_memo.record_pointer) else: LOGGER.debug("LVAL type 2") - if relative_obj_data == b':\x00:\x00:\x00.\x00.\x00.\x00': ###need to review process for LVAL type 2. sometimes works but this example has a record pointer greater than number of records on target page. - print('problem lval') rec_data = self._get_overflow_record(parsed_memo.record_pointer) - next_page = struct.unpack("I", rec_data[:4])[0] - # LVAL2 has data over multiple pages. The first 4 bytes of the page are the next record, then that data. - # Concat the data until we get a 0 next_page. - memo_data = b"" - while next_page: - memo_data += rec_data[4:] - rec_data = self._get_overflow_record(next_page) + #adding a workaround until lval type 2 issue resolved. + if rec_data: next_page = struct.unpack("I", rec_data[:4])[0] - memo_data += rec_data[4:] + # LVAL2 has data over multiple pages. The first 4 bytes of the page are the next record, then that data. + # Concat the data until we get a 0 next_page. + memo_data = b"" + while next_page: + memo_data += rec_data[4:] + rec_data = self._get_overflow_record(next_page) + next_page = struct.unpack("I", rec_data[:4])[0] + memo_data += rec_data[4:] + else: + memo_data = b"" if memo_data: if return_raw: return memo_data parsed_type = parse_type(memo_type, memo_data, len(memo_data), version=self.version) return parsed_type - def _get_overflow_record(self, record_pointer): - """ - Get the actual record from a record pointer - :param record_pointer: - :return: record - """ - record_offset = record_pointer & 0xff - page_num = record_pointer >> 8 - record_page = self._all_pages.get(page_num * self.page_size) - if not record_page: - LOGGER.warning(f"Could not find overflow record data page overflow pointer: {record_pointer}") - return - parsed_data = parse_data_page_header(record_page, version=self.version) - if record_offset > len(parsed_data.record_offsets): - LOGGER.warning("Failed parsing overflow record offset") - return - start = parsed_data.record_offsets[record_offset] - if start & 0x8000: - start = start & 0xfff - else: - LOGGER.debug(f"Overflow record flag is not present {start}") - if record_offset == 0: - record = record_page[start:] - else: - end = parsed_data.record_offsets[record_offset - 1] + def _get_overflow_record(self, record_pointer: int): + slot_index = record_pointer & 0xFF + page_num = record_pointer >> 8 + page_data = self._all_pages.get(page_num * self.page_size) + if page_data is None: + LOGGER.warning(f"Missing overflow page for pointer {record_pointer}") + return None - if end & 0x8000:# and (end & 0xff != 0): ##last byte check removed. stops valid end offsets from being parsed. 
- end = end & 0xfff - record = record_page[start: end] - return record + parsed_page = parse_data_page_header(page_data, version=self.version) + raw_loc = parsed_page.record_offsets[slot_index] + # ─── SLICE OUT THE TRUE ROW ───────────────────────────────────── + start = self._clean_loc(raw_loc) + if slot_index == 0: + end = self.page_size + else: + end = self._clean_loc(parsed_page.record_offsets[slot_index - 1]) + + return page_data[start:end] # starting point for a iterating parser to enable outputs to be streamed to avoid memory overflows. - def _new_parse_row(self, record): + def _parse_row(self, record): """ Reads the row data from the given row buffer. Leaves limit unchanged. :param record: the current row data :return: """ + original_record = record # Records contain null bitmaps for columns. The number of bitmaps is the number of columns / 8 rounded up @@ -761,11 +569,12 @@ def _new_parse_row(self, record): if null_table_len and null_table_len < len(original_record): null_table = record[-null_table_len:] # Turn bitmap to a list of True False values - null_table = [((null_table[i // 8]) & (1 << (i % 8))) != 0 for i in range(len(null_table) * 8)] + null_table = [((null_table[i // 8]) & (1 << (i % 8))) == 0 for i in range(len(null_table) * 8)] else: LOGGER.error(f"Failed to parse null table column count {self.table_header.column_count}") return + if self.version.SIZE_ROW_VAR_COL_OFFSET != 2: jumpColOffsets = self._readJumpTableVarColOffsets(original_record,0,null_table_len) @@ -815,7 +624,7 @@ def _new_parse_row(self, record): varColumnsOffsetPos = (len(original_record) - null_table_len - 4) - (column.variable_column_number * 2) varDataStart = parse_buffer_custom(original_record,varColumnsOffsetPos,'Int16ul') - varDataEnd = parse_buffer_custom(original_record,varColumnsOffsetPos,'Int16ul') + varDataEnd = parse_buffer_custom(original_record,varColumnsOffsetPos-2,'Int16ul') else: @@ -842,7 +651,7 @@ def _new_parse_row(self, record): value = numeric_to_string(data, scale) else: # fallback to gerneral parse_type - value = parse_type(colDataType, data, version=self.version, props=column.extra_props or None) + value = parse_type(colDataType, data, colDataLen, version=self.version, props=column.extra_props or None) self.parsed_table[column_name].append(value) From 1c3760a49f87a5846323abe500bf4143f38b1981 Mon Sep 17 00:00:00 2001 From: Jamie Stooke Date: Fri, 23 May 2025 13:54:28 +0100 Subject: [PATCH 15/17] numeric scale could be in a different property. Fix applied. 
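
The hunk below only changes where the scale is looked up. A rough sketch of the intended fallback order (illustrative only; the `extra_props` and `various` attribute names and the default of 6 are taken from the diff, the helper itself is hypothetical):

```python
# Hypothetical helper mirroring the lookup order used by the fix below.
def resolve_numeric_scale(column, default=6):
    """Prefer extra_props['scale'], then various['scale'], then the default of 6."""
    extra = getattr(column, "extra_props", None) or {}
    various = getattr(column, "various", None) or {}
    return extra.get("scale", various.get("scale", default))
```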
---
 access_parser/access_parser.py | 14 +-------------
 access_parser/utils.py         |  2 --
 2 files changed, 1 insertion(+), 15 deletions(-)

diff --git a/access_parser/access_parser.py b/access_parser/access_parser.py
index 63dc8a7..d1f17a7 100644
--- a/access_parser/access_parser.py
+++ b/access_parser/access_parser.py
@@ -11,18 +11,6 @@
     TYPE_96_BIT_17_BYTES, TYPE_OLE
 from .jetformat import BaseFormat, Jet3Format, PageTypes
 
-# Page sizes
-PAGE_SIZE_V3 = 0x800
-PAGE_SIZE_V4 = 0x1000
-
-# Versions
-VERSION_3 = 0x00
-VERSION_4 = 0x01
-VERSION_5 = 0x02
-VERSION_2010 = 0x03
-
-ALL_VERSIONS = {VERSION_3: 3, VERSION_4: 4, VERSION_5: 5, VERSION_2010: 2010}
-NEW_VERSIONS = [VERSION_4, VERSION_5, VERSION_2010]
 
 SYSTEM_TABLE_FLAGS = [-0x80000000, -0x00000002, 0x80000000, 0x00000002]
 
@@ -647,7 +635,7 @@ def _parse_row(self, record):
                 if colDataType in (TYPE_MEMO,TYPE_OLE):
                     value = self._parse_memo(data, return_raw=(colDataType == TYPE_OLE))
                 elif colDataType == TYPE_96_BIT_17_BYTES:
-                    scale = column.extra_props.get("scale", 6)
+                    scale = column.extra_props.get("scale", column.various.get("scale",6))
                     value = numeric_to_string(data, scale)
                 else:
                     # fallback to gerneral parse_type
                     value = parse_type(colDataType, data, colDataLen, version=self.version, props=column.extra_props or None)
 
                 self.parsed_table[column_name].append(value)
 
diff --git a/access_parser/utils.py b/access_parser/utils.py
index 3c8de34..55c9a10 100644
--- a/access_parser/utils.py
+++ b/access_parser/utils.py
@@ -4,8 +4,6 @@
 import uuid
 import math
 from datetime import datetime, timedelta
-from dataclasses import dataclass
-from typing import List
 
 LOGGER = logging.getLogger("access_parser.utils")
 

From c2e760e1bed16592d6fb55c96d2e1bf8534d56a4 Mon Sep 17 00:00:00 2001
From: Jamie Stooke
Date: Tue, 24 Jun 2025 16:36:25 +0100
Subject: [PATCH 16/17] Limiting to 12-bit output was stripping off the
 pointer if all rows prior to the current row were deleted. Without that
 pointer the end offset would return 0, which would typically be less than
 start, and a blank row object would be parsed rather than the actual data.
 With the extra bit, offsets like 53248 return 4096 (the page limit) instead,
 which is what we're after.
---
 access_parser/access_parser.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/access_parser/access_parser.py b/access_parser/access_parser.py
index d1f17a7..913ab70 100644
--- a/access_parser/access_parser.py
+++ b/access_parser/access_parser.py
@@ -217,9 +217,9 @@ def create_empty_table(self):
     def _clean_loc(self, x: int) -> int:
         """
         Strip off the high-bit flags (0x8000 = deleted, 0x4000 = overflow)
-        to get the true 12-bit page offset.
+        to get the true 13-bit page offset.
         """
-        return x & 0x0FFF
+        return x & 0x1FFF
 
 
     def parse(self):

From 21995608621fd9be978711f2e82f3a1bd48d069f Mon Sep 17 00:00:00 2001
From: Jamie Stooke
Date: Tue, 24 Jun 2025 16:36:59 +0100
Subject: [PATCH 17/17] switched to dict mapping for pages for better
 visibility while debugging.
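
Roughly, the idea is that the page number now travels with its raw bytes instead of being dropped after lookup (a sketch with made-up values; the real mapping is built from `_all_pages` and the usage map as in the hunk below):

```python
# Illustrative values only; real pages come from the .mdb/.accdb file.
PAGE_SIZE = 4096                                   # Jet 4 page size
all_pages = {8192: b"\x01\x01" + b"\x00" * 4094}   # file offset -> raw page bytes

owned_pages_map = [2]                              # page numbers from the usage map
# Before: owned_pages = [all_pages[pn * PAGE_SIZE] for pn in owned_pages_map]
# After:  the page number stays attached to its data, which is easier to inspect.
owned_pages = {pn: all_pages[pn * PAGE_SIZE] for pn in owned_pages_map}

for page_num, page_data in owned_pages.items():
    print(page_num, len(page_data))                # prints "2 4096"
```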
--- access_parser/access_parser.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/access_parser/access_parser.py b/access_parser/access_parser.py index 913ab70..cad7b14 100644 --- a/access_parser/access_parser.py +++ b/access_parser/access_parser.py @@ -28,8 +28,8 @@ def __init__(self, offset, val): self.value = val self.offset = offset self.linked_pages = [] - self.owned_pages = [] - self.free_space_pages = [] + self.owned_pages = {} + self.free_space_pages = {} class AccessParser(object): @@ -230,7 +230,7 @@ def parse(self): if not self.table.owned_pages: return self.create_empty_table() - for page_data in self.table.owned_pages: + for page_num, page_data in self.table.owned_pages.items(): parsed_page = parse_data_page_header(page_data, version=self.version) # iterate each slot entry for row_num, raw_loc in enumerate(parsed_page.record_offsets): @@ -412,10 +412,10 @@ def _get_table_columns(self): #add usage maps from table referenced by table head #The catalog level linked pages array can be out of date following deletes. so use table header info to find accurate usage maps. owned_pages_map = self._get_usage_map(table_header.row_page_map_page_number,table_header.row_page_map_row_number) - self.table.owned_pages = [self._all_pages[pn * self.page_size] for pn in owned_pages_map] + self.table.owned_pages = {pn: self._all_pages[pn * self.page_size] for pn in owned_pages_map} free_space_pages_map = self._get_usage_map(table_header.free_space_page_map_page_number,table_header.free_space_page_map_row_number) - self.table.free_space_pages = [self._all_pages[pn * self.page_size] for pn in free_space_pages_map] + self.table.free_space_pages = {pn: self._all_pages[pn * self.page_size] for pn in free_space_pages_map} # Merge Data back to table_header
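
A quick worked example of the mask widening from PATCH 16, using the offset quoted in that commit message (illustrative only, not part of any diff):

```python
raw_loc = 53248             # 0xD000: 0x8000 (deleted) | 0x4000 (overflow) | 0x1000 (offset bits)
print(raw_loc & 0x0FFF)     # 0    -> the old 12-bit mask drops the 0x1000 bit entirely
print(raw_loc & 0x1FFF)     # 4096 -> the new 13-bit mask keeps it (the Jet 4 page boundary)
```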