diff --git a/README.md b/README.md index 678a7c1..e9e8b1c 100644 --- a/README.md +++ b/README.md @@ -6,9 +6,19 @@ Use pip: `pip install access-parser` Or install manually: ```bash +# Clone the repository git clone https://github.com/ClarotyICS/access_parser.git cd access_parser -python3 setup.py install + +# Create a virtual environment (recommended) +python3 -m venv .venv # On Windows use: py -m venv .venv +source .venv/bin/activate # On Windows use: .venv\Scripts\activate + +# Install using pip (modern approach) +pip install . + +# Verify installation +python -c "import access_parser; print('Installed successfully')" ``` # Demo diff --git a/access_parser/access_parser.py b/access_parser/access_parser.py index 2cdfc09..cad7b14 100644 --- a/access_parser/access_parser.py +++ b/access_parser/access_parser.py @@ -1,29 +1,24 @@ import logging import struct -from collections import defaultdict +from collections import defaultdict, OrderedDict from construct import ConstructError from tabulate import tabulate from .parsing_primitives import parse_relative_object_metadata_struct, parse_table_head, parse_data_page_header, \ - ACCESSHEADER, MEMO, parse_table_data, TDEF_HEADER, LVPROP + ACCESSHEADER, MEMO, parse_table_data, TDEF_HEADER, LVPROP, parse_buffer_custom from .utils import categorize_pages, parse_type, TYPE_MEMO, TYPE_TEXT, TYPE_BOOLEAN, read_db_file, numeric_to_string, \ - TYPE_96_bit_17_BYTES, TYPE_OLE + TYPE_96_BIT_17_BYTES, TYPE_OLE +from .jetformat import BaseFormat, Jet3Format, PageTypes -# Page sizes -PAGE_SIZE_V3 = 0x800 -PAGE_SIZE_V4 = 0x1000 -# Versions -VERSION_3 = 0x00 -VERSION_4 = 0x01 -VERSION_5 = 0x02 -VERSION_2010 = 0x03 +SYSTEM_TABLE_FLAGS = [-0x80000000, -0x00000002, 0x80000000, 0x00000002] -ALL_VERSIONS = {VERSION_3: 3, VERSION_4: 4, VERSION_5: 5, VERSION_2010: 2010} -NEW_VERSIONS = [VERSION_4, VERSION_5, VERSION_2010] +# top‐2‐bit mask and length mask (30 bits) +LONG_VALUE_TYPE_MASK = 0xC0000000 +LONG_VALUE_LENGTH_MASK = ~LONG_VALUE_TYPE_MASK & 0xFFFFFFFF -SYSTEM_TABLE_FLAGS = [-0x80000000, -0x00000002, 0x80000000, 0x00000002] +MAX_BYTE = 256 LOGGER = logging.getLogger("access_parser") @@ -33,11 +28,16 @@ def __init__(self, offset, val): self.value = val self.offset = offset self.linked_pages = [] + self.owned_pages = {} + self.free_space_pages = {} class AccessParser(object): def __init__(self, db_path): - self.db_data = read_db_file(db_path) + if isinstance(db_path, bytes): # allow to pass bytes object e.g. downloaded from cloud storage + self.db_data = db_path + else: + self.db_data = read_db_file(db_path) self._parse_file_header(self.db_data) self._table_defs, self._data_pages, self._all_pages = categorize_pages(self.db_data, self.page_size) self._tables_with_data = self._link_tables_to_data() @@ -55,32 +55,28 @@ def parse_msys_table(self): msys_table['LvProp']) if value} return table_to_lval_memo - def _parse_file_header(self, db_data): + def _parse_file_header(self, db_data: bytes) -> None: """ - Parse the basic file header and determine the Access DB version based on the parsing results. - :param db_data: db file data + Inspect the first HEADER_LENGTH bytes of db_data, + detect the correct Jet/ACE format, and set: + - self.version : full format helper + - self.page_size : pulled from the format """ + # grab exactly the bytes we need + header_buf = db_data[:BaseFormat.HEADER_LENGTH] + + # 1) figure out which Format subclass applies try: - head = ACCESSHEADER.parse(db_data) - except ConstructError: - # This is a very minimal parsing of the header. 
If we fail this probable is not a valid mdb file
-            raise ValueError("Failed to parse DB file header. Check it is a valid access database")
-        version = head.jet_version
-        if version in NEW_VERSIONS:
-            if version == VERSION_4:
-                self.version = ALL_VERSIONS[VERSION_4]
-            elif version == VERSION_5:
-                self.version = ALL_VERSIONS[VERSION_5]
-            elif version == VERSION_2010:
-                self.version = ALL_VERSIONS[VERSION_2010]
-            self.page_size = PAGE_SIZE_V4
+            fmt = BaseFormat.get_format_from_header(header_buf)
+        except ValueError as ve:
+            LOGGER.error(f"{ve}; defaulting to Jet3Format")
+            fmt = Jet3Format()
-        else:
-            if not version == VERSION_3:
-                LOGGER.error(f"Unknown database version {version} Trying to parse database as version 3")
-            self.version = ALL_VERSIONS[VERSION_3]
-            self.page_size = PAGE_SIZE_V3
-        LOGGER.info(f"DataBase version {version}")
+        # 2) stash it on self for everything else to use
+        self.version = fmt
+        self.page_size = fmt.page_size
+
+        LOGGER.info(f"Detected Access format: Jet{self.version}, page_size={self.page_size}")
 
     def _link_tables_to_data(self):
         """
@@ -110,7 +106,7 @@ def _parse_catalog(self):
         :return: dict {table : offset}
         """
         catalog_page = self._tables_with_data[2 * self.page_size]
-        access_table = AccessTable(catalog_page, self.version, self.page_size, self._data_pages, self._table_defs)
+        access_table = AccessTable(catalog_page, self.version, self.page_size, self._data_pages, self._table_defs, self._all_pages)
         catalog = access_table.parse()
         tables_mapping = {}
         for i, table_name in enumerate(catalog['Name']):
@@ -148,7 +144,7 @@ def get_table(self, table_name):
         if table_name != "MSysObjects" and table_name in self.extra_props:
             props = self.extra_props[table_name]
 
-        return AccessTable(table, self.version, self.page_size, self._data_pages, self._table_defs, props)
+        return AccessTable(table, self.version, self.page_size, self._data_pages, self._table_defs, self._all_pages, props)
 
     def parse_lvprop(self, lvprop_raw):
         try:
@@ -200,12 +196,13 @@ def print_database(self):
 
 class AccessTable(object):
-    def __init__(self, table, version, page_size, data_pages, table_defs, props=None):
+    def __init__(self, table, version, page_size, data_pages, table_defs, all_pages, props=None):
         self.version = version
         self.props = props
         self.page_size = page_size
         self._data_pages = data_pages
         self._table_defs = table_defs
+        self._all_pages = all_pages
         self.table = table
         self.parsed_table = defaultdict(list)
         self.columns, self.primary_keys, self.table_header = self._get_table_columns()
@@ -214,238 +211,184 @@ def create_empty_table(self):
         parsed_table = defaultdict(list)
         columns, *_ = self._get_table_columns()
         for i, column in columns.items():
-            parsed_table[column.col_name_str] = ""
+            parsed_table[column.col_name_str] = []  # empty list so the type matches what parse() returns when data is present
         return parsed_table
 
+    def _clean_loc(self, x: int) -> int:
+        """
+        Strip off the high-bit flags (0x8000 = deleted, 0x4000 = overflow)
+        to get the true 13-bit page offset.
+        """
+        return x & 0x1FFF
+
+
     def parse(self):
         """
-        This is the main table parsing function. We go through all of the data pages linked to the table, separate each
-        data page to rows(records) and parse each record.
-        :return defaultdict(list) with the parsed data -- table[column][row_index]
+        Main table parsing function. Iterates the table's owned data pages, splits each page into records, and
+        parses each record into self.parsed_table.
+ :return: OrderedDict of parsed columns """ - if not self.table.linked_pages: + if not self.table.owned_pages: return self.create_empty_table() - for data_chunk in self.table.linked_pages: - original_data = data_chunk - parsed_data = parse_data_page_header(original_data, version=self.version) - - last_offset = None - for rec_offset in parsed_data.record_offsets: - # Deleted row - Just skip it - if rec_offset & 0x8000: - last_offset = rec_offset & 0xfff + + for page_num, page_data in self.table.owned_pages.items(): + parsed_page = parse_data_page_header(page_data, version=self.version) + # iterate each slot entry + for row_num, raw_loc in enumerate(parsed_page.record_offsets): + # skip deleted rows + if raw_loc & 0x8000: continue - # Overflow page - if rec_offset & 0x4000: - # overflow ptr is 4 bits flags, 12 bits ptr - rec_ptr_offset = rec_offset & 0xfff - # update last pointer to pointer without flags - last_offset = rec_ptr_offset - # The ptr is the offset in the current data page. we get a 4 byte record_pointer from that - overflow_rec_ptr = original_data[rec_ptr_offset:rec_ptr_offset + 4] - overflow_rec_ptr = struct.unpack(" 3: - field_count = struct.unpack_from("h", record)[0] - record = record[2:] - else: - field_count = struct.unpack_from("b", record)[0] - record = record[1:] - - relative_records_column_map = {} - # Iterate columns - for i, column in self.columns.items(): - # Fixed length columns are handled before variable length. If this is a variable length column add it to - # mapping and continue - if not column.column_flags.fixed_length: - relative_records_column_map[i] = column - continue - - self._parse_fixed_length_data(record, column, null_table) - if relative_records_column_map: - relative_records_column_map = dict(sorted(relative_records_column_map.items())) - metadata = self._parse_dynamic_length_records_metadata(reverse_record, original_record, - null_table_len) - if not metadata: - return - if metadata.variable_length_field_offsets: - self._parse_dynamic_length_data(original_record, metadata, relative_records_column_map, null_table) + # if all rows deleted + if not self.parsed_table: + return self.create_empty_table() - def _parse_fixed_length_data(self, original_record, column, null_table): - """ - Parse fixed-length data from record - :param original_record: unmodified record - :param column: column this data belongs to - :param null_table: null table of the row - """ - column_name = column.col_name_str - # The null table indicates null values in the row. - # The only exception is BOOL fields which are encoded in the null table - has_value = True - if column.column_id > len(null_table): - LOGGER.warning("Invalid null table. 
Bool values may be wrong, deleted values may be shown in the db.") - if column.type == TYPE_BOOLEAN: - has_value = None - else: - has_value = null_table[column.column_id] - # Boolean fields are encoded in the null table - if column.type == TYPE_BOOLEAN: - parsed_type = has_value - else: - if column.fixed_offset > len(original_record): - LOGGER.error(f"Column offset is bigger than the length of the record {column.fixed_offset}") - return - record = original_record[column.fixed_offset:] - parsed_type = parse_type(column.type, record, version=self.version, props=column.extra_props or None) - if not has_value: - self.parsed_table[column_name].append(None) - return - self.parsed_table[column_name].append(parsed_type) + # reorder columns in output + columns_sorted = OrderedDict(sorted(self.columns.items(), key=lambda t: t[0])) + reordered = OrderedDict( + (col.col_name_str, self.parsed_table[col.col_name_str]) + for _, col in columns_sorted.items() + ) + self.parsed_table = reordered + return self.parsed_table - def _parse_dynamic_length_records_metadata(self, reverse_record, original_record, null_table_length): - """ - parse the metadata of relative records. The metadata used to parse relative records is found at the end of the - record so reverse_record is used for parsing from the bottom up. - :param reverse_record: original record in reverse - :param original_record: unmodified record - :param null_table_length: - :return: parsed relative record metadata - """ - if self.version > 3: - reverse_record = reverse_record[null_table_length:] - return parse_relative_object_metadata_struct(reverse_record, version=self.version) - # Parse relative metadata. - # Metadata is from the end of the record(reverse_record is used here) - variable_length_jump_table_cnt = (len(original_record) - 1) // 256 - reverse_record = reverse_record[null_table_length:] - try: - relative_record_metadata = parse_relative_object_metadata_struct(reverse_record, - variable_length_jump_table_cnt, - self.version) - # relative_record_metadata = RELATIVE_OBJS.parse(reverse_record) - # we use this offset in original_record so we have to update the length with the null_tables - relative_record_metadata.relative_metadata_end = relative_record_metadata.relative_metadata_end + null_table_length - except ConstructError: - relative_record_metadata = None - LOGGER.error("Failed parsing record") - - if relative_record_metadata and \ - relative_record_metadata.variable_length_field_count != self.table_header.variable_columns: - - # best effort - try to find variable column count in the record and parse from there - # this is limited to the 10 first bytes to reduce false positives. - # most of the time iv'e seen this there was an extra DWORD before the actual metadata - metadata_start = reverse_record.find(bytes([self.table_header.variable_columns])) - if metadata_start != -1 and metadata_start < 10: - reverse_record = reverse_record[metadata_start:] - try: - relative_record_metadata = parse_relative_object_metadata_struct(reverse_record, - variable_length_jump_table_cnt, - self.version) - except ConstructError: - LOGGER.error(f"Failed to parse record metadata: {original_record}") - relative_record_metadata.relative_metadata_end = relative_record_metadata.relative_metadata_end + \ - metadata_start - else: - LOGGER.warning( - f"Record did not parse correctly. 
Number of columns: {self.table_header.variable_columns}" - f" number of parsed columns: {relative_record_metadata.variable_length_field_count}") - return None - return relative_record_metadata - - def _parse_dynamic_length_data(self, original_record, relative_record_metadata, - relative_records_column_map, null_table): - """ - Parse dynamic (non fixed length) records from row - :param original_record: full unmodified record - :param relative_record_metadata: parsed record metadata - :param relative_records_column_map: relative records colum mapping {index: column} - :param null_table: list indicating which columns have null value - """ - relative_offsets = relative_record_metadata.variable_length_field_offsets - jump_table_addition = 0 - for i, column_index in enumerate(relative_records_column_map): - column = relative_records_column_map[column_index] - col_name = column.col_name_str - has_value = True - if column.column_id > len(null_table): - LOGGER.warning("Invalid null table. null values may be shown in the db.") - else: - has_value = null_table[column.column_id] - if not has_value: - self.parsed_table[col_name].append(None) - continue - if self.version == 3: - if i in relative_record_metadata.variable_length_jump_table: - jump_table_addition += 0x100 - rel_start = relative_offsets[i] - # If this is the last one use var_len_count as end offset - if i + 1 == len(relative_offsets): - rel_end = relative_record_metadata.var_len_count - else: - rel_end = relative_offsets[i + 1] - # if rel_start and rel_end are the same there is no data in this slot - if rel_start == rel_end: - self.parsed_table[col_name].append("") - continue + def _get_usage_map(self,page_num,row_num): + + ##Need to define a version config + OFFSET_MASK = 0x1FFF + INVALID_PAGE_NUMBER = -1 + MAP_TYPE_INLINE = 0 + MAP_TYPE_REFERENCE = 1 + + #get page containing usage map info + table_buffer = self._all_pages[page_num*self.page_size] + + #prepare offsets to pick relevant info from table buffer + row_start_offset = self.version.OFFSET_ROW_START + (self.version.SIZE_ROW_LOCATION * row_num) + row_end_offset = self.version.OFFSET_ROW_START + (self.version.SIZE_ROW_LOCATION * (row_num - 1)) + + #find row start + row_start = parse_buffer_custom(table_buffer,row_start_offset,'Int16ul') & OFFSET_MASK + + #find row end + row_end = self.page_size if row_num == 0 else parse_buffer_custom(table_buffer,row_end_offset,'Int16ul') & OFFSET_MASK + + #limit buffer + table_buffer = table_buffer[:row_end] + + #map type + map_type = parse_buffer_custom(table_buffer,row_start,'Int8ul') + + #offset start + um_start_offset = row_start + self.version.OFFSET_USAGE_MAP_START + + if map_type == MAP_TYPE_INLINE: + ## Usage map whose map is written inline in the same page. For Jet4, this + ## type of map can usually contains a maximum of 512 pages. Free space maps + ## are always inline, used space maps may be inline or reference. It has a + ## start page, which all page numbers in its map are calculated as starting + ## from. 
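+            ## Illustrative example of the bitmap decoding below (values are
+            ## made up for the sketch): with start_page = 100, a map byte
+            ## 0x05 at map offset 0 has bits 0 and 2 set, so the table owns
+            ## pages 100 + 0 and 100 + 2, i.e. pages 100 and 102. Each map
+            ## byte therefore covers 8 consecutive page numbers counted from
+            ## start_page.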
+ + ##inline handler processing + max_inline_pages = (row_end - um_start_offset) * 8 + start_page = parse_buffer_custom(table_buffer,row_start+1,'Int32ul') + end_page = start_page + max_inline_pages + + ##process page array + filtered_buffer = table_buffer[um_start_offset:] + filtered_buffer_size = len(filtered_buffer) + page_numbers = [] + byteCount = 0 + + while byteCount < filtered_buffer_size: + b = filtered_buffer[byteCount:byteCount+1] + if b != b'\x00': + for i in range(8): + if ((int.from_bytes(b,'big') & (1 << i)) != 0): + pageNumberOffset = (byteCount * 8 + i) + pageNumber = (start_page + pageNumberOffset) if (pageNumberOffset >= 0) else INVALID_PAGE_NUMBER + if pageNumber < start_page or pageNumber > end_page: + #invalid page number + break + page_numbers.append(pageNumber) + byteCount += 1 + + return page_numbers + + elif map_type == MAP_TYPE_REFERENCE: + ## Usage map whose map is written across one or more entire separate pages + ## of page type USAGE_MAP. For Jet4, this type of map can contain 32736 + ## pages per reference page, and a maximum of 17 reference map pages for a + ## total maximum of 556512 pages (2 GB). + + ##reference handler processing + + max_pages_per_usage_map_page = ((self.version.page_size - self.version.OFFSET_USAGE_MAP_PAGE_DATA) * 8) + num_usage_pages = int((row_end - row_start - 1) / 4) + um_start_offset = self.version.OFFSET_USAGE_MAP_START + + start_page = 0 + end_page = (num_usage_pages * max_pages_per_usage_map_page) + + # there is no "start page" for a reference usage map, so we get an + # extra page reference on top of the number of page references that fit + # in the table + page_numbers = [] + for i in range(num_usage_pages): + map_page_pointer_offset = row_start + self.version.OFFSET_REFERENCE_MAP_PAGE_NUMBERS + (i * 4) + map_page_num = parse_buffer_custom(table_buffer,map_page_pointer_offset,'Int32ul') + if map_page_num > 0: + map_page_buffer = self._all_pages[map_page_num*self.version.page_size] + page_type = map_page_buffer[0] + if page_type != PageTypes.USAGE_MAP: + LOGGER.error(f"Looking for usage map at page {map_page_num}, but page type is {page_type}") + return + filtered_buffer = map_page_buffer[self.version.OFFSET_USAGE_MAP_PAGE_DATA:] + + #Process map + buffer_start_page = (max_pages_per_usage_map_page * i) + + filtered_buffer_size = len(filtered_buffer) + + byteCount = 0 + + while byteCount < filtered_buffer_size: + b = filtered_buffer[byteCount:byteCount+1] + if b != b'\x00': + for i in range(8): + if ((int.from_bytes(b,'big') & (1 << i)) != 0): + pageNumberOffset = (byteCount * 8 + i) + buffer_start_page + pageNumber = (start_page + pageNumberOffset) if (pageNumberOffset >= 0) else INVALID_PAGE_NUMBER + if pageNumber < start_page or pageNumber > end_page: + #invalid page number + break + page_numbers.append(pageNumber) + byteCount += 1 + + return page_numbers - relative_obj_data = original_record[rel_start + jump_table_addition: rel_end + jump_table_addition] - # Parse types that require column data here, call parse_type on all other types - if column.type == TYPE_MEMO: - try: - parsed_type = self._parse_memo(relative_obj_data) - except ConstructError: - LOGGER.warning("Failed to parse memo field. Using data as bytes") - parsed_type = relative_obj_data - elif column.type == TYPE_OLE: - try: - parsed_type = self._parse_memo(relative_obj_data, return_raw=True) - except ConstructError: - LOGGER.warning("Failed to parse OLE field. 
Using data as bytes") - parsed_type = relative_obj_data - elif column.type == TYPE_96_bit_17_BYTES: - if len(relative_obj_data) != 17: - LOGGER.warning(f"Relative numeric field has invalid length {len(relative_obj_data)}, expected 17") - parsed_type = relative_obj_data - else: - # Get scale or None - scale = column.get('various', {}).get('scale', 6) - parsed_type = numeric_to_string(relative_obj_data, scale) - else: - parsed_type = parse_type(column.type, relative_obj_data, len(relative_obj_data), version=self.version) - self.parsed_table[col_name].append(parsed_type) def _get_table_columns(self): """ @@ -465,7 +408,18 @@ def _get_table_columns(self): version=self.version, ) + + #add usage maps from table referenced by table head + #The catalog level linked pages array can be out of date following deletes. so use table header info to find accurate usage maps. + owned_pages_map = self._get_usage_map(table_header.row_page_map_page_number,table_header.row_page_map_row_number) + self.table.owned_pages = {pn: self._all_pages[pn * self.page_size] for pn in owned_pages_map} + + free_space_pages_map = self._get_usage_map(table_header.free_space_page_map_page_number,table_header.free_space_page_map_row_number) + self.table.free_space_pages = {pn: self._all_pages[pn * self.page_size] for pn in free_space_pages_map} + + # Merge Data back to table_header + table_header['index'] = parsed_data['real_index'] table_header['column'] = parsed_data['column'] table_header['column_names'] = parsed_data['column_names'] table_header['real_index_2'] = parsed_data['real_index_2'] @@ -489,11 +443,14 @@ def _get_table_columns(self): # create a dict of index to column to make it easier to access. offset is used to make this zero based offset = min(x.column_index for x in columns) column_dict = {x.column_index - offset: x for x in columns} + # If column index is not unique try best effort if len(column_dict) != len(columns): # create a dict of id to column to make it easier to access column_dict = {x.column_id: x for x in columns} + column_dict = OrderedDict(sorted(column_dict.items())) + # Add the extra properties relevant for the column if self.props: for i, col in column_dict.items(): @@ -545,47 +502,176 @@ def _parse_memo(self, relative_obj_data, return_raw=False): else: LOGGER.debug("LVAL type 2") rec_data = self._get_overflow_record(parsed_memo.record_pointer) - next_page = struct.unpack("I", rec_data[:4])[0] - # LVAL2 has data over multiple pages. The first 4 bytes of the page are the next record, then that data. - # Concat the data until we get a 0 next_page. - memo_data = b"" - while next_page: - memo_data += rec_data[4:] - rec_data = self._get_overflow_record(next_page) + #adding a workaround until lval type 2 issue resolved. + if rec_data: next_page = struct.unpack("I", rec_data[:4])[0] - memo_data += rec_data[4:] + # LVAL2 has data over multiple pages. The first 4 bytes of the page are the next record, then that data. + # Concat the data until we get a 0 next_page. 
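+                # Illustrative walk of the loop below (hypothetical pages):
+                # record A holds (next=B, data1) and record B holds
+                # (next=0, data2), so the memo assembles as data1 + data2.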
+                memo_data = b""
+                while next_page:
+                    memo_data += rec_data[4:]
+                    rec_data = self._get_overflow_record(next_page)
+                    next_page = struct.unpack("I", rec_data[:4])[0]
+                memo_data += rec_data[4:]
+            else:
+                memo_data = b""
         if memo_data:
             if return_raw:
                 return memo_data
             parsed_type = parse_type(memo_type, memo_data, len(memo_data), version=self.version)
             return parsed_type
 
-    def _get_overflow_record(self, record_pointer):
+    def _get_overflow_record(self, record_pointer: int):
+        slot_index = record_pointer & 0xFF
+        page_num = record_pointer >> 8
+        page_data = self._all_pages.get(page_num * self.page_size)
+        if page_data is None:
+            LOGGER.warning(f"Missing overflow page for pointer {record_pointer}")
+            return None
+
+        parsed_page = parse_data_page_header(page_data, version=self.version)
+        raw_loc = parsed_page.record_offsets[slot_index]
+
+        # slice out the true row: from this slot's offset up to the previous slot's offset (or the page end for slot 0)
+        start = self._clean_loc(raw_loc)
+        if slot_index == 0:
+            end = self.page_size
+        else:
+            end = self._clean_loc(parsed_page.record_offsets[slot_index - 1])
+
+        return page_data[start:end]
+
+
+    # starting point for an iterating parser, to enable outputs to be streamed and avoid memory overflows.
+    def _parse_row(self, record):
         """
-        Get the actual record from a record pointer
-        :param record_pointer:
-        :return: record
+        Read the row data from the given row buffer and append the parsed values to self.parsed_table.
+        :param record: the current row data
+        :return: None; results accumulate in self.parsed_table
         """
-        record_offset = record_pointer & 0xff
-        page_num = record_pointer >> 8
-        record_page = self._data_pages.get(page_num * self.page_size)
-        if not record_page:
-            LOGGER.warning(f"Could not find overflow record data page overflow pointer: {record_pointer}")
-            return
-        parsed_data = parse_data_page_header(record_page, version=self.version)
-        if record_offset > len(parsed_data.record_offsets):
-            LOGGER.warning("Failed parsing overflow record offset")
-            return
-        start = parsed_data.record_offsets[record_offset]
-        if start & 0x8000:
-            start = start & 0xfff
-        else:
-            LOGGER.debug(f"Overflow record flag is not present {start}")
-        if record_offset == 0:
-            record = record_page[start:]
+
+        original_record = record
+
+        # Records contain null bitmaps for columns. The number of bitmap bytes is the number of columns / 8, rounded up
+        null_table_len = (self.table_header.column_count + 7) // 8
+        if null_table_len and null_table_len < len(original_record):
+            null_table = record[-null_table_len:]
+            # Turn the bitmap into a list of True/False values
+            null_table = [((null_table[i // 8]) & (1 << (i % 8))) == 0 for i in range(len(null_table) * 8)]
         else:
-            end = parsed_data.record_offsets[record_offset - 1]
-            if end & 0x8000 and (end & 0xff != 0):
-                end = end & 0xfff
-            record = record_page[start: end]
-        return record
+            LOGGER.error(f"Failed to parse null table column count {self.table_header.column_count}")
+            return
+
+        if self.version.SIZE_ROW_VAR_COL_OFFSET != 2:
+            jumpColOffsets = self._readJumpTableVarColOffsets(original_record, 0, null_table_len)
+
+        for i, column in self.columns.items():
+
+            # get column name
+            column_name = column.col_name_str
+
+            # check nullmask
+            isNull = True if column.column_id >= self.table_header.column_count else null_table[column.column_id]
+
+            # Boolean fields are encoded in the null table
+            if column.type == TYPE_BOOLEAN:
+                self.parsed_table[column_name].append(isNull)
+                continue
+
+            # remaining columns marked as null in the nullmask are recorded as None
+            if isNull:
+                self.parsed_table[column_name].append(None)
+                continue
+
+            # prep variables for column parsing
+            rowStart = 0
+            colDataPos = 0
+            colDataLen = 0
+            colDataType = column.type
+
+            # if fixed length
+            if column.column_flags.fixed_length:
+
+                # identify fixed length variables
+                dataStart = rowStart + self.version.OFFSET_COLUMN_FIXED_DATA_ROW_OFFSET
+                colDataPos = dataStart + column.fixed_offset
+                colDataLen = column.length
+
+            # if variable length
+            else:
+
+                varDataStart = None
+                varDataEnd = None
+
+                if self.version.SIZE_ROW_VAR_COL_OFFSET == 2:
+
+                    # read simple var length value
+                    varColumnsOffsetPos = (len(original_record) - null_table_len - 4) - (column.variable_column_number * 2)
+
+                    varDataStart = parse_buffer_custom(original_record, varColumnsOffsetPos, 'Int16ul')
+                    varDataEnd = parse_buffer_custom(original_record, varColumnsOffsetPos - 2, 'Int16ul')
+
+                else:
+
+                    # read jump-table based var length values
+                    varDataStart = jumpColOffsets[column.variable_column_number]
+                    varDataEnd = jumpColOffsets[column.variable_column_number + 1]
+
+                # prepare variable length get
+                colDataPos = rowStart + varDataStart
+                colDataLen = varDataEnd - varDataStart
+
+            if colDataLen <= 0:
+                # empty string / zero-length value
+                self.parsed_table[column_name].append("" if colDataType == TYPE_TEXT else b"")
+                continue
+
+            data = original_record[colDataPos:colDataPos + colDataLen]
+
+            # dispatch on the column type
+            if colDataType in (TYPE_MEMO, TYPE_OLE):
+                value = self._parse_memo(data, return_raw=(colDataType == TYPE_OLE))
+            elif colDataType == TYPE_96_BIT_17_BYTES:
+                scale = column.extra_props.get("scale", column.various.get("scale", 6))
+                value = numeric_to_string(data, scale)
+            else:
+                # fall back to the general parse_type
+                value = parse_type(colDataType, data, colDataLen, version=self.version, props=column.extra_props or None)
+
+            self.parsed_table[column_name].append(value)
+
+
+    def _readJumpTableVarColOffsets(self, buffer, rowStart, nullMaskSize):
+
+        # calculate offsets using jump-table info
+        rowEnd = rowStart + len(buffer) - 1
+        numVarCols = buffer[rowEnd - nullMaskSize]
+
+        varColOffsets = [0] * (numVarCols + 1)
+
+        rowLen = rowEnd - rowStart + 1
+        numJumps = (rowLen - 1) // MAX_BYTE
+        colOffset = rowEnd - nullMaskSize - numJumps - 1
+
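+        # Illustrative example of the decoding below (values made up for the
+        # sketch): Jet3 stores each variable-column offset as a single byte,
+        # so offsets past 255 need the jump table. A column whose stored low
+        # byte is 0x10, read after two jump entries have been consumed,
+        # decodes to 2 * MAX_BYTE + 0x10 = 0x210.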
+        # if the last jump is a dummy value, ignore it
+        if ((colOffset - rowStart - numVarCols) // MAX_BYTE) < numJumps:
+            numJumps -= 1
+
+        jumpsUsed = 0
+        # Fill in each of the varColOffsets entries
+        for i in range(numVarCols + 1):
+            # Skip ahead in the jump table as long as the next jump byte equals i
+            while (jumpsUsed < numJumps and
+                   buffer[rowEnd - nullMaskSize - jumpsUsed - 1] == i):
+                jumpsUsed += 1
+
+            # The low-order part of the offset is at colOffset - i
+            low = buffer[colOffset - i]
+            # The high-order part is jumpsUsed * MAX_BYTE
+            varColOffsets[i] = low + (jumpsUsed * MAX_BYTE)
+
+        return varColOffsets
\ No newline at end of file
diff --git a/access_parser/jetformat.py b/access_parser/jetformat.py
new file mode 100644
index 0000000..ab0d30e
--- /dev/null
+++ b/access_parser/jetformat.py
@@ -0,0 +1,671 @@
+
+
+import enum
+import sys
+import locale
+from typing import ClassVar, List, Optional, Set, Type
+
+
+class CodecType(enum.Enum):
+    NONE = enum.auto()
+    JET = enum.auto()
+    MSISAM = enum.auto()
+    OFFICE = enum.auto()
+
+
+class DataType(enum.Enum):
+    BOOLEAN = enum.auto()
+    BYTE = enum.auto()
+    INT = enum.auto()
+    LONG = enum.auto()
+    FLOAT = enum.auto()
+    DOUBLE = enum.auto()
+    GUID = enum.auto()
+    SHORT_DATE_TIME = enum.auto()
+    MONEY = enum.auto()
+    NUMERIC = enum.auto()
+    TEXT = enum.auto()
+    MEMO = enum.auto()
+    BIG_INT = enum.auto()
+    EXT_DATE_TIME = enum.auto()
+    COMPLEX_TYPE = enum.auto()
+
+
+class PageTypes:
+    INVALID = 0
+    DATA = 1
+    TABLE_DEF = 2
+    INDEX_NODE = 3
+    INDEX_LEAF = 4
+    USAGE_MAP = 5
+
+
+class BaseFormat:
+    """
+    Base class declaring every Jet-format constant as a class attribute.
+    Subclasses simply override the ones that change.
+    """
+
+    # — Static JetFormat constants —
+    MAX_RECORD_SIZE: ClassVar[int] = 1900
+    TEXT_FIELD_UNIT_SIZE: ClassVar[int] = 2
+    TEXT_FIELD_MAX_LENGTH: ClassVar[int] = 255 * TEXT_FIELD_UNIT_SIZE
+
+    PROPERTY_MAP_TYPES: ClassVar[List[bytes]] = [
+        b"MR2\x00",  # access 2000+
+        b"KKD\x00"   # access 97
+    ]
+
+    # the raw version byte in the header
+    VERSION_CODE: ClassVar[Optional[int]] = None
+    # numeric mapping to enable compatibility
+    VERSION_NUMBER: ClassVar[Optional[int]] = None
+
+    # — Identity & capabilities —
+    name: ClassVar[str] = "UNKNOWN"
+    read_only: ClassVar[bool] = False
+    indexes_supported: ClassVar[bool] = False
+    codec_type: ClassVar[CodecType] = CodecType.NONE
+    page_size: ClassVar[int] = 0
+    max_database_size: ClassVar[int] = 0
+
+    unsupported_data_types: ClassVar[Set[DataType]] = set()
+    unsupported_calc_types: ClassVar[Set[DataType]] = set()
+
+    # — Header parsing —
+    OFFSET_VERSION: ClassVar[int] = 20
+    HEADER_LENGTH: ClassVar[int] = 21
+    OFFSET_ENGINE_NAME: ClassVar[int] = 0x04
+    LENGTH_ENGINE_NAME: ClassVar[int] = 0x0F
+    MSISAM_ENGINE: ClassVar[bytes] = b"MSISAM Database"
+
+    BASE_HEADER_MASK: ClassVar[bytes] = bytes([
+        0xB5,0x6F,0x03,0x62,0x61,0x08,0xC2,0x55, 0xEB,0xA9,0x67,0x72,0x43,0x3F,0x00,0x9C,
+        0x7A,0x9F,0x90,0xFF,0x80,0x9A,0x31,0xC5, 0x79,0xBA,0xED,0x30,0xBC,0xDF,0xCC,0x9D,
+        0x63,0xD9,0xE4,0xC3,0x7B,0x42,0xFB,0x8A, 0xBC,0x4E,0x86,0xFB,0xEC,0x37,0x5D,0x44,
+        0x9C,0xFA,0xC6,0x5E,0x28,0xE6,0x13,0xB6, 0x8A,0x60,0x54,0x94,0x7B,0x36,0xF5,0x72,
+        0xDF,0xB1,0x77,0xF4,0x13,0x43,0xCF,0xAF, 0xB1,0x33,0x34,0x61,0x79,0x5B,0x92,0xB5,
+        0x7C,0x2A,0x05,0xF1,0x7C,0x99,0x01,0x1B, 0x98,0xFD,0x12,0x4F,0x4A,0x94,0x6C,0x3E,
+        0x60,0x26,0x5F,0x95,0xF8,0xD0,0x89,0x24, 0x85,0x67,0xC6,0x1F,0x27,0x44,0xD2,0xEE,
+        0xCF,0x65,0xED,0xFF,0x07,0xC7,0x46,0xA1, 0x78,0x16,0x0C,0xED,0xE9,0x2D,0x62,0xD4
+    ])
+
+    # — All possible header-level offsets & sizes (defaults) —
+    OFFSET_MASKED_HEADER: ClassVar[Optional[int]] = None 
+ HEADER_MASK: ClassVar[Optional[bytes]] = None + OFFSET_HEADER_DATE: ClassVar[Optional[int]] = None + OFFSET_PASSWORD: ClassVar[Optional[int]] = None + SIZE_PASSWORD: ClassVar[Optional[int]] = None + OFFSET_SORT_ORDER: ClassVar[Optional[int]] = None + SIZE_SORT_ORDER: ClassVar[Optional[int]] = None + OFFSET_CODE_PAGE: ClassVar[Optional[int]] = None + OFFSET_ENCODING_KEY: ClassVar[Optional[int]] = None + + # — All possible data‐page / table / index constants (defaults) — + MAX_ROW_SIZE: ClassVar[Optional[int]] = None + DATA_PAGE_INITIAL_FREE_SPACE: ClassVar[Optional[int]] = None + + OFFSET_NEXT_TABLE_DEF_PAGE: ClassVar[Optional[int]] = None + OFFSET_NUM_ROWS: ClassVar[Optional[int]] = None + OFFSET_NEXT_AUTO_NUMBER: ClassVar[Optional[int]] = None + OFFSET_NEXT_COMPLEX_AUTO_NUMBER: ClassVar[Optional[int]] = None + + OFFSET_TABLE_TYPE: ClassVar[Optional[int]] = None + OFFSET_MAX_COLS: ClassVar[Optional[int]] = None + OFFSET_NUM_VAR_COLS: ClassVar[Optional[int]] = None + OFFSET_NUM_COLS: ClassVar[Optional[int]] = None + + OFFSET_NUM_INDEX_SLOTS: ClassVar[Optional[int]] = None + OFFSET_NUM_INDEXES: ClassVar[Optional[int]] = None + OFFSET_OWNED_PAGES: ClassVar[Optional[int]] = None + OFFSET_FREE_SPACE_PAGES: ClassVar[Optional[int]] = None + OFFSET_INDEX_DEF_BLOCK: ClassVar[Optional[int]] = None + + SIZE_INDEX_COLUMN_BLOCK: ClassVar[Optional[int]] = None + SIZE_INDEX_INFO_BLOCK: ClassVar[Optional[int]] = None + + OFFSET_COLUMN_TYPE: ClassVar[Optional[int]] = None + OFFSET_COLUMN_NUMBER: ClassVar[Optional[int]] = None + OFFSET_COLUMN_PRECISION: ClassVar[Optional[int]] = None + OFFSET_COLUMN_SCALE: ClassVar[Optional[int]] = None + OFFSET_COLUMN_SORT_ORDER: ClassVar[Optional[int]] = None + OFFSET_COLUMN_CODE_PAGE: ClassVar[Optional[int]] = None + OFFSET_COLUMN_COMPLEX_ID: ClassVar[Optional[int]] = None + OFFSET_COLUMN_FLAGS: ClassVar[Optional[int]] = None + OFFSET_COLUMN_EXT_FLAGS: ClassVar[Optional[int]] = None + OFFSET_COLUMN_LENGTH: ClassVar[Optional[int]] = None + OFFSET_COLUMN_VARIABLE_TABLE_INDEX: ClassVar[Optional[int]] = None + OFFSET_COLUMN_FIXED_DATA_OFFSET: ClassVar[Optional[int]] = None + OFFSET_COLUMN_FIXED_DATA_ROW_OFFSET: ClassVar[Optional[int]] = None + + OFFSET_TABLE_DEF_LOCATION: ClassVar[Optional[int]] = None + OFFSET_ROW_START: ClassVar[Optional[int]] = None + OFFSET_USAGE_MAP_START: ClassVar[Optional[int]] = None + OFFSET_USAGE_MAP_PAGE_DATA: ClassVar[Optional[int]] = None + OFFSET_REFERENCE_MAP_PAGE_NUMBERS: ClassVar[Optional[int]] = None + + OFFSET_FREE_SPACE: ClassVar[Optional[int]] = None + OFFSET_NUM_ROWS_ON_DATA_PAGE: ClassVar[Optional[int]] = None + MAX_NUM_ROWS_ON_DATA_PAGE: ClassVar[Optional[int]] = None + + OFFSET_INDEX_COMPRESSED_BYTE_COUNT: ClassVar[Optional[int]] = None + OFFSET_INDEX_ENTRY_MASK: ClassVar[Optional[int]] = None + OFFSET_PREV_INDEX_PAGE: ClassVar[Optional[int]] = None + OFFSET_NEXT_INDEX_PAGE: ClassVar[Optional[int]] = None + OFFSET_CHILD_TAIL_INDEX_PAGE: ClassVar[Optional[int]] = None + + SIZE_INDEX_DEFINITION: ClassVar[Optional[int]] = None + SIZE_COLUMN_HEADER: ClassVar[Optional[int]] = None + SIZE_ROW_LOCATION: ClassVar[Optional[int]] = None + SIZE_LONG_VALUE_DEF: ClassVar[Optional[int]] = None + + MAX_INLINE_LONG_VALUE_SIZE: ClassVar[Optional[int]] = None + MAX_LONG_VALUE_ROW_SIZE: ClassVar[Optional[int]] = None + MAX_COMPRESSED_UNICODE_SIZE: ClassVar[Optional[int]] = None + + SIZE_TDEF_HEADER: ClassVar[Optional[int]] = None + SIZE_TDEF_TRAILER: ClassVar[Optional[int]] = None + SIZE_COLUMN_DEF_BLOCK: ClassVar[Optional[int]] = None + 
SIZE_INDEX_ENTRY_MASK: ClassVar[Optional[int]] = None + + SKIP_BEFORE_INDEX_FLAGS: ClassVar[Optional[int]] = None + SKIP_AFTER_INDEX_FLAGS: ClassVar[Optional[int]] = None + SKIP_BEFORE_INDEX_SLOT: ClassVar[Optional[int]] = None + SKIP_AFTER_INDEX_SLOT: ClassVar[Optional[int]] = None + SKIP_BEFORE_INDEX: ClassVar[Optional[int]] = None + + SIZE_NAME_LENGTH: ClassVar[Optional[int]] = None + SIZE_ROW_COLUMN_COUNT: ClassVar[Optional[int]] = None + SIZE_ROW_VAR_COL_OFFSET: ClassVar[Optional[int]] = None + + USAGE_MAP_TABLE_BYTE_LENGTH: ClassVar[Optional[int]] = None + + MAX_COLUMNS_PER_TABLE: ClassVar[Optional[int]] = None + MAX_INDEXES_PER_TABLE: ClassVar[Optional[int]] = None + MAX_TABLE_NAME_LENGTH: ClassVar[Optional[int]] = None + MAX_COLUMN_NAME_LENGTH: ClassVar[Optional[int]] = None + MAX_INDEX_NAME_LENGTH: ClassVar[Optional[int]] = None + + LEGACY_NUMERIC_INDEXES: ClassVar[Optional[bool]] = None + CHARSET: ClassVar[Optional[str]] = None + DEFAULT_SORT_ORDER: ClassVar[Optional[str]] = None + PROPERTY_MAP_TYPE: ClassVar[Optional[bytes]] = None + SIZE_TEXT_FIELD_UNIT: ClassVar[Optional[int]] = None + + ## compatibility functions to enable "version" to continue to be used as it was before: + def __str__(self): + """Return the VERSION_NUMBER when the object is converted to a string.""" + return str(self.VERSION_NUMBER) + + def __repr__(self): + """Return a more detailed representation for debugging.""" + return f"{self.__class__.__name__}(version={self.VERSION_NUMBER})" + + def __eq__(self, other): + """Enable direct comparison with numbers and other BaseFormat objects.""" + if isinstance(other, (int, float)): + return self.VERSION_NUMBER == other + elif isinstance(other, BaseFormat): + return self.VERSION_NUMBER == other.VERSION_NUMBER + return NotImplemented + + def __int__(self): + """Allow conversion to integer.""" + return self.VERSION_NUMBER + + # Additional comparison methods for completeness + def __lt__(self, other): + if isinstance(other, (int, float)): + return self.VERSION_NUMBER < other + elif isinstance(other, BaseFormat): + return self.VERSION_NUMBER < other.VERSION_NUMBER + return NotImplemented + + def __gt__(self, other): + if isinstance(other, (int, float)): + return self.VERSION_NUMBER > other + elif isinstance(other, BaseFormat): + return self.VERSION_NUMBER > other.VERSION_NUMBER + return NotImplemented + + def __le__(self, other): + if isinstance(other, (int, float)): + return self.VERSION_NUMBER <= other + elif isinstance(other, BaseFormat): + return self.VERSION_NUMBER <= other.VERSION_NUMBER + return NotImplemented + + def __ge__(self, other): + if isinstance(other, (int, float)): + return self.VERSION_NUMBER >= other + elif isinstance(other, BaseFormat): + return self.VERSION_NUMBER >= other.VERSION_NUMBER + return NotImplemented + + @classmethod + def is_supported_data_type(cls, dt: DataType) -> bool: + return dt not in cls.unsupported_data_types + + @classmethod + def is_supported_calc_type(cls, dt: DataType) -> bool: + return dt not in cls.unsupported_calc_types + + @classmethod + def _all_subclasses(cls): + """ + Recursively yield all subclasses of this class. 
+ """ + for sub in cls.__subclasses__(): + yield sub + yield from sub._all_subclasses() + + @classmethod + def get_format(cls, path: str) -> "BaseFormat": + hdr = open(path, "rb").read(cls.HEADER_LENGTH) + if len(hdr) < cls.HEADER_LENGTH: + raise IOError(f"Not a Jet database: {path!r}") + + # 1) try the raw version byte first (Jet4+) + raw_ver = hdr[cls.OFFSET_VERSION] + for sub in cls._all_subclasses(): + if getattr(sub, "VERSION_CODE", None) == raw_ver: + return sub() + + # 2) attempt unmask for Jet3 only + masked_ver = raw_ver ^ cls.BASE_HEADER_MASK[cls.OFFSET_VERSION] + from .jetformat import Jet3Format + if masked_ver == Jet3Format.VERSION_CODE: + return Jet3Format() + + # 3) fallback MSISAM by engine‐name + eng = hdr[cls.OFFSET_ENGINE_NAME: + cls.OFFSET_ENGINE_NAME + cls.LENGTH_ENGINE_NAME] + for sub in cls._all_subclasses(): + prefix = getattr(sub, "ENGINE_NAME_PREFIX", None) + if prefix and eng.startswith(prefix): + return sub() + + raise IOError(f"Unknown Jet version byte: raw=0x{raw_ver:02X}, masked=0x{masked_ver:02X}") + + + @classmethod + def get_format_from_header(cls, buf: bytes) -> "BaseFormat": + if len(buf) < cls.HEADER_LENGTH: + raise ValueError(f"Header buffer too small ({len(buf)} < {cls.HEADER_LENGTH})") + + raw_ver = buf[cls.OFFSET_VERSION] + + # 1) check raw byte → Jet4, Jet12, Jet14, Jet16, Jet17 + for sub in cls._all_subclasses(): + if getattr(sub, "VERSION_CODE", None) == raw_ver: + return sub() + + # 2) if raw byte wasn’t a match, unmask *just* for Jet3 detection + # (only Jet3 files use this mask) + masked_ver = raw_ver ^ cls.BASE_HEADER_MASK[cls.OFFSET_VERSION] + from .jetformat import Jet3Format + if masked_ver == Jet3Format.VERSION_CODE: + return Jet3Format() + + # 3) fallback: MSISAM + eng = buf[cls.OFFSET_ENGINE_NAME:cls.OFFSET_ENGINE_NAME + cls.LENGTH_ENGINE_NAME] + for sub in cls._all_subclasses(): + prefix = getattr(sub, "ENGINE_NAME_PREFIX", None) + if prefix and eng.startswith(prefix): + return sub() + + raise ValueError(f"Unknown Jet version byte: raw=0x{raw_ver:02X}, masked=0x{masked_ver:02X}") + + +# ---------------------------------------------------------------------- +# Jet3Format subclass: overrides *only* those attrs that Jet 3 needs +# ---------------------------------------------------------------------- +class Jet3Format(BaseFormat): + VERSION_CODE = 0x00 + VERSION_NUMBER = 3 + ENGINE_NAME_PREFIX = None + + # identity & capabilities + name = "3" + read_only = True + indexes_supported = True + codec_type = CodecType.JET + page_size = 2048 + max_database_size = 1 * 1024**3 + + unsupported_data_types = {DataType.COMPLEX_TYPE} + unsupported_calc_types = set() + + # header‐level + OFFSET_MASKED_HEADER = 24 + HEADER_MASK = BaseFormat.BASE_HEADER_MASK[:-2] + OFFSET_HEADER_DATE = -1 + OFFSET_PASSWORD = 66 + SIZE_PASSWORD = 20 + OFFSET_SORT_ORDER = 58 + SIZE_SORT_ORDER = 2 + OFFSET_CODE_PAGE = 60 + OFFSET_ENCODING_KEY = 62 + + # page/table/index + MAX_ROW_SIZE = 2012 + DATA_PAGE_INITIAL_FREE_SPACE = page_size - 14 + + OFFSET_NEXT_TABLE_DEF_PAGE = 4 + OFFSET_NUM_ROWS = 12 + OFFSET_NEXT_AUTO_NUMBER = 20 + OFFSET_NEXT_COMPLEX_AUTO_NUMBER = -1 + + OFFSET_TABLE_TYPE = 20 + OFFSET_MAX_COLS = 21 + OFFSET_NUM_VAR_COLS = 23 + OFFSET_NUM_COLS = 25 + + OFFSET_NUM_INDEX_SLOTS = 27 + OFFSET_NUM_INDEXES = 31 + OFFSET_OWNED_PAGES = 35 + OFFSET_FREE_SPACE_PAGES = 39 + OFFSET_INDEX_DEF_BLOCK = 43 + + SIZE_INDEX_COLUMN_BLOCK = 39 + SIZE_INDEX_INFO_BLOCK = 20 + + OFFSET_COLUMN_TYPE = 0 + OFFSET_COLUMN_NUMBER = 1 + OFFSET_COLUMN_PRECISION = 11 + OFFSET_COLUMN_SCALE = 12 
+ OFFSET_COLUMN_SORT_ORDER = 9 + OFFSET_COLUMN_CODE_PAGE = 11 + OFFSET_COLUMN_COMPLEX_ID = -1 + OFFSET_COLUMN_FLAGS = 13 + OFFSET_COLUMN_EXT_FLAGS = -1 + OFFSET_COLUMN_LENGTH = 16 + OFFSET_COLUMN_VARIABLE_TABLE_INDEX = 3 + OFFSET_COLUMN_FIXED_DATA_OFFSET = 14 + OFFSET_COLUMN_FIXED_DATA_ROW_OFFSET= 1 + + OFFSET_TABLE_DEF_LOCATION = 4 + OFFSET_ROW_START = 10 + OFFSET_USAGE_MAP_START = 5 + OFFSET_USAGE_MAP_PAGE_DATA = 4 + OFFSET_REFERENCE_MAP_PAGE_NUMBERS = 1 + + OFFSET_FREE_SPACE = 2 + OFFSET_NUM_ROWS_ON_DATA_PAGE = 8 + MAX_NUM_ROWS_ON_DATA_PAGE = 255 + + OFFSET_INDEX_COMPRESSED_BYTE_COUNT = 20 + OFFSET_INDEX_ENTRY_MASK = 22 + OFFSET_PREV_INDEX_PAGE = 8 + OFFSET_NEXT_INDEX_PAGE = 12 + OFFSET_CHILD_TAIL_INDEX_PAGE = 16 + + SIZE_INDEX_DEFINITION = 8 + SIZE_COLUMN_HEADER = 18 + SIZE_ROW_LOCATION = 2 + SIZE_LONG_VALUE_DEF = 12 + MAX_INLINE_LONG_VALUE_SIZE = 64 + MAX_LONG_VALUE_ROW_SIZE = 2032 + MAX_COMPRESSED_UNICODE_SIZE = 1024 + + SIZE_TDEF_HEADER = 43 + SIZE_TDEF_TRAILER = 2 + SIZE_COLUMN_DEF_BLOCK = 25 + SIZE_INDEX_ENTRY_MASK = 226 + + SKIP_BEFORE_INDEX_FLAGS = 0 + SKIP_AFTER_INDEX_FLAGS = 0 + SKIP_BEFORE_INDEX_SLOT = 0 + SKIP_AFTER_INDEX_SLOT = 0 + SKIP_BEFORE_INDEX = 0 + + SIZE_NAME_LENGTH = 1 + SIZE_ROW_COLUMN_COUNT = 1 + SIZE_ROW_VAR_COL_OFFSET = 1 + + USAGE_MAP_TABLE_BYTE_LENGTH = 128 + + MAX_COLUMNS_PER_TABLE = 255 + MAX_INDEXES_PER_TABLE = 32 + MAX_TABLE_NAME_LENGTH = 64 + MAX_COLUMN_NAME_LENGTH = 64 + MAX_INDEX_NAME_LENGTH = 64 + + LEGACY_NUMERIC_INDEXES = True + CHARSET = 'cp1252' + DEFAULT_SORT_ORDER = None + PROPERTY_MAP_TYPE = BaseFormat.PROPERTY_MAP_TYPES[1] + SIZE_TEXT_FIELD_UNIT = 1 + + + + +class SortOrder(enum.Enum): + """Placeholder for ColumnImpl.SortOrder""" + GENERAL_SORT_ORDER = enum.auto() + GENERAL_97_SORT_ORDER = enum.auto() + GENERAL_LEGACY_SORT_ORDER = enum.auto() + + +# ---------------------------------------------------------------------- +# Jet 4 (Access 2000/02/03 – Jet 4) +# ---------------------------------------------------------------------- +class Jet4Format(BaseFormat): + VERSION_CODE = 0x01 + VERSION_NUMBER = 4 + + name = "4" + read_only = False + indexes_supported = True + codec_type = CodecType.JET + page_size = 4096 + max_database_size = 2 * 1024**3 # 2 GB + + MAX_ROW_SIZE = 4060 + DATA_PAGE_INITIAL_FREE_SPACE = page_size - 14 + + OFFSET_MASKED_HEADER = 24 + HEADER_MASK = BaseFormat.BASE_HEADER_MASK + OFFSET_HEADER_DATE = 114 + OFFSET_PASSWORD = 66 + SIZE_PASSWORD = 40 + OFFSET_SORT_ORDER = 110 + SIZE_SORT_ORDER = 4 + OFFSET_CODE_PAGE = 60 + OFFSET_ENCODING_KEY = 62 + + OFFSET_NEXT_TABLE_DEF_PAGE = 4 + OFFSET_NUM_ROWS = 16 + OFFSET_NEXT_AUTO_NUMBER = 20 + OFFSET_NEXT_COMPLEX_AUTO_NUMBER = -1 + + OFFSET_TABLE_TYPE = 40 + OFFSET_MAX_COLS = 41 + OFFSET_NUM_VAR_COLS = 43 + OFFSET_NUM_COLS = 45 + + OFFSET_NUM_INDEX_SLOTS = 47 + OFFSET_NUM_INDEXES = 51 + OFFSET_OWNED_PAGES = 55 + OFFSET_FREE_SPACE_PAGES = 59 + OFFSET_INDEX_DEF_BLOCK = 63 + + SIZE_INDEX_COLUMN_BLOCK = 52 + SIZE_INDEX_INFO_BLOCK = 28 + + OFFSET_COLUMN_TYPE = 0 + OFFSET_COLUMN_NUMBER = 5 + OFFSET_COLUMN_PRECISION = 11 + OFFSET_COLUMN_SCALE = 12 + OFFSET_COLUMN_SORT_ORDER = 11 + OFFSET_COLUMN_CODE_PAGE = -1 + OFFSET_COLUMN_COMPLEX_ID = -1 + OFFSET_COLUMN_FLAGS = 15 + OFFSET_COLUMN_EXT_FLAGS = 16 + OFFSET_COLUMN_LENGTH = 23 + OFFSET_COLUMN_VARIABLE_TABLE_INDEX = 7 + OFFSET_COLUMN_FIXED_DATA_OFFSET = 21 + OFFSET_COLUMN_FIXED_DATA_ROW_OFFSET = 2 + + OFFSET_TABLE_DEF_LOCATION = 4 + OFFSET_ROW_START = 14 + OFFSET_USAGE_MAP_START = 5 + OFFSET_USAGE_MAP_PAGE_DATA = 4 + 
OFFSET_REFERENCE_MAP_PAGE_NUMBERS = 1
+
+    OFFSET_FREE_SPACE = 2
+    OFFSET_NUM_ROWS_ON_DATA_PAGE = 12
+    MAX_NUM_ROWS_ON_DATA_PAGE = 255
+
+    OFFSET_INDEX_COMPRESSED_BYTE_COUNT = 24
+    OFFSET_INDEX_ENTRY_MASK = 27
+    OFFSET_PREV_INDEX_PAGE = 12
+    OFFSET_NEXT_INDEX_PAGE = 16
+    OFFSET_CHILD_TAIL_INDEX_PAGE = 20
+
+    SIZE_INDEX_DEFINITION = 12
+    SIZE_COLUMN_HEADER = 25
+    SIZE_ROW_LOCATION = 2
+    SIZE_LONG_VALUE_DEF = 12
+    MAX_INLINE_LONG_VALUE_SIZE = 64
+    MAX_LONG_VALUE_ROW_SIZE = 4076
+    MAX_COMPRESSED_UNICODE_SIZE = 1024
+
+    SIZE_TDEF_HEADER = 63
+    SIZE_TDEF_TRAILER = 2
+    SIZE_COLUMN_DEF_BLOCK = 25
+    SIZE_INDEX_ENTRY_MASK = 453
+
+    SKIP_BEFORE_INDEX_FLAGS = 4
+    SKIP_AFTER_INDEX_FLAGS = 5
+    SKIP_BEFORE_INDEX_SLOT = 4
+    SKIP_AFTER_INDEX_SLOT = 4
+    SKIP_BEFORE_INDEX = 4
+
+    SIZE_NAME_LENGTH = 2
+    SIZE_ROW_COLUMN_COUNT = 2
+    SIZE_ROW_VAR_COL_OFFSET = 2
+
+    USAGE_MAP_TABLE_BYTE_LENGTH = 64
+
+    MAX_COLUMNS_PER_TABLE = 255
+    MAX_INDEXES_PER_TABLE = 32
+    MAX_TABLE_NAME_LENGTH = 64
+    MAX_COLUMN_NAME_LENGTH = 64
+    MAX_INDEX_NAME_LENGTH = 64
+
+    LEGACY_NUMERIC_INDEXES = True
+    CHARSET = "utf-16le"
+    DEFAULT_SORT_ORDER = SortOrder.GENERAL_97_SORT_ORDER
+    PROPERTY_MAP_TYPE = BaseFormat.PROPERTY_MAP_TYPES[1]
+    SIZE_TEXT_FIELD_UNIT = 1
+
+    # from V3_UNSUPP_TYPES: {COMPLEX_TYPE, BIG_INT, EXT_DATE_TIME}
+    unsupported_data_types = {
+        DataType.COMPLEX_TYPE,
+        DataType.BIG_INT,
+        DataType.EXT_DATE_TIME
+    }
+    unsupported_calc_types = set()  # no calculated types
+
+
+# ----------------------------------------------------------------------
+# Jet 12 (Access 2007 – ACE12) builds on Jet4
+# ----------------------------------------------------------------------
+class Jet12Format(Jet4Format):
+    VERSION_CODE = 0x02
+    VERSION_NUMBER = 5
+    name = "12"
+
+    codec_type = CodecType.OFFICE
+    LEGACY_NUMERIC_INDEXES = False
+
+    # from V12_UNSUPP_TYPES = {BIG_INT, EXT_DATE_TIME}
+    unsupported_data_types = {
+        DataType.BIG_INT,
+        DataType.EXT_DATE_TIME
+    }
+
+    # only these two offsets changed (uppercase so they actually override the Jet4 constants):
+    OFFSET_NEXT_COMPLEX_AUTO_NUMBER = 28
+    OFFSET_COLUMN_COMPLEX_ID = 11
+
+    # ACE 12 still doesn't support complex type or calc types
+    unsupported_calc_types = {
+        dt for dt in DataType
+    }  # no calculated types
+
+
+# ----------------------------------------------------------------------
+# Jet 14 (Access 2010 – ACE14) inherits from Jet12
+# ----------------------------------------------------------------------
+class Jet14Format(Jet12Format):
+    VERSION_CODE = 0x03
+    VERSION_NUMBER = 2010
+    name = "14"
+
+    DEFAULT_SORT_ORDER = SortOrder.GENERAL_SORT_ORDER
+    PROPERTY_MAP_TYPE = BaseFormat.PROPERTY_MAP_TYPES[0]
+
+    # ACE 14 supports the V14_CALC_TYPES:
+    _V14_CALC = {
+        DataType.BOOLEAN, DataType.BYTE, DataType.INT, DataType.LONG,
+        DataType.FLOAT, DataType.DOUBLE, DataType.GUID,
+        DataType.SHORT_DATE_TIME, DataType.MONEY, DataType.NUMERIC,
+        DataType.TEXT, DataType.MEMO
+    }
+    unsupported_calc_types = set(DataType) - _V14_CALC
+
+
+# ----------------------------------------------------------------------
+# Jet 16 (Access 2013 – ACE16) inherits from Jet14
+# ----------------------------------------------------------------------
+class Jet16Format(Jet14Format):
+    VERSION_CODE = 0x05
+    VERSION_NUMBER = 2013
+    name = "16"
+
+    # from V16_UNSUPP_TYPES = {EXT_DATE_TIME}
+    unsupported_data_types = {
+        DataType.EXT_DATE_TIME
+    }
+
+    # ACE 16 adds BIG_INT calc support:
+    _V16_CALC = Jet14Format._V14_CALC.union({DataType.BIG_INT})
+    unsupported_calc_types = set(DataType) - 
_V16_CALC + + +# ---------------------------------------------------------------------- +# Jet 17 (Access 2016 – ACE17) inherits from Jet16 +# ---------------------------------------------------------------------- +class Jet17Format(Jet16Format): + VERSION_CODE = 0x06 + VERSION_NUMBER = 2016 + name = "17" + + # now supports everything + unsupported_data_types = set() + unsupported_calc_types = set() + + CHARSET = "utf-16le" # StandardCharsets.UTF_16LE + DEFAULT_SORT_ORDER = SortOrder.GENERAL_LEGACY_SORT_ORDER + PROPERTY_MAP_TYPE = BaseFormat.PROPERTY_MAP_TYPES[0] + SIZE_TEXT_FIELD_UNIT = BaseFormat.TEXT_FIELD_UNIT_SIZE + + +# ---------------------------------------------------------------------- +# MSISAM (Access 95) reuses Jet4 layout but overrides codec & engine‐name +# ---------------------------------------------------------------------- +class MsisamFormat(Jet4Format): + VERSION_CODE = None + ENGINE_NAME_PREFIX = BaseFormat.MSISAM_ENGINE + + name = "MSISAM" + read_only = True + indexes_supported = False + codec_type = CodecType.MSISAM + page_size = 512 + max_database_size = 1 * 1024**2 # 1 MB + + # nothing at all is supported + unsupported_data_types = set(DataType) + unsupported_calc_types = set(DataType) \ No newline at end of file diff --git a/access_parser/parsing_primitives.py b/access_parser/parsing_primitives.py index 9feb9ac..1c31a9b 100644 --- a/access_parser/parsing_primitives.py +++ b/access_parser/parsing_primitives.py @@ -1,5 +1,5 @@ from construct import * - +from construct import Int32ul,Int16ul,Int8ul,Int16ub,Int8ub,Int24ul,Int32sl #explicit imports to help intellisense def version_specific(version, v3_subcon, v4_subcon): """ @@ -125,8 +125,12 @@ def parse_table_head(buffer, version=3): "column_count" / Int16ul, "index_count" / Int32ul, "real_index_count" / Int32ul, - "row_page_map" / Int32ul, - "free_space_page_map" / Int32ul, + "row_page_map_row_number" / Int8ul, + "row_page_map_page_number" / Int24ul, + #"row_page_map" / Int32ul, + "free_space_page_map_row_number" / Int8ul, + "free_space_page_map_page_number" / Int24ul, + #"free_space_page_map" / Int32ul, "tdef_header_end" / Tell).parse(buffer) @@ -259,3 +263,21 @@ def parse_relative_object_metadata_struct(buffer, variable_jump_tables_cnt=0, ve Int16ub)), "var_len_count" / version_specific(version, Int8ub, Int16ub), "relative_metadata_end" / Tell).parse(buffer) + +#helper unpacking function +def parse_buffer_custom(buffer,position,type): + '''Custom function to parse buffers using differet construct types + ------ + Jackcess Mapping for type variable + - get = 'Int8ul' + - getShort = 'Int16ul' + - getInt = 'Int32ul' + - get3ByteInt = 'Int24ul' + #Will add to this list as they come up + ''' + type = globals()[type] + + parser = Struct("value" / type) + buffer = buffer[position:] + result = parser.parse(buffer) + return result.value \ No newline at end of file diff --git a/access_parser/utils.py b/access_parser/utils.py index 0153be2..55c9a10 100644 --- a/access_parser/utils.py +++ b/access_parser/utils.py @@ -21,7 +21,7 @@ TYPE_OLE = 11 TYPE_MEMO = 12 TYPE_GUID = 15 -TYPE_96_bit_17_BYTES = 16 +TYPE_96_BIT_17_BYTES = 16 TYPE_COMPLEX = 18 TABLE_PAGE_MAGIC = b"\x02\x01" @@ -55,11 +55,14 @@ FORMAT_SCIENTIFIC: SCIENTIFIC_DEFAULT } +# Character Encodings for Different Jet Versions + +TEXT_COMPRESSION_HEADER = b'\xff\xfe' # https://stackoverflow.com/questions/45560782 def mdb_date_to_readable(double_time): try: - dtime_bytes = struct.pack("Q", double_time) + dtime_bytes = struct.pack("q", double_time) dtime_double = 
struct.unpack(' scale: dot_len = len(full_number) - scale full_number = full_number[:dot_len] + "." + full_number[dot_len:] + # if number is smaller than scale then pad the number with relevant leading zeros. + if len(full_number) <= scale: + full_number = '0.' + ('0'*scale + full_number)[-scale:] numeric_string = "-" if neg else "" numeric_string += full_number return numeric_string +###Text type decoding functions +def decodeTextValue(data: bytes, version: int): + """Decodes a compressed or uncompressed text value.""" + + # Jet 3 does not support Unicode compression; decode directly + if version == 3: + return decodeUncompressedText(data, 0, len(data), version) + + # Check for Unicode compression header (Jet 4+ only) + isCompressed = len(data) > 1 and data.startswith(TEXT_COMPRESSION_HEADER) + + if isCompressed: + textBuf = '' + dataStart = len(TEXT_COMPRESSION_HEADER) + dataEnd = dataStart + inCompressedMode = True + + # Process each segment in the compressed data + while dataEnd < len(data): + if data[dataEnd:dataEnd+1] == b'\x00': # End of segment + # Decode the current segment and toggle compression mode + textBuf += decodeTextSegment(data, dataStart, dataEnd, inCompressedMode, version) + inCompressedMode = not inCompressedMode + dataStart = dataEnd + 1 + dataEnd += 1 + + # Handle the last segment + textBuf += decodeTextSegment(data, dataStart, dataEnd, inCompressedMode, version) + return textBuf + + return decodeUncompressedText(data, 0, len(data), version) + + +def decodeTextSegment(data: bytes, dataStart: int, dataEnd: int, inCompressedMode: bool,version: int): + """ + Decodes a segment of a text value into the given buffer according to the + given status of the segment (compressed/uncompressed). + """ + if dataEnd <= dataStart: + return '' # No data in the segment + + if inCompressedMode: + # Extract the relevant segment. + segment = data[dataStart:dataEnd] + # Create a new bytearray twice as long as the segment. + expanded = bytearray(len(segment) * 2) + # Using slice assignment: assign the original bytes to every even index. + # The odd indices will remain 0, which is exactly the padding needed. + expanded[::2] = segment + # Convert the bytearray back to an immutable bytes object. + data = bytes(expanded) + dataStart = 0 + dataEnd = len(data) -def get_decoded_text(bytes_data): + return decodeUncompressedText(data, dataStart, dataEnd, version) + + +def decodeUncompressedText(textBytes: bytes, dataStart: int, dataEnd: int, version: int, strict: bool = False) -> str: + """ + Decodes uncompressed text based on database version. + + :param textBytes: The raw bytes of text. + :param dataStart: Start index of the text segment. + :param dataEnd: End index of the text segment. + :param version: The database version to determine encoding. + :param strict: Whether to raise an error on decoding failure. If False, + decoding errors are logged and replacement characters are used. + :return: Decoded text string. + """ + encoding = version.CHARSET + bytesToDecode = textBytes[dataStart:dataEnd] + try: - decoded = bytes_data.decode('utf-8') - except UnicodeDecodeError: - try: - decoded = bytes_data.decode('latin1') - except UnicodeDecodeError: - decoded = bytes_data.decode('utf-8', errors='ignore') - return decoded + return bytesToDecode.decode(encoding) + except UnicodeDecodeError as e: + message = (f"Decoding error: Data could not be decoded using {encoding}. 
" + f"Possible corruption or unexpected encoding in the data segment " + f"from {dataStart} to {dataEnd}.") + if strict: + raise ValueError(message) from e + else: + LOGGER.warning(message) + # Return a best-effort result using replacement characters for undecodable bytes + return bytesToDecode.decode(encoding, errors="replace") def parse_money_type(parsed, prop_format): @@ -175,22 +255,10 @@ def parse_type(data_type, buffer, length=None, version=3, props=None): parsed = buffer[:16] guid = uuid.UUID(parsed.hex()) parsed = str(guid) - elif data_type == TYPE_96_bit_17_BYTES: + elif data_type == TYPE_96_BIT_17_BYTES: parsed = buffer[:17] elif data_type == TYPE_TEXT: - if version > 3: - # Looks like if BOM is present text is already decoded - if buffer.startswith(b"\xfe\xff") or buffer.startswith(b"\xff\xfe"): - buff = buffer[2:] - parsed = get_decoded_text(buff) - else: - parsed = buffer.decode("utf-16", errors='ignore') - else: - parsed = get_decoded_text(buffer) - - if "\x00" in parsed: - LOGGER.debug(f"Parsed string contains NUL (0x00) characters: {parsed}") - parsed = parsed.replace("\x00", "") + parsed = decodeTextValue(buffer,version) else: LOGGER.debug(f"parse_type - unsupported data type: {data_type}") return parsed diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..c2f43ff --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,24 @@ +[project] +name = "access_parser" +version = "0.0.6" +description = "Access database (*.mdb, *.accdb) parser" +readme = "README.md" +authors = [{ name = "Uri Katz", email = "uri.k@claroty.com" }] +license = { text = "Apache Software License" } +requires-python = ">=3.6" +dependencies = [ + "construct", + "tabulate" +] +classifiers = [ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent" +] + +[project.urls] +Homepage = "https://github.com/ClarotyICS/access_parser" + +[build-system] +requires = ["setuptools", "wheel"] +build-backend = "setuptools.build_meta" diff --git a/setup.py b/setup.py deleted file mode 100644 index f5c0879..0000000 --- a/setup.py +++ /dev/null @@ -1,26 +0,0 @@ -import setuptools - -with open("README.md", "r") as f: - long_description = f.read() - -setuptools.setup( - name="access_parser", - version="0.0.6", - author="Uri Katz", - author_email="uri.k@claroty.com", - description="Access database (*.mdb, *.accdb) parser", - long_description=long_description, - long_description_content_type="text/markdown", - url="https://github.com/ClarotyICS/access_parser", - packages=setuptools.find_packages(), - classifiers=[ - "Programming Language :: Python :: 3", - "License :: OSI Approved :: Apache Software License", - "Operating System :: OS Independent", - ], - python_requires='>=3.6', - install_requires=[ - 'construct', - 'tabulate', - ], -)