From 37f6e58018850dfe325ed60e199cbf191316996b Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 24 Dec 2025 19:17:43 +0000 Subject: [PATCH 01/15] Add semantic matching for joins specification This spec defines the semantic matching behavior for DataJoint 2.0 joins: - Introduces lineage tracking for attributes (origin table/column) - Defines homologous vs non-homologous namesakes terminology - Specifies matching rules: only homologous namesakes used for joins - Documents ~lineage table architecture for lineage storage - Includes dependency graph fallback for non-DJ schemas - Covers all affected operations: *, &, -, aggr - Removes @ operator in favor of .join(semantic_check=False) - Updates dj.U behavior (& replaces * for table extension) - Includes migration guide and testing strategy --- docs/src/design/semantic-matching-spec.md | 613 ++++++++++++++++++++++ 1 file changed, 613 insertions(+) create mode 100644 docs/src/design/semantic-matching-spec.md diff --git a/docs/src/design/semantic-matching-spec.md b/docs/src/design/semantic-matching-spec.md new file mode 100644 index 000000000..ff5363cf1 --- /dev/null +++ b/docs/src/design/semantic-matching-spec.md @@ -0,0 +1,613 @@ +# Semantic Matching for Joins - Specification + +## Overview + +This document specifies **semantic matching** for joins in DataJoint 2.0, replacing the current name-based matching rules. Semantic matching ensures that attributes are only matched when they share both the same name and the same **lineage** (origin), preventing accidental joins on unrelated attributes that happen to share names. + +### Goals + +1. **Prevent incorrect joins** on attributes that share names but represent different entities +2. **Enable valid joins** that are currently blocked due to overly restrictive rules +3. **Maintain backward compatibility** for well-designed schemas +4. 
**Provide clear error messages** when semantic conflicts are detected + +## Problem Statement + +### Current Behavior + +The current join implementation matches attributes purely by name: + +```python +join_attributes = set(n for n in self.heading.names if n in other.heading.names) +``` + +This is essentially a SQL `NATURAL JOIN` - any attributes with the same name in both tables are used for matching. The only constraint is the "join compatibility" check which prevents joining on secondary attributes that appear in both tables. + +**Location**: `src/datajoint/expression.py:301` and `src/datajoint/condition.py:100-120` + +### Problems with Current Approach + +#### Problem 1: Primary Key Name Collision + +Consider two tables: +- `Student(id, name)` - where `id` is the student's primary key +- `Course(id, instructor)` - where `id` is the course's primary key + +**Current behavior**: `Student * Course` joins on `id`, producing meaningless results where student IDs are matched with course IDs. + +**Desired behavior**: An error should be raised because `Student.id` and `Course.id` have different origins (lineages). + +#### Problem 2: Valid Joins Currently Blocked + +Consider two tables: +- `FavoriteCourse(student_id*, course_id)` - student's favorite course (course_id is secondary, with FK to Course) +- `DependentCourse(dep_course_id*, course_id)` - course dependencies (course_id is secondary, with FK to Course) + +**Current behavior**: `FavoriteCourse * DependentCourse` is **rejected** because `course_id` is a secondary attribute in both tables. + +**Desired behavior**: The join should proceed because both `course_id` attributes share the same lineage (tracing to `Course.course_id`). 
+ +## Key Concepts + +### Terminology + +| Term | Definition | +|------|------------| +| **Homologous attributes** | Attributes with the same lineage (whether or not they have the same name) | +| **Namesake attributes** | Attributes with the same name (whether or not they have the same lineage) | +| **Homologous namesakes** | Attributes with the same name AND the same lineage — used for join matching | +| **Non-homologous namesakes** | Attributes with the same name BUT different lineage — cause join errors | + +### Attribute Lineage + +Lineage identifies the **origin** of an attribute - where it was first defined. It is represented as a string in the format: + +``` +`schema_name`.`table_name`.`attribute_name` +``` + +#### Lineage Assignment Rules + +1. **Native primary key attributes** have lineage: + ``` + lineage = "`this_schema`.`this_table`.`attr_name`" + ``` + The table where they are originally defined. + +2. **Attributes inherited via foreign key** retain their origin lineage: + ``` + lineage = "`parent_schema`.`parent_table`.`attr_name`" + ``` + Traced to the original definition through the FK chain. + +3. **Native secondary attributes** do NOT have lineage: + ``` + lineage = None + ``` + Secondary attributes defined directly (not via FK) cannot be used for join matching. + +#### Lineage Propagation + +Lineage propagates through: + +1. **Foreign key references**: Inherited attributes retain their origin lineage regardless of whether they end up as primary or secondary in the referencing table. + +2. 
**Query expressions**: + - Projections preserve lineage for included attributes + - Renamed attributes (`new_name='old_name'`) preserve lineage + - Computed attributes (`result='expr'`) have no lineage + +### Semantic Matching Rules + +#### Match Decision Matrix + +| Scenario | Same Name | Same Lineage | Action | +|----------|-----------|--------------|--------| +| Homologous namesakes | Yes | Yes | **Match** - use for join | +| Non-homologous namesakes (both have lineage) | Yes | No (different) | **Error** | +| Non-homologous namesakes (both null lineage) | Yes | No (both null) | **Error** | +| Non-homologous namesakes (one null) | Yes | No (one null) | **Error** | +| Different names | No | - | **No match** | + +#### Error Resolution + +When non-homologous namesakes are detected, users must resolve the conflict using `.proj()` to rename one of the colliding attributes: + +```python +# Error: Student.id and Course.id have different lineages +Student * Course # DataJointError! + +# Resolution: rename one attribute +Student * Course.proj(course_id='id') # OK +``` + +## Affected Operations + +Semantic matching applies to all binary operations that match attributes: + +| Operator | Operation | Semantic Matching | +|----------|-----------|-------------------| +| `A * B` | Join | Matches on homologous namesakes | +| `A & B` | Restriction | Matches on homologous namesakes | +| `A - B` | Anti-restriction | Matches on homologous namesakes | +| `A.aggr(B, ...)` | Aggregation | Matches on homologous namesakes | + +### The `.join()` Method + +The `.join()` method provides additional control: + +```python +# Default: semantic checking enabled (same as *) +result = A.join(B) + +# Bypass semantic check for legacy compatibility +result = A.join(B, semantic_check=False) +``` + +### Removal of `@` Operator + +The `@` operator (permissive join) is **removed** in DataJoint 2.0: + +```python +# Old (deprecated): +A @ B # Raises DataJointError with migration guidance + +# New: 
+A.join(B, semantic_check=False) # Explicit bypass +``` + +The error message directs users to the explicit `.join()` method. + +## Universal Set `dj.U` + +`dj.U(attr1, ..., attrn)` represents the universal set of all possible values and lineages. + +### Homology with `dj.U` + +Since `dj.U` conceptually contains all possible lineages, its attributes are **homologous to any namesake attribute** in other expressions. + +### Valid Operations + +```python +# Restriction: promotes a, b to PK; lineage transferred from A +dj.U('a', 'b') & A + +# Aggregation: groups by a, b +dj.U('a', 'b').aggr(A, count='count(*)') +``` + +### Invalid Operations + +```python +# Anti-restriction: produces infinite set +dj.U('a', 'b') - A # DataJointError + +# Join: deprecated, use & instead +dj.U('a', 'b') * A # DataJointError with migration guidance +``` + +## Implementation Architecture + +### Two Methods for Lineage Determination + +The implementation provides **two separate methods** for determining attribute lineage: + +#### Method 1: Lineage Tables (`~lineage`) + +For DataJoint-managed schemas: + +- Lineage is stored explicitly in a hidden table (`~lineage`) per schema +- Populated at table declaration time by copying from parent tables +- Fast O(1) lookup at query time +- Authoritative source when present + +**Schema**: +```sql +CREATE TABLE `schema_name`.`~lineage` ( + table_name VARCHAR(64) NOT NULL, + attribute_name VARCHAR(64) NOT NULL, + lineage VARCHAR(255), -- NULL for native secondary attrs + PRIMARY KEY (table_name, attribute_name) +); +``` + +#### Method 2: Dependency Graph Traversal + +Fallback for non-DataJoint schemas or when `~lineage` doesn't exist: + +- Lineage computed by traversing FK relationships +- Uses `connection.dependencies` which loads from `INFORMATION_SCHEMA` +- Works with any database schema +- May be incomplete if upstream schemas aren't loaded + +**Algorithm**: +```python +def compute_lineage(table, attribute): + """Compute lineage by FK traversal.""" + 
# Check if attribute is inherited via FK + for parent, props in dependencies.parents(table).items(): + attr_map = props['attr_map'] + if attribute in attr_map: + parent_attr = attr_map[attribute] + # Recursively trace to origin + return compute_lineage(parent, parent_attr) + + # Not inherited - check if primary key + if attribute in table.primary_key: + return f"`{schema}`.`{table}`.`{attribute}`" + + # Native secondary - no lineage + return None +``` + +### Selection Logic + +These methods are **mutually exclusive**: + +```python +def get_lineage(schema, table, attribute): + if lineage_table_exists(schema): + return query_lineage_table(schema, table, attribute) + else: + return compute_from_dependencies(schema, table, attribute) +``` + +## Changes to Existing Code + +### `Attribute` Class (`heading.py`) + +Add `lineage` field to the `Attribute` namedtuple: + +```python +default_attribute_properties = dict( + # ... existing fields ... + lineage=None, # NEW: Origin of attribute, e.g. "`schema`.`table`.`attr`" +) +``` + +### `Heading` Class (`heading.py`) + +1. **Load lineage when fetching heading from database**: + - Query `~lineage` table if it exists + - Fall back to dependency graph computation + +2. **Preserve lineage in `select()` method**: + - Included attributes keep their lineage + - Renamed attributes keep their lineage + - Computed attributes have `lineage=None` + +3. **Merge lineage in `join()` method**: + - Verify homologous namesakes have matching lineage + - Combined heading includes lineage from both sides + +### `assert_join_compatibility()` (`condition.py`) + +Replace current implementation with semantic matching: + +```python +def assert_join_compatibility(expr1, expr2): + """ + Check semantic compatibility of two expressions for joining. + + Raises DataJointError if non-homologous namesakes are detected. 
+ """ + if isinstance(expr1, U) or isinstance(expr2, U): + return # U is always compatible + + # Find namesake attributes (same name in both) + namesakes = set(expr1.heading.names) & set(expr2.heading.names) + + for name in namesakes: + lineage1 = expr1.heading[name].lineage + lineage2 = expr2.heading[name].lineage + + if lineage1 is None or lineage2 is None or lineage1 != lineage2: + raise DataJointError( + f"Cannot join on attribute `{name}`: " + f"different lineages ({lineage1} vs {lineage2}). " + f"Use .proj() to rename one of the attributes." + ) +``` + +### `join()` Method (`expression.py`) + +Update to use semantic matching: + +```python +def join(self, other, semantic_check=True, left=False): + # ... existing setup ... + + if semantic_check: + assert_join_compatibility(self, other) + + # Find homologous namesakes for join + join_attributes = set() + for name in self.heading.names: + if name in other.heading.names: + # Only join on attributes with matching lineage + if self.heading[name].lineage == other.heading[name].lineage: + join_attributes.add(name) + + # ... rest of join logic ... +``` + +### `@` Operator Removal (`expression.py`) + +```python +def __matmul__(self, other): + """Removed: Use .join(other, semantic_check=False) instead.""" + raise DataJointError( + "The @ operator has been removed in DataJoint 2.0. " + "Use .join(other, semantic_check=False) for permissive joins." + ) +``` + +## Lineage Table Population + +### At Table Declaration Time + +When a table is declared, populate the `~lineage` table: + +```python +def declare_table(table_class, context): + # ... parse definition ...
+ + lineage_entries = [] + + for attr in definition.attributes: + if attr.from_foreign_key: + # Inherited: copy parent's lineage + parent_lineage = get_lineage( + attr.fk_schema, attr.fk_table, attr.fk_attribute + ) + lineage_entries.append((table_name, attr.name, parent_lineage)) + elif attr.in_key: + # Native primary key: this table is the origin + lineage_entries.append(( + table_name, attr.name, + f"`{schema}`.`{table_name}`.`{attr.name}`" + )) + else: + # Native secondary: no lineage + lineage_entries.append((table_name, attr.name, None)) + + # Insert into ~lineage table + insert_lineage_entries(schema, lineage_entries) +``` + +### Migration for Existing Tables + +For existing schemas without `~lineage` tables: + +1. **Automatic creation**: When DataJoint accesses a schema, check if `~lineage` exists +2. **Lazy population**: Populate entries as tables are accessed +3. **Bulk migration tool**: Provide utility to migrate entire schema + +```python +def migrate_schema_lineage(schema): + """Populate ~lineage table for all tables in schema.""" + create_lineage_table_if_not_exists(schema) + + for table in schema.list_tables(): + populate_lineage_from_dependencies(schema, table) +``` + +## Query Expression Lineage Propagation + +### Projection (`proj`) + +```python +def proj(self, *attributes, **named_attributes): + # ... existing logic ... + + # Lineage handling in select(): + # - Included attributes: preserve lineage + # - Renamed (new_name='old_name'): preserve old_name's lineage + # - Computed (new_name='expr'): lineage = None +``` + +### Aggregation (`aggr`) + +Aggregation creates a new expression with: +- Group attributes retain their lineage from the group operand +- Aggregated attributes have `lineage=None` (they are computations) + +### Union (`+`) + +Union requires all namesake attributes to have matching lineage (enforced via `assert_join_compatibility`). 
+ +## Error Messages + +### Non-Homologous Namesakes + +``` +DataJointError: Cannot join on attribute `id`: different lineages +(`university`.`Student`.`id` vs `university`.`Course`.`id`). +Use .proj() to rename one of the attributes. +``` + +### Deprecated `@` Operator + +``` +DataJointError: The @ operator has been removed in DataJoint 2.0. +Use .join(other, semantic_check=False) for permissive joins. +``` + +### Deprecated `dj.U * table` + +``` +DataJointError: dj.U(...) * table is deprecated in DataJoint 2.0. +Use dj.U(...) & table instead. +``` + +## Testing Strategy + +### Unit Tests + +1. **Lineage computation tests**: + - Native PK attribute has correct lineage + - FK-inherited attribute traces to origin + - Native secondary attribute has null lineage + - Multi-hop FK inheritance traces correctly + +2. **Join matching tests**: + - Homologous namesakes are matched + - Non-homologous namesakes raise error + - `semantic_check=False` bypasses check + +3. **Projection lineage preservation**: + - Included attributes keep lineage + - Renamed attributes keep lineage + - Computed attributes have null lineage + +4. **`dj.U` compatibility**: + - `dj.U & table` works + - `dj.U.aggr(table, ...)` works + - `dj.U - table` raises error + - `dj.U * table` raises deprecation error + +### Integration Tests + +1. **Schema migration**: Existing schema gets `~lineage` table populated correctly +2. **Cross-schema joins**: Lineage traced across schema boundaries +3. **Complex queries**: Multi-join expressions with various lineage scenarios + +### Regression Tests + +Ensure existing well-designed schemas continue to work without modification. + +## Migration Guide + +### For Users + +1. **Review joins on generic attribute names**: Attributes like `id`, `name`, `value`, `type` may trigger non-homologous namesake errors. + +2. **Replace `@` operator**: + ```python + # Old + table1 @ table2 + + # New + table1.join(table2, semantic_check=False) + ``` + +3. 
**Replace `dj.U * table`**: + ```python + # Old + dj.U('attr') * table + + # New + dj.U('attr') & table + ``` + +4. **Resolve namesake conflicts**: + ```python + # If error on Student * Course (both have 'id') + Student * Course.proj(course_id='id') + ``` + +### For Schema Designers + +1. **Use descriptive attribute names**: Prefer `student_id` over `id` to avoid collisions. + +2. **Leverage foreign keys**: Inherited attributes maintain lineage, enabling semantic joins. + +3. **Run migration tool**: Use `dj.migrate_lineage(schema)` to populate lineage tables for existing schemas. + +## Performance Considerations + +### Lineage Table Lookup + +- O(1) lookup per attribute +- Cached in `Heading` object +- No additional queries during normal operations + +### Dependency Graph Fallback + +- First access loads full dependency graph for schema +- Lineage computation is O(depth) per attribute +- Results cached to avoid recomputation + +### Join Operations + +- Lineage comparison adds negligible overhead +- Same attribute matching loop, just with additional comparison + +## Future Considerations + +### Lineage in Query Cache + +Query cache keys should include lineage information to prevent cache collisions between semantically different queries. + +### Cross-Database Lineage + +For schemas spanning multiple databases, lineage format may need to include database identifier: + +``` +`database`.`schema`.`table`.`attribute` +``` + +### Lineage Visualization + +Extend ERD diagrams to show lineage relationships, helping users understand attribute origins. 
+ +## Appendix: Lineage Examples + +### Example 1: Simple FK Chain + +``` +Session(session_id*, date) + ↓ FK +Trial(session_id*, trial_num*, stimulus) + ↓ FK +Response(session_id*, trial_num*, response_time) +``` + +Lineages: +- `Session.session_id` → `"university"."Session"."session_id"` +- `Trial.session_id` → `"university"."Session"."session_id"` (inherited) +- `Trial.trial_num` → `"university"."Trial"."trial_num"` (native PK) +- `Response.session_id` → `"university"."Session"."session_id"` (inherited) +- `Response.trial_num` → `"university"."Trial"."trial_num"` (inherited) + +### Example 2: Secondary FK + +``` +Course(course_id*, title) + ↓ FK (secondary) +Enrollment(student_id*, course_id) +``` + +Lineages: +- `Course.course_id` → `"university"."Course"."course_id"` +- `Enrollment.student_id` → `"university"."Enrollment"."student_id"` (native PK) +- `Enrollment.course_id` → `"university"."Course"."course_id"` (inherited via FK) + +### Example 3: Aliased FK + +``` +Person(person_id*, name) + ↓ FK (aliased) +Marriage(husband*, wife*, date) + where husband -> Person, wife -> Person +``` + +Lineages: +- `Person.person_id` → `"family"."Person"."person_id"` +- `Marriage.husband` → `"family"."Person"."person_id"` (aliased FK) +- `Marriage.wife` → `"family"."Person"."person_id"` (aliased FK) + +Note: `husband` and `wife` have the **same lineage** even though different names. 
+ +### Example 4: Non-Homologous Namesakes + +``` +Student(id*, name) -- id is native PK +Course(id*, title) -- id is native PK +``` + +Lineages: +- `Student.id` → `"university"."Student"."id"` +- `Course.id` → `"university"."Course"."id"` + +`Student * Course` → **Error**: non-homologous namesakes (`id` has different lineages) From b6be2184a16ff3d17abebb75481e7ce1aee22cf3 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 24 Dec 2025 19:34:46 +0000 Subject: [PATCH 02/15] Remove backticks from lineage format in spec Lineage strings now use plain dot notation: schema.table.attribute instead of quoted format: `schema`.`table`.`attribute` --- docs/src/design/semantic-matching-spec.md | 42 +++++++++++------------ 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/docs/src/design/semantic-matching-spec.md b/docs/src/design/semantic-matching-spec.md index ff5363cf1..69c0c0049 100644 --- a/docs/src/design/semantic-matching-spec.md +++ b/docs/src/design/semantic-matching-spec.md @@ -63,20 +63,20 @@ Consider two tables: Lineage identifies the **origin** of an attribute - where it was first defined. It is represented as a string in the format: ``` -`schema_name`.`table_name`.`attribute_name` +schema_name.table_name.attribute_name ``` #### Lineage Assignment Rules 1. **Native primary key attributes** have lineage: ``` - lineage = "`this_schema`.`this_table`.`attr_name`" + lineage = "this_schema.this_table.attr_name" ``` The table where they are originally defined. 2. **Attributes inherited via foreign key** retain their origin lineage: ``` - lineage = "`parent_schema`.`parent_table`.`attr_name`" + lineage = "parent_schema.parent_table.attr_name" ``` Traced to the original definition through the FK chain. 
@@ -234,7 +234,7 @@ def compute_lineage(table, attribute): # Not inherited - check if primary key if attribute in table.primary_key: - return f"`{schema}`.`{table}`.`{attribute}`" + return f"{schema}.{table}.{attribute}" # Native secondary - no lineage return None @@ -261,7 +261,7 @@ Add `lineage` field to the `Attribute` namedtuple: ```python default_attribute_properties = dict( # ... existing fields ... - lineage=None, # NEW: Origin of attribute, e.g. "`schema`.`table`.`attr`" + lineage=None, # NEW: Origin of attribute, e.g. "schema.table.attr" ) ``` @@ -365,7 +365,7 @@ def declare_table(table_class, context): # Native primary key: this table is the origin lineage_entries.append(( table_name, attr.name, - f"`{schema}`.`{table_name}`.`{attr.name}`" + f"{schema}.{table_name}.{attr.name}" )) else: # Native secondary: no lineage @@ -422,7 +422,7 @@ Union requires all namesake attributes to have matching lineage (enforced via `a ``` DataJointError: Cannot join on attribute `id`: different lineages -(`university`.`Student`.`id` vs `university`.`Course`.`id`). +(university.Student.id vs university.Course.id). Use .proj() to rename one of the attributes. 
``` @@ -544,7 +544,7 @@ Query cache keys should include lineage information to prevent cache collisions For schemas spanning multiple databases, lineage format may need to include database identifier: ``` -`database`.`schema`.`table`.`attribute` +database.schema.table.attribute ``` ### Lineage Visualization @@ -564,11 +564,11 @@ Response(session_id*, trial_num*, response_time) ``` Lineages: -- `Session.session_id` → `"university"."Session"."session_id"` -- `Trial.session_id` → `"university"."Session"."session_id"` (inherited) -- `Trial.trial_num` → `"university"."Trial"."trial_num"` (native PK) -- `Response.session_id` → `"university"."Session"."session_id"` (inherited) -- `Response.trial_num` → `"university"."Trial"."trial_num"` (inherited) +- `Session.session_id` → `university.Session.session_id` +- `Trial.session_id` → `university.Session.session_id` (inherited) +- `Trial.trial_num` → `university.Trial.trial_num` (native PK) +- `Response.session_id` → `university.Session.session_id` (inherited) +- `Response.trial_num` → `university.Trial.trial_num` (inherited) ### Example 2: Secondary FK @@ -579,9 +579,9 @@ Enrollment(student_id*, course_id) ``` Lineages: -- `Course.course_id` → `"university"."Course"."course_id"` -- `Enrollment.student_id` → `"university"."Enrollment"."student_id"` (native PK) -- `Enrollment.course_id` → `"university"."Course"."course_id"` (inherited via FK) +- `Course.course_id` → `university.Course.course_id` +- `Enrollment.student_id` → `university.Enrollment.student_id` (native PK) +- `Enrollment.course_id` → `university.Course.course_id` (inherited via FK) ### Example 3: Aliased FK @@ -593,9 +593,9 @@ Marriage(husband*, wife*, date) ``` Lineages: -- `Person.person_id` → `"family"."Person"."person_id"` -- `Marriage.husband` → `"family"."Person"."person_id"` (aliased FK) -- `Marriage.wife` → `"family"."Person"."person_id"` (aliased FK) +- `Person.person_id` → `family.Person.person_id` +- `Marriage.husband` → `family.Person.person_id` 
(aliased FK) +- `Marriage.wife` → `family.Person.person_id` (aliased FK) Note: `husband` and `wife` have the **same lineage** even though different names. @@ -607,7 +607,7 @@ Course(id*, title) -- id is native PK ``` Lineages: -- `Student.id` → `"university"."Student"."id"` -- `Course.id` → `"university"."Course"."id"` +- `Student.id` → `university.Student.id` +- `Course.id` → `university.Course.id` `Student * Course` → **Error**: non-homologous namesakes (`id` has different lineages) From a81d4be1bd32fd84c37fb184c737febf3ac66525 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 24 Dec 2025 19:51:43 +0000 Subject: [PATCH 03/15] Update lineage table design in spec - Only store attributes WITH lineage (native secondary attrs have no entry) - Make lineage column NOT NULL - Add cleanup on table creation (remove leftover entries) - Add cleanup on table drop --- docs/src/design/semantic-matching-spec.md | 30 +++++++++++++++++++---- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/docs/src/design/semantic-matching-spec.md b/docs/src/design/semantic-matching-spec.md index 69c0c0049..81e5c14e0 100644 --- a/docs/src/design/semantic-matching-spec.md +++ b/docs/src/design/semantic-matching-spec.md @@ -198,6 +198,7 @@ For DataJoint-managed schemas: - Lineage is stored explicitly in a hidden table (`~lineage`) per schema - Populated at table declaration time by copying from parent tables +- Only attributes WITH lineage are stored (native secondary attributes have no entry) - Fast O(1) lookup at query time - Authoritative source when present @@ -206,11 +207,15 @@ For DataJoint-managed schemas: CREATE TABLE `schema_name`.`~lineage` ( table_name VARCHAR(64) NOT NULL, attribute_name VARCHAR(64) NOT NULL, - lineage VARCHAR(255), -- NULL for native secondary attrs + lineage VARCHAR(255) NOT NULL, PRIMARY KEY (table_name, attribute_name) ); ``` +**Lifecycle**: +- On table creation: delete any existing entries for that table, then insert new entries +- On table drop: delete 
all entries for that table + #### Method 2: Dependency Graph Traversal Fallback for non-DataJoint schemas or when `~lineage` doesn't exist: @@ -247,6 +252,7 @@ These methods are **mutually exclusive**: ```python def get_lineage(schema, table, attribute): if lineage_table_exists(schema): + # Returns lineage string if entry exists, None otherwise return query_lineage_table(schema, table, attribute) else: return compute_from_dependencies(schema, table, attribute) @@ -352,6 +358,9 @@ When a table is declared, populate the `~lineage` table: def declare_table(table_class, context): # ... parse definition ... + # Remove any leftover entries from previous declaration + delete_lineage_entries(schema, table_name) + lineage_entries = [] for attr in definition.attributes: @@ -360,21 +369,32 @@ def declare_table(table_class, context): parent_lineage = get_lineage( attr.fk_schema, attr.fk_table, attr.fk_attribute ) - lineage_entries.append((table_name, attr.name, parent_lineage)) + if parent_lineage: # Only store if parent has lineage + lineage_entries.append((table_name, attr.name, parent_lineage)) elif attr.in_key: # Native primary key: this table is the origin lineage_entries.append(( table_name, attr.name, f"{schema}.{table_name}.{attr.name}" )) - else: - # Native secondary: no lineage - lineage_entries.append((table_name, attr.name, None)) + # Native secondary attributes: no entry (no lineage) # Insert into ~lineage table insert_lineage_entries(schema, lineage_entries) ``` +### At Table Drop Time + +When a table is dropped, remove its lineage entries: + +```python +def drop_table(table_class): + # ... drop the table ... 
+ + # Clean up lineage entries + delete_lineage_entries(schema, table_name) +``` + ### Migration for Existing Tables For existing schemas without `~lineage` tables: From c597c52cc2dcf6ae39de6b3c0ce384e2fc2bda0a Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 24 Dec 2025 20:02:33 +0000 Subject: [PATCH 04/15] Clarify dj.U syntax and database table names in lineage - Fix dj.U syntax: dj.U() or dj.U('attr1', 'attr2', ...) - Add dj.U() & A example for empty U - Clarify lineage uses database table names (snake_case with tier prefixes) not Python class names (CamelCase) - Update all lineage examples to use proper database table names - Remove cross-database lineage section (three-part lineage is sufficient) - Change "ERD" to "DataJoint diagrams" --- docs/src/design/semantic-matching-spec.md | 71 ++++++++++++----------- 1 file changed, 37 insertions(+), 34 deletions(-) diff --git a/docs/src/design/semantic-matching-spec.md b/docs/src/design/semantic-matching-spec.md index 81e5c14e0..bd698e250 100644 --- a/docs/src/design/semantic-matching-spec.md +++ b/docs/src/design/semantic-matching-spec.md @@ -66,6 +66,12 @@ Lineage identifies the **origin** of an attribute - where it was first defined. schema_name.table_name.attribute_name ``` +**Note**: `table_name` refers to the actual database table name, not the Python class name. DataJoint converts class names (CamelCase) to table names (snake_case) and adds a prefix determined by the table's tier: +- `Session` → `session` (manual table) +- `SessionType` → `#session_type` (lookup table) +- `ProcessingTask` → `_processing_task` (imported table) +- `ProcessedData` → `__processed_data` (computed table) + #### Lineage Assignment Rules 1. **Native primary key attributes** have lineage: @@ -160,7 +166,7 @@ The error message directs users to the explicit `.join()` method. ## Universal Set `dj.U` -`dj.U(attr1, ..., attrn)` represents the universal set of all possible values and lineages.
+`dj.U()` or `dj.U('attr1', 'attr2', ...)` represents the universal set of all possible values and lineages. ### Homology with `dj.U` @@ -174,6 +180,9 @@ dj.U('a', 'b') & A # Aggregation: groups by a, b dj.U('a', 'b').aggr(A, count='count(*)') + +# Empty U for distinct primary keys +dj.U() & A ``` ### Invalid Operations @@ -442,7 +451,7 @@ Union requires all namesake attributes to have matching lineage (enforced via `a ``` DataJointError: Cannot join on attribute `id`: different lineages -(university.Student.id vs university.Course.id). +(university.student.id vs university.course.id). Use .proj() to rename one of the attributes. ``` @@ -559,75 +568,69 @@ Ensure existing well-designed schemas continue to work without modification. Query cache keys should include lineage information to prevent cache collisions between semantically different queries. -### Cross-Database Lineage - -For schemas spanning multiple databases, lineage format may need to include database identifier: - -``` -database.schema.table.attribute -``` - ### Lineage Visualization -Extend ERD diagrams to show lineage relationships, helping users understand attribute origins. +Extend DataJoint diagrams to show lineage relationships, helping users understand attribute origins. ## Appendix: Lineage Examples +These examples show Python class names with their corresponding database table names (in lineage strings). 
+ ### Example 1: Simple FK Chain ``` -Session(session_id*, date) +Session(session_id*, date) # table: session ↓ FK -Trial(session_id*, trial_num*, stimulus) +Trial(session_id*, trial_num*, stimulus) # table: trial ↓ FK -Response(session_id*, trial_num*, response_time) +Response(session_id*, trial_num*, response_time) # table: __response (computed) ``` -Lineages: -- `Session.session_id` → `university.Session.session_id` -- `Trial.session_id` → `university.Session.session_id` (inherited) -- `Trial.trial_num` → `university.Trial.trial_num` (native PK) -- `Response.session_id` → `university.Session.session_id` (inherited) -- `Response.trial_num` → `university.Trial.trial_num` (inherited) +Lineages (using database table names): +- `Session.session_id` → `university.session.session_id` +- `Trial.session_id` → `university.session.session_id` (inherited) +- `Trial.trial_num` → `university.trial.trial_num` (native PK) +- `Response.session_id` → `university.session.session_id` (inherited) +- `Response.trial_num` → `university.trial.trial_num` (inherited) ### Example 2: Secondary FK ``` -Course(course_id*, title) +Course(course_id*, title) # table: course ↓ FK (secondary) -Enrollment(student_id*, course_id) +Enrollment(student_id*, course_id) # table: enrollment ``` Lineages: -- `Course.course_id` → `university.Course.course_id` -- `Enrollment.student_id` → `university.Enrollment.student_id` (native PK) -- `Enrollment.course_id` → `university.Course.course_id` (inherited via FK) +- `Course.course_id` → `university.course.course_id` +- `Enrollment.student_id` → `university.enrollment.student_id` (native PK) +- `Enrollment.course_id` → `university.course.course_id` (inherited via FK) ### Example 3: Aliased FK ``` -Person(person_id*, name) +Person(person_id*, name) # table: person ↓ FK (aliased) -Marriage(husband*, wife*, date) +Marriage(husband*, wife*, date) # table: __marriage (computed) where husband -> Person, wife -> Person ``` Lineages: -- `Person.person_id` → 
`family.Person.person_id` -- `Marriage.husband` → `family.Person.person_id` (aliased FK) -- `Marriage.wife` → `family.Person.person_id` (aliased FK) +- `Person.person_id` → `family.person.person_id` +- `Marriage.husband` → `family.person.person_id` (aliased FK) +- `Marriage.wife` → `family.person.person_id` (aliased FK) Note: `husband` and `wife` have the **same lineage** even though different names. ### Example 4: Non-Homologous Namesakes ``` -Student(id*, name) -- id is native PK -Course(id*, title) -- id is native PK +Student(id*, name) -- id is native PK, table: student +Course(id*, title) -- id is native PK, table: course ``` Lineages: -- `Student.id` → `university.Student.id` -- `Course.id` → `university.Course.id` +- `Student.id` → `university.student.id` +- `Course.id` → `university.course.id` `Student * Course` → **Error**: non-homologous namesakes (`id` has different lineages) From 0c593c824d10c85e113c4190c941b036b2053eab Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 24 Dec 2025 20:11:29 +0000 Subject: [PATCH 05/15] Implement semantic matching for joins This implements semantic matching for DataJoint 2.0 joins as specified in docs/src/design/semantic-matching-spec.md. Key changes: 1. Lineage tracking: - Add `lineage` field to Attribute class (heading.py) - Create lineage.py module for ~lineage table management - Populate lineage at table declaration time - Clean up lineage entries when tables are dropped - Load lineage from database when fetching headings 2. Semantic matching in joins: - Update assert_join_compatibility() to check for non-homologous namesakes - Update join() to only match on homologous namesakes (same name AND lineage) - Lineage is preserved through projections and renames 3. API changes: - Remove @ operator (raises error directing to .join(semantic_check=False)) - dj.U * table raises deprecation error (use dj.U & table instead) - dj.U - table raises error (infinite set) - dj.U is always compatible (contains all possible lineages) 4. 
Tests: - Add comprehensive tests for lineage tracking - Test homologous and non-homologous namesake handling - Test deprecated operator errors - Test dj.U operations with semantic matching --- src/datajoint/condition.py | 34 +++- src/datajoint/expression.py | 84 +++++--- src/datajoint/heading.py | 7 + src/datajoint/lineage.py | 177 +++++++++++++++++ src/datajoint/table.py | 37 ++++ tests/test_semantic_matching.py | 334 ++++++++++++++++++++++++++++++++ 6 files changed, 636 insertions(+), 37 deletions(-) create mode 100644 src/datajoint/lineage.py create mode 100644 tests/test_semantic_matching.py diff --git a/src/datajoint/condition.py b/src/datajoint/condition.py index 8a22d17bb..79c8e69be 100644 --- a/src/datajoint/condition.py +++ b/src/datajoint/condition.py @@ -97,10 +97,13 @@ def __init__(self, restriction): def assert_join_compatibility(expr1, expr2): """ - Determine if expressions expr1 and expr2 are join-compatible. To be join-compatible, - the matching attributes in the two expressions must be in the primary key of one or the - other expression. - Raises an exception if not compatible. + Check semantic compatibility of two expressions for joining. + + Uses semantic matching: attributes are only matched when they share both + the same name AND the same lineage (origin). + + Raises DataJointError if non-homologous namesakes are detected (same name + but different lineage). :param expr1: A QueryExpression object :param expr2: A QueryExpression object @@ -110,14 +113,25 @@ def assert_join_compatibility(expr1, expr2): for rel in (expr1, expr2): if not isinstance(rel, (U, QueryExpression)): raise DataJointError("Object %r is not a QueryExpression and cannot be joined." 
% rel) - if not isinstance(expr1, U) and not isinstance(expr2, U): # dj.U is always compatible - try: + + # dj.U is always compatible - it contains all possible lineages + if isinstance(expr1, U) or isinstance(expr2, U): + return + + # Find namesake attributes (same name in both expressions) + namesakes = set(expr1.heading.names) & set(expr2.heading.names) + + for name in namesakes: + lineage1 = expr1.heading[name].lineage + lineage2 = expr2.heading[name].lineage + + # Non-homologous namesakes: same name, different lineage + if lineage1 != lineage2: raise DataJointError( - "Cannot join query expressions on dependent attribute `%s`" - % next(r for r in set(expr1.heading.secondary_attributes).intersection(expr2.heading.secondary_attributes)) + f"Cannot join on attribute `{name}`: different lineages " + f"({lineage1} vs {lineage2}). " + f"Use .proj() to rename one of the attributes." ) - except StopIteration: - pass # all ok def make_condition(query_expression, condition, columns): diff --git a/src/datajoint/expression.py b/src/datajoint/expression.py index 17d529ff8..69b92163d 100644 --- a/src/datajoint/expression.py +++ b/src/datajoint/expression.py @@ -275,30 +275,45 @@ def __mul__(self, other): def __matmul__(self, other): """ - Permissive join of query expressions `self` and `other` ignoring compatibility check - e.g. ``q1 @ q2``. + The @ operator has been removed in DataJoint 2.0. + Use .join(other, semantic_check=False) for permissive joins. """ - if inspect.isclass(other) and issubclass(other, QueryExpression): - other = other() # instantiate - return self.join(other, semantic_check=False) + raise DataJointError( + "The @ operator has been removed in DataJoint 2.0. " + "Use .join(other, semantic_check=False) for permissive joins." + ) def join(self, other, semantic_check=True, left=False): """ - create the joined QueryExpression. 
- a * b is short for A.join(B) - a @ b is short for A.join(B, semantic_check=False) - Additionally, left=True will retain the rows of self, effectively performing a left join. + Create the joined QueryExpression. + + Uses semantic matching: only attributes with the same name AND the same + lineage (homologous namesakes) are used for joining. + + :param other: QueryExpression to join with + :param semantic_check: If True (default), raise error on non-homologous namesakes. + If False, bypass semantic check (use for legacy compatibility). + :param left: If True, perform a left join retaining all rows from self. + + Examples: + a * b is short for a.join(b) + a.join(b, semantic_check=False) for permissive joins """ - # trigger subqueries if joining on renamed attributes + # Handle U objects: redirect to U's restriction operation if isinstance(other, U): - return other * self + return other & self if inspect.isclass(other) and issubclass(other, QueryExpression): other = other() # instantiate if not isinstance(other, QueryExpression): raise DataJointError("The argument of join must be a QueryExpression") if semantic_check: assert_join_compatibility(self, other) - join_attributes = set(n for n in self.heading.names if n in other.heading.names) + # Only join on homologous namesakes (same name AND same lineage) + join_attributes = set( + n + for n in self.heading.names + if n in other.heading.names and self.heading[n].lineage == other.heading[n].lineage + ) # needs subquery if self's FROM clause has common attributes with other's FROM clause need_subquery1 = need_subquery2 = bool( (set(self.original_heading.names) & set(other.original_heading.names)) - join_attributes @@ -735,9 +750,9 @@ class U: """ dj.U objects are the universal sets representing all possible values of their attributes. dj.U objects cannot be queried on their own but are useful for forming some queries. - dj.U('attr1', ..., 'attrn') represents the universal set with the primary key attributes attr1 ... 
attrn. - The universal set is the set of all possible combinations of values of the attributes. - Without any attributes, dj.U() represents the set with one element that has no attributes. + dj.U() or dj.U('attr1', ..., 'attrn') represents the universal set with the primary key + attributes attr1 ... attrn. Without any attributes, dj.U() represents the set with one + element that has no attributes. Restriction: @@ -747,11 +762,15 @@ class U: >>> dj.U('contrast', 'brightness') & stimulus + Empty U for distinct primary keys: + + >>> dj.U() & expr + Aggregation: In aggregation, dj.U is used for summary calculation over an entire set: - The following expression yields one element with one attribute `s` containing the total number of elements in + The following expression yields one element with one attribute `n` containing the total number of elements in query expression `expr`: >>> dj.U().aggr(expr, n='count(*)') @@ -760,7 +779,7 @@ class U: query expression `expr`. >>> dj.U().aggr(expr, n='count(distinct attr)') - >>> dj.U().aggr(dj.U('attr').aggr(expr), 'n=count(*)') + >>> dj.U().aggr(dj.U('attr').aggr(expr), n='count(*)') The following expression yields one element and one attribute `s` containing the sum of values of attribute `attr` over entire result set of expression `expr`: @@ -770,16 +789,13 @@ class U: The following expression yields the set of all unique combinations of attributes `attr1`, `attr2` and the number of their occurrences in the result set of query expression `expr`. - >>> dj.U(attr1,attr2).aggr(expr, n='count(*)') + >>> dj.U('attr1', 'attr2').aggr(expr, n='count(*)') - Joins: + Homology: - If expression `expr` has attributes 'attr1' and 'attr2', then expr * dj.U('attr1','attr2') yields the same result - as `expr` but `attr1` and `attr2` are promoted to the the primary key. This is useful for producing a join on - non-primary key attributes. 
- For example, if `attr` is in both expr1 and expr2 but not in their primary keys, then expr1 * expr2 will throw - an error because in most cases, it does not make sense to join on non-primary key attributes and users must first - rename `attr` in one of the operands. The expression dj.U('attr') * rel1 * rel2 overrides this constraint. + Since dj.U conceptually contains all possible lineages, its attributes are homologous to + any namesake attribute in other expressions. This makes dj.U always compatible for + semantic matching in joins and restrictions. """ def __init__(self, *primary_key): @@ -826,8 +842,22 @@ def join(self, other, left=False): return result def __mul__(self, other): - """shorthand for join""" - return self.join(other) + """ + dj.U * table is deprecated in DataJoint 2.0. + Use dj.U & table instead. + """ + raise DataJointError( + "dj.U(...) * table is deprecated in DataJoint 2.0. " + "Use dj.U(...) & table instead." + ) + + def __sub__(self, other): + """ + dj.U - table produces an infinite set and is not supported. + """ + raise DataJointError( + "dj.U(...) - table produces an infinite set and is not supported." 
+ ) def aggr(self, group, **named_attributes): """ diff --git a/src/datajoint/heading.py b/src/datajoint/heading.py index ecf2d6ba0..dc305db71 100644 --- a/src/datajoint/heading.py +++ b/src/datajoint/heading.py @@ -7,6 +7,7 @@ from .attribute_adapter import get_adapter from .attribute_type import AttributeType +from .lineage import get_all_lineages from .declare import ( EXTERNAL_TYPES, NATIVE_TYPES, @@ -73,6 +74,7 @@ def decode(self, stored, *, key=None): attribute_expression=None, database=None, dtype=object, + lineage=None, # Origin of attribute: "schema.table.attribute" or None for native secondary ) @@ -406,6 +408,11 @@ def _init_from_database(self): # restore adapted type name attr["type"] = adapter_name + # Load lineage data from ~lineage table + lineages = get_all_lineages(conn, database, table_name) + for attr in attributes: + attr["lineage"] = lineages.get(attr["name"]) + self._attributes = dict(((q["name"], Attribute(**q)) for q in attributes)) # Read and tabulate secondary indexes diff --git a/src/datajoint/lineage.py b/src/datajoint/lineage.py new file mode 100644 index 000000000..50ce33a4d --- /dev/null +++ b/src/datajoint/lineage.py @@ -0,0 +1,177 @@ +""" +Lineage tracking for semantic matching in joins. + +Lineage identifies the origin of an attribute - where it was first defined. 
+It is represented as a string in the format: "schema.table.attribute" + +Only attributes WITH lineage are stored in the ~lineage table: +- Native primary key attributes: lineage is this table +- FK-inherited attributes: lineage is traced to the origin +- Native secondary attributes: no lineage (no entry in table) +""" + +import logging + +logger = logging.getLogger(__name__.split(".")[0]) + +LINEAGE_TABLE_NAME = "~lineage" + + +def _lineage_table_sql(database): + """Generate SQL to create the ~lineage table.""" + return f""" + CREATE TABLE IF NOT EXISTS `{database}`.`{LINEAGE_TABLE_NAME}` ( + table_name VARCHAR(64) NOT NULL, + attribute_name VARCHAR(64) NOT NULL, + lineage VARCHAR(255) NOT NULL, + PRIMARY KEY (table_name, attribute_name) + ) ENGINE=InnoDB + """ + + +def ensure_lineage_table(connection, database): + """Create the ~lineage table if it doesn't exist.""" + connection.query(_lineage_table_sql(database)) + + +def lineage_table_exists(connection, database): + """Check if the ~lineage table exists in the schema.""" + result = connection.query( + """ + SELECT COUNT(*) FROM information_schema.tables + WHERE table_schema = %s AND table_name = %s + """, + args=(database, LINEAGE_TABLE_NAME), + ) + return result.fetchone()[0] > 0 + + +def get_lineage(connection, database, table_name, attribute_name): + """ + Get lineage for an attribute from the ~lineage table. + + Returns the lineage string if found, None otherwise (indicating no lineage + or attribute is a native secondary). + """ + if not lineage_table_exists(connection, database): + return None + + result = connection.query( + f""" + SELECT lineage FROM `{database}`.`{LINEAGE_TABLE_NAME}` + WHERE table_name = %s AND attribute_name = %s + """, + args=(table_name, attribute_name), + ) + row = result.fetchone() + return row[0] if row else None + + +def get_all_lineages(connection, database, table_name): + """ + Get all lineage entries for a table. + + Returns a dict mapping attribute_name -> lineage. 
+ Attributes not in the dict have no lineage (native secondary). + """ + if not lineage_table_exists(connection, database): + return {} + + result = connection.query( + f""" + SELECT attribute_name, lineage FROM `{database}`.`{LINEAGE_TABLE_NAME}` + WHERE table_name = %s + """, + args=(table_name,), + ) + return {row[0]: row[1] for row in result} + + +def delete_lineage_entries(connection, database, table_name): + """Delete all lineage entries for a table.""" + if not lineage_table_exists(connection, database): + return + + connection.query( + f""" + DELETE FROM `{database}`.`{LINEAGE_TABLE_NAME}` + WHERE table_name = %s + """, + args=(table_name,), + ) + + +def insert_lineage_entries(connection, database, entries): + """ + Insert lineage entries for a table. + + :param entries: list of (table_name, attribute_name, lineage) tuples + """ + if not entries: + return + + ensure_lineage_table(connection, database) + + # Use INSERT ... ON DUPLICATE KEY UPDATE to handle re-declarations + for table_name, attribute_name, lineage in entries: + connection.query( + f""" + INSERT INTO `{database}`.`{LINEAGE_TABLE_NAME}` + (table_name, attribute_name, lineage) + VALUES (%s, %s, %s) + ON DUPLICATE KEY UPDATE lineage = VALUES(lineage) + """, + args=(table_name, attribute_name, lineage), + ) + + +def compute_lineage_from_dependencies(connection, full_table_name, attribute_name, primary_key): + """ + Compute lineage by traversing FK relationships. + + Fallback method when ~lineage table doesn't exist. 
+ + :param connection: database connection + :param full_table_name: fully qualified table name like `schema`.`table` + :param attribute_name: the attribute to compute lineage for + :param primary_key: list of primary key attribute names for this table + :return: lineage string or None + """ + connection.dependencies.load(force=False) + + # Parse database and table name + parts = full_table_name.replace("`", "").split(".") + database = parts[0] + table_name = parts[1] + + # Check if attribute is inherited via FK + parents = connection.dependencies.parents(full_table_name) + for parent_table, props in parents.items(): + # Skip alias nodes (numeric strings) + if parent_table.isdigit(): + # Get the actual parent through the alias + grandparents = connection.dependencies.parents(parent_table) + if grandparents: + parent_table, props = next(iter(grandparents.items())) + + attr_map = props.get("attr_map", {}) + if attribute_name in attr_map: + parent_attr = attr_map[attribute_name] + parent_parts = parent_table.replace("`", "").split(".") + parent_db = parent_parts[0] + parent_tbl = parent_parts[1] + + # Get parent's primary key + parent_pk = connection.dependencies.nodes.get(parent_table, {}).get("primary_key", set()) + + # Recursively trace to origin + return compute_lineage_from_dependencies( + connection, parent_table, parent_attr, list(parent_pk) + ) + + # Not inherited - check if primary key + if attribute_name in primary_key: + return f"{database}.{table_name}.{attribute_name}" + + # Native secondary - no lineage + return None diff --git a/src/datajoint/table.py b/src/datajoint/table.py index d94dfd66f..5cedacfdc 100644 --- a/src/datajoint/table.py +++ b/src/datajoint/table.py @@ -16,6 +16,11 @@ from .condition import make_condition from .declare import alter, declare +from .lineage import ( + compute_lineage_from_dependencies, + delete_lineage_entries, + insert_lineage_entries, +) from .errors import ( AccessError, DataJointError, @@ -114,6 +119,8 @@ def 
declare(self, context=None): pass else: self._log("Declared " + self.full_table_name) + # Populate lineage entries for semantic matching + self._populate_lineage() def alter(self, prompt=True, context=None): """ @@ -148,6 +155,34 @@ def alter(self, prompt=True, context=None): logger.info("Table altered") self._log("Altered " + self.full_table_name) + def _populate_lineage(self): + """ + Populate lineage entries for this table in the ~lineage table. + + Called after table declaration to enable semantic matching in joins. + """ + # Force reload dependencies to include the newly declared table + self.connection.dependencies.load(force=True) + + # Remove any leftover entries from previous declaration + delete_lineage_entries(self.connection, self.database, self.table_name) + + # Compute lineage for each attribute and collect entries + lineage_entries = [] + for attr_name in self.heading.names: + lineage = compute_lineage_from_dependencies( + self.connection, + self.full_table_name, + attr_name, + self.heading.primary_key, + ) + if lineage: # Only store attributes with lineage + lineage_entries.append((self.table_name, attr_name, lineage)) + + # Insert entries + if lineage_entries: + insert_lineage_entries(self.connection, self.database, lineage_entries) + def from_clause(self): """ :return: the FROM clause of SQL SELECT statements. @@ -750,6 +785,8 @@ def drop_quick(self): if self.is_declared: query = "DROP TABLE %s" % self.full_table_name self.connection.query(query) + # Clean up lineage entries + delete_lineage_entries(self.connection, self.database, self.table_name) logger.info("Dropped table %s" % self.full_table_name) self._log(query[:255]) else: diff --git a/tests/test_semantic_matching.py b/tests/test_semantic_matching.py new file mode 100644 index 000000000..b7278b063 --- /dev/null +++ b/tests/test_semantic_matching.py @@ -0,0 +1,334 @@ +""" +Tests for semantic matching in joins. 
+ +Semantic matching ensures that attributes are only matched when they share +both the same name AND the same lineage (origin). +""" + +import pytest + +import datajoint as dj +from datajoint.errors import DataJointError + + +@pytest.fixture +def schema_lineage(connection): + """ + Create a schema with tables for testing lineage tracking and semantic matching. + """ + schema = dj.Schema("test_lineage", connection=connection, create_schema=True) + + @schema + class Person(dj.Manual): + definition = """ + person_id : int + --- + name : varchar(100) + """ + + @schema + class Course(dj.Manual): + definition = """ + course_id : int + --- + title : varchar(100) + """ + + @schema + class Student(dj.Manual): + definition = """ + -> Person + --- + enrollment_year : int + """ + + @schema + class Instructor(dj.Manual): + definition = """ + -> Person + --- + department : varchar(100) + """ + + @schema + class Enrollment(dj.Manual): + definition = """ + -> Student + -> Course + --- + grade : varchar(2) + """ + + @schema + class Teaching(dj.Manual): + definition = """ + -> Instructor + -> Course + --- + semester : varchar(20) + """ + + # Tables with non-homologous namesakes (different lineages for same name) + @schema + class TableWithId1(dj.Manual): + definition = """ + id : int # native PK - lineage is this table + --- + value1 : int + """ + + @schema + class TableWithId2(dj.Manual): + definition = """ + id : int # native PK - lineage is this table (different from TableWithId1) + --- + value2 : int + """ + + # Insert test data + Person.insert( + [ + {"person_id": 1, "name": "Alice"}, + {"person_id": 2, "name": "Bob"}, + {"person_id": 3, "name": "Charlie"}, + ], + skip_duplicates=True, + ) + Course.insert( + [ + {"course_id": 101, "title": "Math"}, + {"course_id": 102, "title": "Physics"}, + ], + skip_duplicates=True, + ) + Student.insert( + [ + {"person_id": 1, "enrollment_year": 2020}, + {"person_id": 2, "enrollment_year": 2021}, + ], + skip_duplicates=True, + ) + 
Instructor.insert( + [ + {"person_id": 3, "department": "Science"}, + ], + skip_duplicates=True, + ) + Enrollment.insert( + [ + {"person_id": 1, "course_id": 101, "grade": "A"}, + {"person_id": 1, "course_id": 102, "grade": "B"}, + {"person_id": 2, "course_id": 101, "grade": "B"}, + ], + skip_duplicates=True, + ) + Teaching.insert( + [ + {"person_id": 3, "course_id": 101, "semester": "Fall 2023"}, + {"person_id": 3, "course_id": 102, "semester": "Spring 2024"}, + ], + skip_duplicates=True, + ) + TableWithId1.insert( + [{"id": 1, "value1": 10}, {"id": 2, "value1": 20}], + skip_duplicates=True, + ) + TableWithId2.insert( + [{"id": 1, "value2": 100}, {"id": 2, "value2": 200}], + skip_duplicates=True, + ) + + yield { + "schema": schema, + "Person": Person, + "Course": Course, + "Student": Student, + "Instructor": Instructor, + "Enrollment": Enrollment, + "Teaching": Teaching, + "TableWithId1": TableWithId1, + "TableWithId2": TableWithId2, + } + + schema.drop(force=True) + + +class TestLineageTracking: + """Test that lineage is correctly tracked for attributes.""" + + def test_native_pk_has_lineage(self, schema_lineage): + """Native primary key attributes should have lineage pointing to their table.""" + Person = schema_lineage["Person"] + lineage = Person.heading["person_id"].lineage + assert lineage is not None + assert "person" in lineage + assert "person_id" in lineage + + def test_fk_inherited_has_parent_lineage(self, schema_lineage): + """FK-inherited attributes should have lineage tracing to their origin.""" + Student = schema_lineage["Student"] + # person_id is inherited from Person + lineage = Student.heading["person_id"].lineage + assert lineage is not None + assert "person" in lineage # Should trace to Person table + assert "person_id" in lineage + + def test_native_secondary_has_no_lineage(self, schema_lineage): + """Native secondary attributes should have no lineage.""" + Person = schema_lineage["Person"] + lineage = Person.heading["name"].lineage + assert 
lineage is None + + def test_fk_chain_preserves_lineage(self, schema_lineage): + """Lineage should be preserved through FK chains.""" + Enrollment = schema_lineage["Enrollment"] + # person_id traces through Student -> Person + lineage = Enrollment.heading["person_id"].lineage + assert lineage is not None + assert "person" in lineage + + +class TestHomologousNamesakes: + """Test that homologous namesakes (same name AND same lineage) are correctly matched.""" + + def test_join_on_homologous_namesakes(self, schema_lineage): + """Tables with shared FK origin should join on that attribute.""" + Student = schema_lineage["Student"] + Enrollment = schema_lineage["Enrollment"] + + # Both have person_id with same lineage (from Person) + result = Student * Enrollment + assert len(result) > 0 + assert "person_id" in result.heading.names + + def test_multi_table_join_with_shared_fk(self, schema_lineage): + """Multiple tables sharing same FK origin should join correctly.""" + Enrollment = schema_lineage["Enrollment"] + Teaching = schema_lineage["Teaching"] + + # Both have course_id with same lineage (from Course) + result = Enrollment * Teaching + assert len(result) > 0 + assert "course_id" in result.heading.names + + +class TestNonHomologousNamesakes: + """Test that non-homologous namesakes (same name, different lineage) raise errors.""" + + def test_different_pk_origins_raise_error(self, schema_lineage): + """Tables with same-named PKs but different origins should raise error.""" + TableWithId1 = schema_lineage["TableWithId1"] + TableWithId2 = schema_lineage["TableWithId2"] + + # Both have 'id' but with different lineages + with pytest.raises(DataJointError) as exc_info: + TableWithId1 * TableWithId2 + + assert "different lineages" in str(exc_info.value) + assert "id" in str(exc_info.value) + + def test_semantic_check_false_bypasses_error(self, schema_lineage): + """semantic_check=False should bypass the lineage check.""" + TableWithId1 = schema_lineage["TableWithId1"] + 
TableWithId2 = schema_lineage["TableWithId2"] + + # Should not raise with semantic_check=False + result = TableWithId1().join(TableWithId2(), semantic_check=False) + assert result is not None + + +class TestDeprecatedOperators: + """Test that deprecated operators raise appropriate errors.""" + + def test_matmul_operator_removed(self, schema_lineage): + """The @ operator should raise an error directing to .join().""" + Person = schema_lineage["Person"] + Student = schema_lineage["Student"] + + with pytest.raises(DataJointError) as exc_info: + Person @ Student + + assert "@ operator has been removed" in str(exc_info.value) + assert ".join" in str(exc_info.value) + assert "semantic_check=False" in str(exc_info.value) + + def test_u_mul_deprecated(self, schema_lineage): + """dj.U * table should raise deprecation error.""" + Person = schema_lineage["Person"] + + with pytest.raises(DataJointError) as exc_info: + dj.U("person_id") * Person + + assert "deprecated" in str(exc_info.value).lower() + assert "&" in str(exc_info.value) + + def test_u_sub_not_supported(self, schema_lineage): + """dj.U - table should raise error (infinite set).""" + Person = schema_lineage["Person"] + + with pytest.raises(DataJointError) as exc_info: + dj.U("person_id") - Person + + assert "infinite" in str(exc_info.value).lower() + + +class TestUniversalSet: + """Test dj.U operations with semantic matching.""" + + def test_u_restriction_works(self, schema_lineage): + """dj.U & table should work correctly.""" + Person = schema_lineage["Person"] + + result = dj.U("person_id") & Person + assert len(result) == len(Person) + assert "person_id" in result.primary_key + + def test_u_empty_restriction(self, schema_lineage): + """dj.U() & table should return distinct primary keys.""" + Person = schema_lineage["Person"] + + result = dj.U() & Person + assert len(result) == len(Person) + + def test_u_aggr_works(self, schema_lineage): + """dj.U().aggr() should work correctly.""" + Person = 
schema_lineage["Person"] + + result = dj.U().aggr(Person, n="count(*)") + assert len(result) == 1 + row = result.fetch1() + assert row["n"] == len(Person) + + def test_u_is_always_compatible(self, schema_lineage): + """dj.U should be compatible with any expression (contains all lineages).""" + TableWithId1 = schema_lineage["TableWithId1"] + + # U should be compatible even with tables that have unique lineages + result = dj.U("id") & TableWithId1 + assert len(result) > 0 + + +class TestLineageInProjection: + """Test that lineage is preserved correctly in projection operations.""" + + def test_included_attrs_preserve_lineage(self, schema_lineage): + """Projected attributes should preserve their lineage.""" + Student = schema_lineage["Student"] + + projected = Student.proj() + assert projected.heading["person_id"].lineage == Student.heading["person_id"].lineage + + def test_renamed_attrs_preserve_lineage(self, schema_lineage): + """Renamed attributes should preserve their original lineage.""" + Student = schema_lineage["Student"] + + renamed = Student.proj(pid="person_id") + # The renamed attribute should have the same lineage as the original + assert renamed.heading["pid"].lineage == Student.heading["person_id"].lineage + + def test_computed_attrs_have_no_lineage(self, schema_lineage): + """Computed attributes should have no lineage.""" + Student = schema_lineage["Student"] + + computed = Student.proj(doubled="enrollment_year * 2") + assert computed.heading["doubled"].lineage is None From fe58e002fa2a7875a9d41d2a7379ac1779783d25 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 24 Dec 2025 20:18:55 +0000 Subject: [PATCH 06/15] Deprecate dj.U join instead of redirecting to restrict - dj.U * table now raises a deprecation error instead of redirecting to dj.U & table (they are different operations) - Remove unused variables in lineage.py --- src/datajoint/expression.py | 20 ++++++-------------- src/datajoint/lineage.py | 7 +------ 2 files changed, 7 insertions(+), 20 
deletions(-) diff --git a/src/datajoint/expression.py b/src/datajoint/expression.py index 69b92163d..bf168a930 100644 --- a/src/datajoint/expression.py +++ b/src/datajoint/expression.py @@ -279,8 +279,7 @@ def __matmul__(self, other): Use .join(other, semantic_check=False) for permissive joins. """ raise DataJointError( - "The @ operator has been removed in DataJoint 2.0. " - "Use .join(other, semantic_check=False) for permissive joins." + "The @ operator has been removed in DataJoint 2.0. " "Use .join(other, semantic_check=False) for permissive joins." ) def join(self, other, semantic_check=True, left=False): @@ -299,9 +298,9 @@ def join(self, other, semantic_check=True, left=False): a * b is short for a.join(b) a.join(b, semantic_check=False) for permissive joins """ - # Handle U objects: redirect to U's restriction operation + # U joins are deprecated - raise error directing to use & instead if isinstance(other, U): - return other & self + raise DataJointError("dj.U(...) * table is deprecated in DataJoint 2.0. " "Use dj.U(...) & table instead.") if inspect.isclass(other) and issubclass(other, QueryExpression): other = other() # instantiate if not isinstance(other, QueryExpression): @@ -310,9 +309,7 @@ def join(self, other, semantic_check=True, left=False): assert_join_compatibility(self, other) # Only join on homologous namesakes (same name AND same lineage) join_attributes = set( - n - for n in self.heading.names - if n in other.heading.names and self.heading[n].lineage == other.heading[n].lineage + n for n in self.heading.names if n in other.heading.names and self.heading[n].lineage == other.heading[n].lineage ) # needs subquery if self's FROM clause has common attributes with other's FROM clause need_subquery1 = need_subquery2 = bool( @@ -846,18 +843,13 @@ def __mul__(self, other): dj.U * table is deprecated in DataJoint 2.0. Use dj.U & table instead. """ - raise DataJointError( - "dj.U(...) * table is deprecated in DataJoint 2.0. " - "Use dj.U(...) 
& table instead." - ) + raise DataJointError("dj.U(...) * table is deprecated in DataJoint 2.0. " "Use dj.U(...) & table instead.") def __sub__(self, other): """ dj.U - table produces an infinite set and is not supported. """ - raise DataJointError( - "dj.U(...) - table produces an infinite set and is not supported." - ) + raise DataJointError("dj.U(...) - table produces an infinite set and is not supported.") def aggr(self, group, **named_attributes): """ diff --git a/src/datajoint/lineage.py b/src/datajoint/lineage.py index 50ce33a4d..d3e96b007 100644 --- a/src/datajoint/lineage.py +++ b/src/datajoint/lineage.py @@ -157,17 +157,12 @@ def compute_lineage_from_dependencies(connection, full_table_name, attribute_nam attr_map = props.get("attr_map", {}) if attribute_name in attr_map: parent_attr = attr_map[attribute_name] - parent_parts = parent_table.replace("`", "").split(".") - parent_db = parent_parts[0] - parent_tbl = parent_parts[1] # Get parent's primary key parent_pk = connection.dependencies.nodes.get(parent_table, {}).get("primary_key", set()) # Recursively trace to origin - return compute_lineage_from_dependencies( - connection, parent_table, parent_attr, list(parent_pk) - ) + return compute_lineage_from_dependencies(connection, parent_table, parent_attr, list(parent_pk)) # Not inherited - check if primary key if attribute_name in primary_key: From 097cb3918c1dff8c8365aeee420e706cf631a4a4 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 24 Dec 2025 20:19:30 +0000 Subject: [PATCH 07/15] Combine _lineage_table_sql into ensure_lineage_table --- src/datajoint/lineage.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/src/datajoint/lineage.py b/src/datajoint/lineage.py index d3e96b007..0db902900 100644 --- a/src/datajoint/lineage.py +++ b/src/datajoint/lineage.py @@ -17,21 +17,18 @@ LINEAGE_TABLE_NAME = "~lineage" -def _lineage_table_sql(database): - """Generate SQL to create the ~lineage table.""" - return f""" +def 
ensure_lineage_table(connection, database): + """Create the ~lineage table if it doesn't exist.""" + connection.query( + f""" CREATE TABLE IF NOT EXISTS `{database}`.`{LINEAGE_TABLE_NAME}` ( table_name VARCHAR(64) NOT NULL, attribute_name VARCHAR(64) NOT NULL, lineage VARCHAR(255) NOT NULL, PRIMARY KEY (table_name, attribute_name) ) ENGINE=InnoDB - """ - - -def ensure_lineage_table(connection, database): - """Create the ~lineage table if it doesn't exist.""" - connection.query(_lineage_table_sql(database)) + """ + ) def lineage_table_exists(connection, database): From b568997ae8dd0614163e35ed1bd81b86e9f2af2a Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 24 Dec 2025 20:28:20 +0000 Subject: [PATCH 08/15] Use exception handling instead of table existence check - Remove lineage_table_exists function - Catch exceptions in get_lineage, get_all_lineages, delete_lineage_entries - Single round trip to database, no race conditions --- docs/src/design/semantic-matching-spec.md | 4 +- src/datajoint/lineage.py | 77 ++++++++++------------- 2 files changed, 36 insertions(+), 45 deletions(-) diff --git a/docs/src/design/semantic-matching-spec.md b/docs/src/design/semantic-matching-spec.md index bd698e250..215004ea9 100644 --- a/docs/src/design/semantic-matching-spec.md +++ b/docs/src/design/semantic-matching-spec.md @@ -260,10 +260,10 @@ These methods are **mutually exclusive**: ```python def get_lineage(schema, table, attribute): - if lineage_table_exists(schema): + try: # Returns lineage string if entry exists, None otherwise return query_lineage_table(schema, table, attribute) - else: + except TableDoesNotExist: return compute_from_dependencies(schema, table, attribute) ``` diff --git a/src/datajoint/lineage.py b/src/datajoint/lineage.py index 0db902900..155bada31 100644 --- a/src/datajoint/lineage.py +++ b/src/datajoint/lineage.py @@ -31,18 +31,6 @@ def ensure_lineage_table(connection, database): ) -def lineage_table_exists(connection, database): - """Check if the ~lineage 
table exists in the schema.""" - result = connection.query( - """ - SELECT COUNT(*) FROM information_schema.tables - WHERE table_schema = %s AND table_name = %s - """, - args=(database, LINEAGE_TABLE_NAME), - ) - return result.fetchone()[0] > 0 - - def get_lineage(connection, database, table_name, attribute_name): """ Get lineage for an attribute from the ~lineage table. @@ -50,19 +38,20 @@ def get_lineage(connection, database, table_name, attribute_name): Returns the lineage string if found, None otherwise (indicating no lineage or attribute is a native secondary). """ - if not lineage_table_exists(connection, database): + try: + result = connection.query( + f""" + SELECT lineage FROM `{database}`.`{LINEAGE_TABLE_NAME}` + WHERE table_name = %s AND attribute_name = %s + """, + args=(table_name, attribute_name), + ) + row = result.fetchone() + return row[0] if row else None + except Exception: + # Table doesn't exist yet return None - result = connection.query( - f""" - SELECT lineage FROM `{database}`.`{LINEAGE_TABLE_NAME}` - WHERE table_name = %s AND attribute_name = %s - """, - args=(table_name, attribute_name), - ) - row = result.fetchone() - return row[0] if row else None - def get_all_lineages(connection, database, table_name): """ @@ -71,31 +60,33 @@ def get_all_lineages(connection, database, table_name): Returns a dict mapping attribute_name -> lineage. Attributes not in the dict have no lineage (native secondary). 
""" - if not lineage_table_exists(connection, database): + try: + result = connection.query( + f""" + SELECT attribute_name, lineage FROM `{database}`.`{LINEAGE_TABLE_NAME}` + WHERE table_name = %s + """, + args=(table_name,), + ) + return {row[0]: row[1] for row in result} + except Exception: + # Table doesn't exist yet return {} - result = connection.query( - f""" - SELECT attribute_name, lineage FROM `{database}`.`{LINEAGE_TABLE_NAME}` - WHERE table_name = %s - """, - args=(table_name,), - ) - return {row[0]: row[1] for row in result} - def delete_lineage_entries(connection, database, table_name): """Delete all lineage entries for a table.""" - if not lineage_table_exists(connection, database): - return - - connection.query( - f""" - DELETE FROM `{database}`.`{LINEAGE_TABLE_NAME}` - WHERE table_name = %s - """, - args=(table_name,), - ) + try: + connection.query( + f""" + DELETE FROM `{database}`.`{LINEAGE_TABLE_NAME}` + WHERE table_name = %s + """, + args=(table_name,), + ) + except Exception: + # Table doesn't exist yet - nothing to delete + pass def insert_lineage_entries(connection, database, entries): From 036e0f23cd443905ab6e1cb1f7896b171d44d3af Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 24 Dec 2025 20:33:12 +0000 Subject: [PATCH 09/15] Remove unnecessary ON DUPLICATE KEY UPDATE in insert_lineage_entries --- src/datajoint/lineage.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/datajoint/lineage.py b/src/datajoint/lineage.py index 155bada31..c6f84ec56 100644 --- a/src/datajoint/lineage.py +++ b/src/datajoint/lineage.py @@ -100,14 +100,12 @@ def insert_lineage_entries(connection, database, entries): ensure_lineage_table(connection, database) - # Use INSERT ... 
ON DUPLICATE KEY UPDATE to handle re-declarations for table_name, attribute_name, lineage in entries: connection.query( f""" INSERT INTO `{database}`.`{LINEAGE_TABLE_NAME}` (table_name, attribute_name, lineage) VALUES (%s, %s, %s) - ON DUPLICATE KEY UPDATE lineage = VALUES(lineage) """, args=(table_name, attribute_name, lineage), ) From 5606348c9e8eed84e1a1184d25d36fa2555fafab Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 24 Dec 2025 20:36:21 +0000 Subject: [PATCH 10/15] Catch MissingTableError instead of bare Exception --- src/datajoint/lineage.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/datajoint/lineage.py b/src/datajoint/lineage.py index c6f84ec56..7e2e44c29 100644 --- a/src/datajoint/lineage.py +++ b/src/datajoint/lineage.py @@ -12,6 +12,8 @@ import logging +from .errors import MissingTableError + logger = logging.getLogger(__name__.split(".")[0]) LINEAGE_TABLE_NAME = "~lineage" @@ -48,8 +50,8 @@ def get_lineage(connection, database, table_name, attribute_name): ) row = result.fetchone() return row[0] if row else None - except Exception: - # Table doesn't exist yet + except MissingTableError: + # ~lineage table doesn't exist yet return None @@ -69,8 +71,8 @@ def get_all_lineages(connection, database, table_name): args=(table_name,), ) return {row[0]: row[1] for row in result} - except Exception: - # Table doesn't exist yet + except MissingTableError: + # ~lineage table doesn't exist yet return {} @@ -84,8 +86,8 @@ def delete_lineage_entries(connection, database, table_name): """, args=(table_name,), ) - except Exception: - # Table doesn't exist yet - nothing to delete + except MissingTableError: + # ~lineage table doesn't exist yet - nothing to delete pass From 6c6362524c5684d7036d7292059664da37a351c1 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 24 Dec 2025 20:48:17 +0000 Subject: [PATCH 11/15] Rename ensure_lineage_table to create_lineage_table Also: - Remove redundant U check for second operand in 
assert_join_compatibility - Suggest .join(semantic_check=False) in error message for non-homologous namesakes --- src/datajoint/condition.py | 5 +++-- src/datajoint/lineage.py | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/datajoint/condition.py b/src/datajoint/condition.py index 79c8e69be..8830a3850 100644 --- a/src/datajoint/condition.py +++ b/src/datajoint/condition.py @@ -115,7 +115,7 @@ def assert_join_compatibility(expr1, expr2): raise DataJointError("Object %r is not a QueryExpression and cannot be joined." % rel) # dj.U is always compatible - it contains all possible lineages - if isinstance(expr1, U) or isinstance(expr2, U): + if isinstance(expr1, U): return # Find namesake attributes (same name in both expressions) @@ -130,7 +130,8 @@ def assert_join_compatibility(expr1, expr2): raise DataJointError( f"Cannot join on attribute `{name}`: different lineages " f"({lineage1} vs {lineage2}). " - f"Use .proj() to rename one of the attributes." + f"Use .proj() to rename one of the attributes or " + f".join(semantic_check=False) to force a natural join." 
) diff --git a/src/datajoint/lineage.py b/src/datajoint/lineage.py index 7e2e44c29..893892b45 100644 --- a/src/datajoint/lineage.py +++ b/src/datajoint/lineage.py @@ -19,7 +19,7 @@ LINEAGE_TABLE_NAME = "~lineage" -def ensure_lineage_table(connection, database): +def create_lineage_table(connection, database): """Create the ~lineage table if it doesn't exist.""" connection.query( f""" @@ -100,7 +100,7 @@ def insert_lineage_entries(connection, database, entries): if not entries: return - ensure_lineage_table(connection, database) + create_lineage_table(connection, database) for table_name, attribute_name, lineage in entries: connection.query( From e64e7a0ee202f18694767dd8ebe7cc16d6565328 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 24 Dec 2025 23:56:04 +0000 Subject: [PATCH 12/15] Update documentation and docstrings for semantic matching - Update spec document to match actual implementation: - assert_join_compatibility only checks if expr1 is U (not expr2) - Error message includes .join(semantic_check=False) suggestion - Use create_lineage_table function name - Add complete parameter documentation to all lineage.py functions --- docs/src/design/semantic-matching-spec.md | 9 ++++--- src/datajoint/lineage.py | 30 ++++++++++++++++++----- 2 files changed, 29 insertions(+), 10 deletions(-) diff --git a/docs/src/design/semantic-matching-spec.md b/docs/src/design/semantic-matching-spec.md index 215004ea9..a563249db 100644 --- a/docs/src/design/semantic-matching-spec.md +++ b/docs/src/design/semantic-matching-spec.md @@ -306,7 +306,7 @@ def assert_join_compatibility(expr1, expr2): Raises DataJointError if non-homologous namesakes are detected. 
""" - if isinstance(expr1, U) or isinstance(expr2, U): + if isinstance(expr1, U): return # U is always compatible # Find namesake attributes (same name in both) @@ -320,7 +320,8 @@ def assert_join_compatibility(expr1, expr2): raise DataJointError( f"Cannot join on attribute `{name}`: " f"different lineages ({lineage1} vs {lineage2}). " - f"Use .proj() to rename one of the attributes." + f"Use .proj() to rename one of the attributes or " + f".join(semantic_check=False) to force a natural join." ) ``` @@ -415,7 +416,7 @@ For existing schemas without `~lineage` tables: ```python def migrate_schema_lineage(schema): """Populate ~lineage table for all tables in schema.""" - create_lineage_table_if_not_exists(schema) + create_lineage_table(schema) for table in schema.list_tables(): populate_lineage_from_dependencies(schema, table) @@ -452,7 +453,7 @@ Union requires all namesake attributes to have matching lineage (enforced via `a ``` DataJointError: Cannot join on attribute `id`: different lineages (university.student.id vs university.course.id). -Use .proj() to rename one of the attributes. +Use .proj() to rename one of the attributes or .join(semantic_check=False) to force a natural join. ``` ### Deprecated `@` Operator diff --git a/src/datajoint/lineage.py b/src/datajoint/lineage.py index 893892b45..efa5fee3c 100644 --- a/src/datajoint/lineage.py +++ b/src/datajoint/lineage.py @@ -20,7 +20,12 @@ def create_lineage_table(connection, database): - """Create the ~lineage table if it doesn't exist.""" + """ + Create the ~lineage table if it doesn't exist. + + :param connection: database connection + :param database: schema/database name + """ connection.query( f""" CREATE TABLE IF NOT EXISTS `{database}`.`{LINEAGE_TABLE_NAME}` ( @@ -37,8 +42,11 @@ def get_lineage(connection, database, table_name, attribute_name): """ Get lineage for an attribute from the ~lineage table. 
- Returns the lineage string if found, None otherwise (indicating no lineage - or attribute is a native secondary). + :param connection: database connection + :param database: schema/database name + :param table_name: name of the table + :param attribute_name: name of the attribute + :return: lineage string if found, None otherwise (no lineage or native secondary) """ try: result = connection.query( @@ -59,8 +67,10 @@ def get_all_lineages(connection, database, table_name): """ Get all lineage entries for a table. - Returns a dict mapping attribute_name -> lineage. - Attributes not in the dict have no lineage (native secondary). + :param connection: database connection + :param database: schema/database name + :param table_name: name of the table + :return: dict mapping attribute_name -> lineage (attributes not in dict have no lineage) """ try: result = connection.query( @@ -77,7 +87,13 @@ def get_all_lineages(connection, database, table_name): def delete_lineage_entries(connection, database, table_name): - """Delete all lineage entries for a table.""" + """ + Delete all lineage entries for a table. + + :param connection: database connection + :param database: schema/database name + :param table_name: name of the table + """ try: connection.query( f""" @@ -95,6 +111,8 @@ def insert_lineage_entries(connection, database, entries): """ Insert lineage entries for a table. 
+ :param connection: database connection + :param database: schema/database name :param entries: list of (table_name, attribute_name, lineage) tuples """ if not entries: From d68de161ad141c5cd207826a64dc471889de99de Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 01:10:51 +0000 Subject: [PATCH 13/15] Add functional dependency check for aggregation operator In A.aggr(B, ...), ensures every entry in B matches exactly one entry in A: - B must have all of A's primary key attributes - Primary key attributes must be homologous (same lineage) - Clear error messages for missing attributes or non-homologous lineage Updated docstrings for: - Aggregation.create() - QueryExpression.aggr() - U.aggr() Updated spec document with: - Functional dependency requirements - Error message examples - Additional test cases Co-authored-by: dimitri-yatsenko --- docs/src/design/semantic-matching-spec.md | 47 ++++++++++++++- src/datajoint/expression.py | 72 +++++++++++++++++++---- 2 files changed, 105 insertions(+), 14 deletions(-) diff --git a/docs/src/design/semantic-matching-spec.md b/docs/src/design/semantic-matching-spec.md index a563249db..1cfa5d0de 100644 --- a/docs/src/design/semantic-matching-spec.md +++ b/docs/src/design/semantic-matching-spec.md @@ -136,7 +136,7 @@ Semantic matching applies to all binary operations that match attributes: | `A * B` | Join | Matches on homologous namesakes | | `A & B` | Restriction | Matches on homologous namesakes | | `A - B` | Anti-restriction | Matches on homologous namesakes | -| `A.aggr(B, ...)` | Aggregation | Matches on homologous namesakes | +| `A.aggr(B, ...)` | Aggregation | Requires functional dependency (see below) | ### The `.join()` Method @@ -438,8 +438,27 @@ def proj(self, *attributes, **named_attributes): ### Aggregation (`aggr`) -Aggregation creates a new expression with: -- Group attributes retain their lineage from the group operand +In `A.aggr(B, ...)`, entries from B are grouped by A's primary key and aggregate 
functions are computed. + +**Functional Dependency Requirement**: Every entry in B must match exactly one entry in A. This requires: + +1. **B must have all of A's primary key attributes**: If A's primary key is `(a, b)`, then B must contain attributes named `a` and `b`. + +2. **Primary key attributes must be homologous**: The namesake attributes in B must have the same lineage as in A. This ensures they represent the same entity. + +```python +# Valid: Session.aggr(Trial, ...) where Trial has session_id from Session +Session.aggr(Trial, n='count(*)') # OK - Trial.session_id traces to Session.session_id + +# Invalid: Missing primary key attribute +Session.aggr(Stimulus, n='count(*)') # Error if Stimulus lacks session_id + +# Invalid: Non-homologous primary key +TableA.aggr(TableB, n='count(*)') # Error if TableB.id has different lineage than TableA.id +``` + +**Result lineage**: +- Group attributes retain their lineage from the grouping expression (A) - Aggregated attributes have `lineage=None` (they are computations) ### Union (`+`) @@ -470,6 +489,22 @@ DataJointError: dj.U(...) * table is deprecated in DataJoint 2.0. Use dj.U(...) & table instead. ``` +### Aggregation Missing Primary Key + +``` +DataJointError: Aggregation requires functional dependency: `group` must have all primary key +attributes of the grouping expression. Missing: {'session_id'}. +Use .proj() to add the missing attributes or verify the schema design. +``` + +### Aggregation Non-Homologous Primary Key + +``` +DataJointError: Aggregation requires homologous primary key attributes. +Attribute `id` has different lineages: university.student.id (grouping) vs university.course.id (group). +Use .proj() to rename one of the attributes or .join(semantic_check=False) in a manual aggregation. +``` + ## Testing Strategy ### Unit Tests @@ -496,6 +531,12 @@ Use dj.U(...) & table instead. - `dj.U - table` raises error - `dj.U * table` raises deprecation error +5. 
**Aggregation functional dependency**: + - `A.aggr(B)` works when B has all of A's PK attributes with same lineage + - `A.aggr(B)` raises error when B is missing PK attributes + - `A.aggr(B)` raises error when PK attributes have different lineage + - `dj.U('a', 'b').aggr(B)` works when B has `a` and `b` attributes + ### Integration Tests 1. **Schema migration**: Existing schema gets `~lineage` table populated correctly diff --git a/src/datajoint/expression.py b/src/datajoint/expression.py index bf168a930..edf997312 100644 --- a/src/datajoint/expression.py +++ b/src/datajoint/expression.py @@ -464,13 +464,20 @@ def proj(self, *attributes, **named_attributes): def aggr(self, group, *attributes, keep_all_rows=False, **named_attributes): """ - Aggregation of the type U('attr1','attr2').aggr(group, computation="QueryExpression") - has the primary key ('attr1','attr2') and performs aggregation computations for all matching elements of `group`. + Aggregate `group` over the primary key of `self`. - :param group: The query expression to be aggregated. - :param keep_all_rows: True=keep all the rows from self. False=keep only rows that match entries in group. - :param named_attributes: computations of the form new_attribute="sql expression on attributes of group" - :return: The derived query expression + In A.aggr(B, ...), groups entries from B by the primary key of A and computes + aggregate functions. Requires functional dependency: every entry in B must match + exactly one entry in A. This means B must have all of A's primary key attributes + as homologous namesakes (same name AND same lineage). + + :param group: the query expression to aggregate (B in A.aggr(B)) + :param attributes: attributes from self to include in the result + :param keep_all_rows: True=keep all rows from self (left join). False=keep only matching rows. 
+ :param named_attributes: aggregation computations, e.g., count='count(*)', avg_val='avg(value)' + :return: query expression with self's primary key and the computed aggregations + :raises DataJointError: if group is missing primary key attributes from self, + or if namesake primary key attributes have different lineages """ if Ellipsis in attributes: # expand ellipsis to include only attributes from the left table @@ -631,9 +638,47 @@ class Aggregation(QueryExpression): @classmethod def create(cls, arg, group, keep_all_rows=False): + """ + Create an aggregation expression. + + For A.aggr(B, ...), ensures functional dependency: every entry in B must match + exactly one entry in A. This requires B to have all of A's primary key attributes + as homologous namesakes (same name AND same lineage). + + :param arg: the grouping expression (A in A.aggr(B)) + :param group: the expression to aggregate (B in A.aggr(B)) + :param keep_all_rows: if True, keep all rows from arg (left join behavior) + :raises DataJointError: if group is missing any primary key attributes from arg, + or if namesake attributes have different lineages + """ if inspect.isclass(group) and issubclass(group, QueryExpression): group = group() # instantiate if a class assert isinstance(group, QueryExpression) + + # Check functional dependency: group must have all of arg's primary key attributes + missing_pk = set(arg.primary_key) - set(group.heading.names) + if missing_pk: + raise DataJointError( + f"Aggregation requires functional dependency: `group` must have all primary key " + f"attributes of the grouping expression. Missing: {missing_pk}. " + f"Use .proj() to add the missing attributes or verify the schema design." 
+ ) + + # Check that primary key attributes are homologous (same lineage) + # This is done for QueryExpression args; U is always compatible + if not isinstance(arg, U): + for attr_name in arg.primary_key: + arg_lineage = arg.heading[attr_name].lineage + group_lineage = group.heading[attr_name].lineage + if arg_lineage != group_lineage: + raise DataJointError( + f"Aggregation requires homologous primary key attributes. " + f"Attribute `{attr_name}` has different lineages: " + f"{arg_lineage} (grouping) vs {group_lineage} (group). " + f"Use .proj() to rename one of the attributes or " + f".join(semantic_check=False) in a manual aggregation." + ) + if keep_all_rows and len(group.support) > 1 or group.heading.new_attributes: group = group.make_subquery() # subquery if left joining a join join = arg.join(group, left=keep_all_rows) # reuse the join logic @@ -853,12 +898,17 @@ def __sub__(self, other): def aggr(self, group, **named_attributes): """ - Aggregation of the type U('attr1','attr2').aggr(group, computation="QueryExpression") - has the primary key ('attr1','attr2') and performs aggregation computations for all matching elements of `group`. + Aggregate `group` over the attributes of this universal set. + + In dj.U('attr1', 'attr2').aggr(B, ...), groups entries from B by attr1 and attr2 + and computes aggregate functions. Requires B to have all specified attributes. + Since dj.U is homologous to any namesake attribute, lineage compatibility is + always satisfied. - :param group: The query expression to be aggregated. 
- :param named_attributes: computations of the form new_attribute="sql expression on attributes of group" - :return: The derived query expression + :param group: the query expression to aggregate + :param named_attributes: aggregation computations, e.g., count='count(*)', avg_val='avg(value)' + :return: query expression with U's attributes as primary key and the computed aggregations + :raises DataJointError: if group is missing any of U's primary key attributes """ if named_attributes.get("keep_all_rows", False): raise DataJointError("Cannot set keep_all_rows=True when aggregating on a universal set.") From 844179375ed3ee0848275f8e163446a955e693e9 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 01:13:30 +0000 Subject: [PATCH 14/15] Clarify that aggregation attributes can be primary or secondary in B In A.aggr(B), B must have all of A's primary key attributes, but these can be either primary or secondary attributes in B. Co-authored-by: dimitri-yatsenko --- docs/src/design/semantic-matching-spec.md | 4 ++-- src/datajoint/expression.py | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/docs/src/design/semantic-matching-spec.md b/docs/src/design/semantic-matching-spec.md index 1cfa5d0de..7963eeeff 100644 --- a/docs/src/design/semantic-matching-spec.md +++ b/docs/src/design/semantic-matching-spec.md @@ -442,9 +442,9 @@ In `A.aggr(B, ...)`, entries from B are grouped by A's primary key and aggregate **Functional Dependency Requirement**: Every entry in B must match exactly one entry in A. This requires: -1. **B must have all of A's primary key attributes**: If A's primary key is `(a, b)`, then B must contain attributes named `a` and `b`. +1. **B must have all of A's primary key attributes**: If A's primary key is `(a, b)`, then B must contain attributes named `a` and `b`. These attributes can be either primary or secondary in B. -2. 
**Primary key attributes must be homologous**: The namesake attributes in B must have the same lineage as in A. This ensures they represent the same entity. +2. **These attributes must be homologous**: The namesake attributes in B must have the same lineage as in A. This ensures they represent the same entity. ```python # Valid: Session.aggr(Trial, ...) where Trial has session_id from Session diff --git a/src/datajoint/expression.py b/src/datajoint/expression.py index edf997312..62359be94 100644 --- a/src/datajoint/expression.py +++ b/src/datajoint/expression.py @@ -469,7 +469,8 @@ def aggr(self, group, *attributes, keep_all_rows=False, **named_attributes): In A.aggr(B, ...), groups entries from B by the primary key of A and computes aggregate functions. Requires functional dependency: every entry in B must match exactly one entry in A. This means B must have all of A's primary key attributes - as homologous namesakes (same name AND same lineage). + as homologous namesakes (same name AND same lineage). These attributes can be + either primary or secondary in B. :param group: the query expression to aggregate (B in A.aggr(B)) :param attributes: attributes from self to include in the result @@ -643,7 +644,8 @@ def create(cls, arg, group, keep_all_rows=False): For A.aggr(B, ...), ensures functional dependency: every entry in B must match exactly one entry in A. This requires B to have all of A's primary key attributes - as homologous namesakes (same name AND same lineage). + as homologous namesakes (same name AND same lineage). These attributes can be + either primary or secondary in B. 
:param arg: the grouping expression (A in A.aggr(B)) :param group: the expression to aggregate (B in A.aggr(B)) From 7c600b838b5d377090bbd486428d0bd2b5379c4c Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 03:49:43 +0000 Subject: [PATCH 15/15] Deprecate ^ operator, add semantic_check to restrict() - Add semantic_check parameter to restrict() method (default True) - When semantic_check=False, wraps restriction in PromiscuousOperand - Deprecate ^ operator with error directing to .restrict(semantic_check=False) - Update spec document with .restrict() method and ^ deprecation - Add migration guide entry for ^ operator replacement Co-authored-by: dimitri-yatsenko --- docs/src/design/semantic-matching-spec.md | 46 ++++++++++++++++++++++- src/datajoint/expression.py | 25 ++++++++---- 2 files changed, 61 insertions(+), 10 deletions(-) diff --git a/docs/src/design/semantic-matching-spec.md b/docs/src/design/semantic-matching-spec.md index 7963eeeff..876aeba46 100644 --- a/docs/src/design/semantic-matching-spec.md +++ b/docs/src/design/semantic-matching-spec.md @@ -164,6 +164,32 @@ A.join(B, semantic_check=False) # Explicit bypass The error message directs users to the explicit `.join()` method. +### The `.restrict()` Method + +The `.restrict()` method provides additional control for restrictions: + +```python +# Default: semantic checking enabled (same as &) +result = A.restrict(B) + +# Bypass semantic check for legacy compatibility +result = A.restrict(B, semantic_check=False) +``` + +### Removal of `^` Operator + +The `^` operator (permissive restriction) is **removed** in DataJoint 2.0: + +```python +# Old (deprecated): +A ^ B # Raises DataJointError with migration guidance + +# New: +A.restrict(B, semantic_check=False) # Explicit bypass +``` + +The error message directs users to the explicit `.restrict()` method. + ## Universal Set `dj.U` `dj.U()` or `dj.U('attr1', 'attr2', ...)` represents the universal set of all possible values and lineages. 
@@ -482,6 +508,13 @@ DataJointError: The @ operator has been removed in DataJoint 2.0. Use .join(other, semantic_check=False) for permissive joins. ``` +### Deprecated `^` Operator + +``` +DataJointError: The ^ operator has been removed in DataJoint 2.0. +Use .restrict(other, semantic_check=False) for permissive restrictions. +``` + ### Deprecated `dj.U * table` ``` @@ -562,7 +595,16 @@ Ensure existing well-designed schemas continue to work without modification. table1.join(table2, semantic_check=False) ``` -3. **Replace `dj.U * table`**: +3. **Replace `^` operator**: + ```python + # Old + table1 ^ table2 + + # New + table1.restrict(table2, semantic_check=False) + ``` + +4. **Replace `dj.U * table`**: ```python # Old dj.U('attr') * table @@ -571,7 +613,7 @@ Ensure existing well-designed schemas continue to work without modification. dj.U('attr') & table ``` -4. **Resolve namesake conflicts**: +5. **Resolve namesake conflicts**: ```python # If error on Student * Course (both have 'id') Student * Course.proj(course_id='id') diff --git a/src/datajoint/expression.py b/src/datajoint/expression.py index 62359be94..9c6203bdb 100644 --- a/src/datajoint/expression.py +++ b/src/datajoint/expression.py @@ -152,9 +152,10 @@ def make_subquery(self): result._heading = self.heading.make_subquery_heading() return result - def restrict(self, restriction): + def restrict(self, restriction, semantic_check=True): """ Produces a new expression with the new restriction applied. + rel.restrict(restriction) is equivalent to rel & restriction. rel.restrict(Not(restriction)) is equivalent to rel - restriction The primary key of the result is unaffected. @@ -195,7 +196,9 @@ def restrict(self, restriction): ultimately call restrict() :param restriction: a sequence or an array (treated as OR list), another QueryExpression, an SQL condition - string, or an AndList. + string, or an AndList. + :param semantic_check: if True (default), check that namesake attributes have the same lineage. 
+ If False, bypass semantic check (use for legacy compatibility or intentional cross-lineage restriction). """ attributes = set() if isinstance(restriction, Top): @@ -204,6 +207,12 @@ def restrict(self, restriction): ) # make subquery to avoid overwriting existing Top result._top = restriction return result + # Wrap in PromiscuousOperand if semantic check is disabled + if not semantic_check: + if isinstance(restriction, Not): + restriction = Not(PromiscuousOperand(restriction.restriction)) + else: + restriction = PromiscuousOperand(restriction) new_condition = make_condition(self, restriction, attributes) if new_condition is True: return self # restriction has no effect, return the same object @@ -241,13 +250,13 @@ def __and__(self, restriction): def __xor__(self, restriction): """ - Permissive restriction operator ignoring compatibility check e.g. ``q1 ^ q2``. + The ^ operator (permissive restriction) has been removed in DataJoint 2.0. + Use .restrict(other, semantic_check=False) instead. """ - if inspect.isclass(restriction) and issubclass(restriction, QueryExpression): - restriction = restriction() - if isinstance(restriction, Not): - return self.restrict(Not(PromiscuousOperand(restriction.restriction))) - return self.restrict(PromiscuousOperand(restriction)) + raise DataJointError( + "The ^ operator has been removed in DataJoint 2.0. " + "Use .restrict(other, semantic_check=False) for permissive restrictions." + ) def __sub__(self, restriction): """