From bcf809b700a54a55258bbb47f4e1e72700bdde63 Mon Sep 17 00:00:00 2001 From: Ken Simpson Date: Sat, 4 Oct 2025 13:29:09 -0700 Subject: [PATCH 1/8] Added a 'how it works' --- HOWITWORKS.md | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 HOWITWORKS.md diff --git a/HOWITWORKS.md b/HOWITWORKS.md new file mode 100644 index 0000000..6248a91 --- /dev/null +++ b/HOWITWORKS.md @@ -0,0 +1,75 @@ +ProofOfThought uses a Large Language Model (LLM) to translate natural +language questions into formal logic programs, which are then solved by the Z3 +theorem prover to determine a final `True` or `False` answer. + +Here is a step-by-step breakdown of how the project functions: + +### 1. High-Level User Query + +A user starts by posing a question in natural language to the `ProofOfThought` class, which is the main high-level API for the system. + +```python +// From README.md +pot = ProofOfThought(llm_client=client) +result = pot.query("Would Nancy Pelosi publicly denounce abortion?") +``` + +### 2. Program Generation via LLM + +Instead of answering the question directly, the system uses an LLM (like GPT-4o) to act as a programmer. + +* **Prompting**: The user's question is embedded into a highly detailed prompt template. This prompt instructs the LLM on how to decompose the question and convert it into a JSON-based Domain-Specific Language (DSL) that represents the problem's logic. +* **Logical Decomposition**: The LLM breaks down the question. For "Would Nancy Pelosi publicly denounce abortion?", it reasons that: + 1. Nancy Pelosi is a person and abortion is an issue. + 2. A known fact is that Nancy Pelosi supports abortion rights. + 3. A general rule is that people do not publicly denounce things they support. +* **JSON Program Creation**: The LLM translates this reasoning into a JSON program. This program defines sorts (types like `Person`), constants (`nancy_pelosi`), functions (`supports`), a knowledge base (facts and rules), and a single verification condition to test the original question. + +```json +// Example generated JSON from the project's documentation +{ + "sorts": [...], + "functions": [...], + "constants": { + "persons": {"sort": "Person", "members": ["nancy_pelosi"]}, + "issues": {"sort": "Issue", "members": ["abortion"]} + }, + "knowledge_base": [ + {"assertion": "supports(nancy_pelosi, abortion)"}, + {"assertion": "ForAll([p, i], Implies(supports(p, i), Not(publicly_denounce(p, i))))"} + ], + "verifications": [ + {"name": "Pelosi Denounce Abortion", "constraint": "publicly_denounce(nancy_pelosi, abortion)"} + ], + "actions": ["verify_conditions"] +} +``` + +### 3. Z3 Theorem Proving + +* **Interpretation**: The generated JSON program is passed to the `Z3JSONInterpreter`. This component reads the file and translates the DSL into commands and assertions for the Z3 theorem prover. +* **Execution**: The Z3 solver loads the facts from the `"knowledge_base"` and then checks if the `"constraint"` from the `"verifications"` section can be true. The result is one of the following: + * **`UNSAT` (Unsatisfiable)**: The constraint contradicts the knowledge base. The system interprets this as `False`. + * **`SAT` (Satisfiable)**: The constraint is logically consistent with the knowledge base. The system interprets this as `True`. + * **`UNKNOWN`**: The solver timed out. + +In the Nancy Pelosi example, Z3 would find that "publicly_denounce(nancy_pelosi, abortion)" is a contradiction to the knowledge base, resulting in `UNSAT` and a final answer of `False`. + +### 4. Self-Correction and Error Handling + +A key feature of the project is its automatic retry mechanism. If any step fails, the system attempts to correct itself. + +* **JSON or Z3 Errors**: If the LLM generates invalid JSON, or if the Z3 interpreter fails (e.g., due to a logical error in the program), the error message is captured. +* **Feedback Loop**: The system initiates another call to the LLM. This time, it provides the original question, the faulty program, and the specific error trace. It then asks the LLM to "fix the JSON accordingly." +* **Retry Attempts**: This feedback loop continues for a configurable number of attempts (`max_attempts`), allowing the LLM to iteratively debug and correct its own generated code. + +### 5. Architecture and Evaluation + +The project is structured into two main layers, making it both powerful and easy to use: + +* **High-level API (`z3dsl.reasoning`)**: Provides the simple `ProofOfThought` and `EvaluationPipeline` classes for end-users, abstracting away the complex underlying logic. +* **Low-level DSL (`z3dsl`)**: Consists of the interpreter, verifier, and program generator that handle the core workflow of converting questions to JSON and executing them with Z3. + +The system also includes an `EvaluationPipeline` to benchmark performance on +datasets like StrategyQA, automatically calculating metrics such as accuracy, +precision, and recall. From c7d669873c5fdb468e581688ebfc7b95f2bc3871 Mon Sep 17 00:00:00 2001 From: Ken Simpson Date: Sat, 4 Oct 2025 13:52:37 -0700 Subject: [PATCH 2/8] Added an introductory paragraph and changed the data structure's code format to 'javascript' which formats propertly (it's not valid JSON so parts were rendering in red color in Github) --- HOWITWORKS.md | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/HOWITWORKS.md b/HOWITWORKS.md index 6248a91..2633e7d 100644 --- a/HOWITWORKS.md +++ b/HOWITWORKS.md @@ -1,12 +1,25 @@ -ProofOfThought uses a Large Language Model (LLM) to translate natural -language questions into formal logic programs, which are then solved by the Z3 -theorem prover to determine a final `True` or `False` answer. - -Here is a step-by-step breakdown of how the project functions: +`ProofOfThought` is a groundbreaking initiative designed to solve one of the +most significant challenges in modern artificial intelligence: the tendency of +Large Language Models (LLMs) to "hallucinate" or generate plausible but +logically flawed answers. Its core innovation lies in creating a powerful +hybrid reasoning system that combines the flexible, human-like understanding of +LLMs with the rigorous, verifiable certainty of a formal theorem prover. +Instead of simply trusting an LLM's direct answer, this project tasks the LLM +with a more structured goal: to act as a programmer that translates a user's +question into a symbolic logic program. This program is then executed by the Z3 +theorem prover, a trusted engine that guarantees the final `True` or `False` +result is not just a guess, but a conclusion derived from logically consistent +facts and rules. The most inventive aspect is its automated self-correction +loop; if the generated logic program fails, the system feeds the error back to +the LLM, enabling it to learn from its mistakes and refine its reasoning until +it produces a program that is both syntactically correct and logically sound. + +Here's how it works: ### 1. High-Level User Query -A user starts by posing a question in natural language to the `ProofOfThought` class, which is the main high-level API for the system. +A user starts by posing a question in natural language to the `ProofOfThought` +class, which is the main high-level API for the system. ```python // From README.md @@ -16,7 +29,7 @@ result = pot.query("Would Nancy Pelosi publicly denounce abortion?") ### 2. Program Generation via LLM -Instead of answering the question directly, the system uses an LLM (like GPT-4o) to act as a programmer. +Instead of answering the question directly, the system uses an LLM (e.g. `GPT-5`) to act as a programmer. * **Prompting**: The user's question is embedded into a highly detailed prompt template. This prompt instructs the LLM on how to decompose the question and convert it into a JSON-based Domain-Specific Language (DSL) that represents the problem's logic. * **Logical Decomposition**: The LLM breaks down the question. For "Would Nancy Pelosi publicly denounce abortion?", it reasons that: @@ -25,7 +38,7 @@ Instead of answering the question directly, the system uses an LLM (like GPT-4o) 3. A general rule is that people do not publicly denounce things they support. * **JSON Program Creation**: The LLM translates this reasoning into a JSON program. This program defines sorts (types like `Person`), constants (`nancy_pelosi`), functions (`supports`), a knowledge base (facts and rules), and a single verification condition to test the original question. -```json +```javascript // Example generated JSON from the project's documentation { "sorts": [...], From 7043af2073c9280d2f15692bd3ab40872634e98c Mon Sep 17 00:00:00 2001 From: Ken Simpson Date: Sat, 4 Oct 2025 13:54:15 -0700 Subject: [PATCH 3/8] More formatting fixes --- HOWITWORKS.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/HOWITWORKS.md b/HOWITWORKS.md index 2633e7d..98df530 100644 --- a/HOWITWORKS.md +++ b/HOWITWORKS.md @@ -22,7 +22,10 @@ A user starts by posing a question in natural language to the `ProofOfThought` class, which is the main high-level API for the system. ```python -// From README.md +from openai import OpenAI +from z3dsl.reasoning import ProofOfThought + +client = OpenAI() pot = ProofOfThought(llm_client=client) result = pot.query("Would Nancy Pelosi publicly denounce abortion?") ``` From efaca286b0141dc225a0493e5f01333e1d6e74a1 Mon Sep 17 00:00:00 2001 From: Ken Simpson Date: Sat, 4 Oct 2025 13:55:06 -0700 Subject: [PATCH 4/8] Add a title --- HOWITWORKS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/HOWITWORKS.md b/HOWITWORKS.md index 98df530..17818d8 100644 --- a/HOWITWORKS.md +++ b/HOWITWORKS.md @@ -1,3 +1,5 @@ +## How `ProofOfThought` Works + `ProofOfThought` is a groundbreaking initiative designed to solve one of the most significant challenges in modern artificial intelligence: the tendency of Large Language Models (LLMs) to "hallucinate" or generate plausible but From d7293ec112e0c7623d0790c9c8badfbfca0b3720 Mon Sep 17 00:00:00 2001 From: Ken Simpson Date: Sat, 4 Oct 2025 13:56:15 -0700 Subject: [PATCH 5/8] Formatting in the intro paragraph, and removing some fluff. --- HOWITWORKS.md | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/HOWITWORKS.md b/HOWITWORKS.md index 17818d8..9a4a36a 100644 --- a/HOWITWORKS.md +++ b/HOWITWORKS.md @@ -1,20 +1,20 @@ ## How `ProofOfThought` Works -`ProofOfThought` is a groundbreaking initiative designed to solve one of the -most significant challenges in modern artificial intelligence: the tendency of -Large Language Models (LLMs) to "hallucinate" or generate plausible but -logically flawed answers. Its core innovation lies in creating a powerful -hybrid reasoning system that combines the flexible, human-like understanding of -LLMs with the rigorous, verifiable certainty of a formal theorem prover. -Instead of simply trusting an LLM's direct answer, this project tasks the LLM -with a more structured goal: to act as a programmer that translates a user's -question into a symbolic logic program. This program is then executed by the Z3 -theorem prover, a trusted engine that guarantees the final `True` or `False` -result is not just a guess, but a conclusion derived from logically consistent -facts and rules. The most inventive aspect is its automated self-correction -loop; if the generated logic program fails, the system feeds the error back to -the LLM, enabling it to learn from its mistakes and refine its reasoning until -it produces a program that is both syntactically correct and logically sound. +`ProofOfThought` solves one of the most significant challenges in modern +artificial intelligence: the tendency of Large Language Models (LLMs) to +"hallucinate" or generate plausible but *logically flawed* answers. Its core +innovation lies in creating a powerful hybrid reasoning system that combines +the flexible, human-like understanding of LLMs with the rigorous, verifiable +certainty of a formal theorem prover. Instead of simply trusting an LLM's +direct answer, this project tasks the LLM with a more structured goal: to act +as a programmer that translates a user's question into a symbolic logic +program. This program is then executed by the Z3 theorem prover, a trusted +engine that guarantees the final `True` or `False` result is not just a guess, +but a conclusion derived from logically consistent facts and rules. The most +inventive aspect is its automated self-correction loop; if the generated logic +program fails, the system feeds the error back to the LLM, enabling it to learn +from its mistakes and refine its reasoning until it produces a program that is +both syntactically correct and logically sound. Here's how it works: From c17cec9e820d0cdd7df427912a159d1aec59b92f Mon Sep 17 00:00:00 2001 From: Ken Simpson Date: Sun, 5 Oct 2025 11:40:58 -0700 Subject: [PATCH 6/8] Added a section on limitations and security considerations. --- HOWITWORKS.md | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/HOWITWORKS.md b/HOWITWORKS.md index 9a4a36a..366dec8 100644 --- a/HOWITWORKS.md +++ b/HOWITWORKS.md @@ -91,3 +91,50 @@ The project is structured into two main layers, making it both powerful and easy The system also includes an `EvaluationPipeline` to benchmark performance on datasets like StrategyQA, automatically calculating metrics such as accuracy, precision, and recall. + +## Limitations and Security Considerations + +While the ProofOfThought architecture is innovative in its use of Z3 as a +logical backstop, the LLM's role as the "programmer" remains a potential point +of failure, especially concerning bias and fallacies. + +For a large system where manual inspection is impossible, ensuring logical +integrity requires a multi-layered, automated "defense-in-depth" strategy. +Here’s how you could approach this: + +### 1. Advanced Prompt Engineering and Constraints + +The first line of defense is to constrain the LLM's behavior before it even +generates the program. + +* **Principle of Charity and Neutrality:** The prompt can be explicitly engineered to instruct the LLM to act as a neutral logician. You can include instructions like: "Adhere to the principle of charity: represent all arguments and premises in their strongest, most plausible form, not as caricatures. Avoid using emotionally loaded terms in function or constant names. Decompose facts into neutral, verifiable statements." +* **Mandatory Assumption Listing:** For questions involving subjective or ambiguous topics, you can require the LLM to create a specific section in its reasoning (or even within the JSON) called `"Assumptions"`. This makes implicit biases explicit. For example, in a political question, an assumption might be `"The primary goal of a political party is to win elections."` This assumption can then be flagged for review. +* **Fallacy Checklist Instruction:** The prompt can conclude with a command: "Before finalizing the program, review your logical decomposition for common fallacies, including *ad hominem*, *straw man*, *false dichotomy*, and *overgeneralization*. If any are present, reformulate the logic to be more precise." + +### 2. Automated Program Auditing (The Most Critical Step) + +This is where you build automated systems to inspect the generated JSON program *before* it gets to the Z3 solver. This is the scalable replacement for manual reading. + +* **The "Auditor LLM" Pattern:** A powerful technique is to use a second, independent LLM as a dedicated auditor. + * **How it works:** The first LLM (the "Programmer") generates the JSON. This JSON is then passed to a second LLM (the "Auditor") with a completely different prompt: *"You are a specialist in detecting logical fallacies and cognitive biases. Analyze the following JSON program. Does the 'knowledge_base' contain biased assumptions, stereotypes, or fallacious reasoning? Is the 'verification' constraint a neutral and accurate representation of the original question? Provide a 'risk score' from 1-10 and explain your reasoning."* + * **Why it's effective:** This creates an adversarial setup. The Auditor LLM isn't tasked with solving the problem, only with critiquing the logic. It's more likely to catch flaws. If the risk score is high, the program can be rejected and sent back to the Programmer LLM with the Auditor's feedback. + +* **Semantic Rule-Based Checkers:** You can programmatically scan the generated JSON for common fallacy patterns: + * **Overgeneralization (`ForAll` misuse):** Automatically flag any `ForAll` quantifier that applies to a broad, diverse class of humans with a subjective predicate. For example, a rule like `"ForAll([p], Implies(is_politician(p), is_dishonest(p)))"` is a gross overgeneralization and can be caught by a simple programmatic check. + * **Ad Hominem Detection:** Scan for functions that attack a person's character rather than their arguments. Functions named `is_untrustworthy(person)` or `has_bad_motives(person)` are red flags that can be automatically detected. + * **Source Citation Requirement:** Modify the system to require the LLM to add metadata to its `knowledge_base` facts, indicating their source (e.g., `"assertion": "supports(nancy_pelosi, abortion)", "source": "public voting record"`). The automated auditor can then flag any "facts" that are asserted without a source. + +### 3. Leveraging the Z3 Solver Itself for Sanity Checks + +While Z3 can't detect bias in premises, it is the ultimate tool for detecting logical contradictions that often arise from flawed or biased reasoning. + +* **Detecting Inconsistent Biases:** Sometimes, an LLM might introduce multiple, contradictory biases into the knowledge base. For instance, it might assert `"All regulations harm the economy"` and also `"Environmental regulations are necessary for long-term growth."` Before even checking the main verification, the system can ask Z3 to simply check if the `knowledge_base` is consistent on its own. If Z3 returns `UNSAT`, it means the LLM's foundational axioms are contradictory, and the program can be rejected without further analysis. + +### 4. System-Level Architectural Patterns + +For extremely high-stakes applications, more complex system designs can be used. + +* **Multi-Agent Debate:** Instead of one LLM, have three independent LLMs generate a logic program for the same question. Then, a "Reconciler" agent (which could be another LLM or a programmatic rule-set) compares the three programs. If they all agree on the logical structure, confidence is high. If they differ, it indicates ambiguity or potential bias, and the system can flag the question for human review or request clarification. +* **Human-in-the-Loop (HITL) for Edge Cases:** No automated system will be perfect. The goal is to automate the 99% of straightforward cases and reliably flag the 1% that are problematic. The automated auditors can route high-risk, ambiguous, or socially sensitive questions to a human expert dashboard for review. This is the scalable version of "reading everything"—you only read what the machine isn't sure about. + +By implementing these layers, you move from a position of "trusting the LLM" to a system where the LLM's output is continuously verified, audited, and stress-tested, making it far more robust and less susceptible to the subtle biases and fallacies that are inherent in its training data. From 0eddafccac60fbdf3879f32250114d99422d6a00 Mon Sep 17 00:00:00 2001 From: Ken Simpson Date: Mon, 6 Oct 2025 07:25:14 -0700 Subject: [PATCH 7/8] Prefer structured responses when calling the LLM if that capability exists --- benchmark_pipeline.py | 128 ++++++++--------- z3dsl/reasoning/program_generator.py | 203 +++++++++++++++++---------- 2 files changed, 184 insertions(+), 147 deletions(-) diff --git a/benchmark_pipeline.py b/benchmark_pipeline.py index 1e0f73d..75f103f 100644 --- a/benchmark_pipeline.py +++ b/benchmark_pipeline.py @@ -18,6 +18,7 @@ ) from z3dsl.interpreter import Z3JSONInterpreter +from z3dsl.reasoning.program_generator import Z3ProgramGenerator def calculate_metrics(y_true: list[Any], y_pred: list[Any]) -> dict[str, Any]: @@ -125,6 +126,7 @@ def run_z3_interpreter(output_json_path: str) -> tuple[bool | None, str]: ) client = OpenAI(api_key=api_key) +program_generator = Z3ProgramGenerator(llm_client=client, model="gpt-4o") # Load the StrategyQA dataset with open("strategyqa_train.json") as f: @@ -844,89 +846,69 @@ def run_z3_interpreter(output_json_path: str) -> tuple[bool | None, str]: num_attempts += 1 try: if num_attempts == 1: - # First attempt - messages = [ - {"role": "user", "content": [{"type": "text", "text": initial_prompt_content}]} - ] + generation = program_generator.generate( + question_text, + temperature=0.1, + max_tokens=2048, + ) else: - # Subsequent attempts - messages = [ - { - "role": "assistant", - "content": [{"type": "text", "text": previous_response or ""}], - }, - { - "role": "user", - "content": [ - { - "type": "text", - "text": f"There was an error processing your response:\n{error_trace}\nPlease fix the JSON accordingly.", - } - # {"role": "user", "content": [{ "type" : "text", "text" : initial_prompt_content}]}, - ], - }, - ] - - # Make the OpenAI API call - response = client.chat.completions.create( - model="gpt-4o", - messages=messages, - temperature=0.1, - max_tokens=2048, - top_p=1, - frequency_penalty=0, - presence_penalty=0, - response_format={"type": "text"}, - ) + generation = program_generator.generate_with_feedback( + question_text, + error_trace or "", + previous_response or "", + temperature=0.1, + max_tokens=2048, + ) - response_data = response.choices[0].message.content + response_data = generation.raw_response + previous_response = response_data - # Save the response response_path = os.path.join(response_dir, f"{qid}_response_attempt{num_attempts}.json") with open(response_path, "w") as f: json.dump({"response_content": response_data}, f, indent=4) - markdown_content = response_data - previous_response = markdown_content - - extracted_json = extract_json_from_markdown(markdown_content or "") - if extracted_json: - with open(output_json_path, "w") as f: - json.dump(extracted_json, f, indent=4) - try: - predicted_answer, interpreter_output = run_z3_interpreter(output_json_path) - - logger.info(f"Interpreter Output:\n{interpreter_output}") - - if predicted_answer is not None: - logger.info(f"Answer Computed : ({predicted_answer}, {actual_answer})") - if predicted_answer == actual_answer: - correct_answers += 1 - logger.info( - f"Question {qid} answered correctly on attempt {num_attempts}" - ) - else: - wrong_answers += 1 - logger.info( - f"Question {qid} answered incorrectly on attempt {num_attempts}" - ) - success = True + if not generation.json_program: + error_trace = generation.error or "Failed to generate structured program" + logger.warning( + "Generation failed for question %s on attempt %s: %s", + qid, + num_attempts, + error_trace, + ) + continue + + with open(output_json_path, "w") as f: + json.dump(generation.json_program, f, indent=4) + + try: + predicted_answer, interpreter_output = run_z3_interpreter(output_json_path) + + logger.info(f"Interpreter Output:\n{interpreter_output}") + + if predicted_answer is not None: + logger.info(f"Answer Computed : ({predicted_answer}, {actual_answer})") + if predicted_answer == actual_answer: + correct_answers += 1 + logger.info(f"Question {qid} answered correctly on attempt {num_attempts}") else: - logger.warning( - f"Could not determine the answer for question {qid} on attempt {num_attempts}" - ) - error_trace = f"Ambiguous output: SAT and UNSAT occurrences don't clearly indicate an answer.\nFull output:\n{interpreter_output}" - success = False - - except Exception as e: - error_trace = f"Error running interpreter: {str(e)}\nFull traceback:\n{''.join(traceback.format_exception(type(e), e, e.__traceback__))}" - logger.error( - f"Error running interpreter for question {qid} on attempt {num_attempts}:\n{error_trace}" + wrong_answers += 1 + logger.info(f"Question {qid} answered incorrectly on attempt {num_attempts}") + success = True + else: + logger.warning( + f"Could not determine the answer for question {qid} on attempt {num_attempts}" + ) + error_trace = ( + "Ambiguous output: SAT and UNSAT occurrences don't clearly indicate an answer." + f"\nFull output:\n{interpreter_output}" ) success = False - else: - error_trace = "Failed to extract JSON from the response." - logger.error(f"Failed to extract JSON for question {qid} on attempt {num_attempts}") + + except Exception as e: + error_trace = f"Error running interpreter: {str(e)}\nFull traceback:\n{''.join(traceback.format_exception(type(e), e, e.__traceback__))}" + logger.error( + f"Error running interpreter for question {qid} on attempt {num_attempts}:\n{error_trace}" + ) success = False except Exception as e: diff --git a/z3dsl/reasoning/program_generator.py b/z3dsl/reasoning/program_generator.py index 8d5dc12..47482f4 100644 --- a/z3dsl/reasoning/program_generator.py +++ b/z3dsl/reasoning/program_generator.py @@ -6,10 +6,17 @@ from dataclasses import dataclass from typing import Any +from z3dsl.reasoning.structured_program import StructuredProgram + from z3dsl.reasoning.prompt_template import build_prompt logger = logging.getLogger(__name__) +STRUCTURED_SYSTEM_PROMPT = ( + "You are an assistant that produces Z3 DSL programs. " + "Always respond with JSON that matches the provided schema." +) + @dataclass class GenerationResult: @@ -50,48 +57,39 @@ def generate( Returns: GenerationResult with JSON program or error """ - try: - prompt = build_prompt(question) + prompt = build_prompt(question) + messages = self._initial_messages(prompt) - # Make LLM API call (compatible with both OpenAI and Azure OpenAI) - # Azure OpenAI requires content as string, not list - # GPT-5 only supports temperature=1 (default), so don't pass it - response = self.llm_client.chat.completions.create( - model=self.model, - messages=[{"role": "user", "content": prompt}], - max_completion_tokens=max_tokens, - ) + # Prefer structured outputs when available + structured_result = self._call_structured_api(messages, temperature, max_tokens) + if structured_result: + json_program, raw_response = structured_result + return GenerationResult(json_program=json_program, raw_response=raw_response, success=True) - raw_response = response.choices[0].message.content - - # Extract JSON from markdown code block - json_program = self._extract_json(raw_response) - - if json_program: - return GenerationResult( - json_program=json_program, - raw_response=raw_response, - success=True, - ) - else: - # Log the raw response to help debug extraction failures - logger.debug(f"Raw LLM response:\n{raw_response[:1000]}...") - return GenerationResult( - json_program=None, - raw_response=raw_response, - success=False, - error="Failed to extract valid JSON from response", - ) - - except Exception as e: - logger.error(f"Error generating program: {e}") + # Fallback to legacy chat completions parsing + try: + raw_response = self._call_chat_completion(messages, max_tokens) + except Exception as exc: + logger.error(f"Error generating program: {exc}") return GenerationResult( json_program=None, raw_response="", success=False, - error=str(e), + error=str(exc), ) + json_program = self._extract_json(raw_response) + if json_program: + return GenerationResult(json_program=json_program, raw_response=raw_response, success=True) + + logger.debug(f"Raw LLM response:\n{raw_response[:1000]}...") + return GenerationResult( + json_program=None, + raw_response=raw_response, + success=False, + error="Failed to extract valid JSON from response", + ) + def generate_with_feedback( self, question: str, @@ -112,53 +110,110 @@ def generate_with_feedback( Returns: GenerationResult with corrected JSON program """ - try: - prompt = build_prompt(question) - feedback_message = ( - f"There was an error processing your response:\n{error_trace}\n" - "Please fix the JSON accordingly." - ) + prompt = build_prompt(question) + feedback_message = ( + f"There was an error processing your response:\n{error_trace}\n" + "Please fix the JSON accordingly." + ) - # Multi-turn conversation with error feedback - # Compatible with both OpenAI and Azure OpenAI - # GPT-5 only supports temperature=1 (default), so don't pass it - response = self.llm_client.chat.completions.create( - model=self.model, - messages=[ - {"role": "user", "content": prompt}, - {"role": "assistant", "content": previous_response}, - {"role": "user", "content": feedback_message}, - ], - max_completion_tokens=max_tokens, - ) + messages = self._feedback_messages(prompt, previous_response, feedback_message) - raw_response = response.choices[0].message.content - json_program = self._extract_json(raw_response) - - if json_program: - return GenerationResult( - json_program=json_program, - raw_response=raw_response, - success=True, - ) - else: - # Log the raw response to help debug extraction failures - logger.debug(f"Raw LLM feedback response:\n{raw_response[:1000]}...") - return GenerationResult( - json_program=None, - raw_response=raw_response, - success=False, - error="Failed to extract valid JSON from feedback response", - ) - - except Exception as e: - logger.error(f"Error generating program with feedback: {e}") + structured_result = self._call_structured_api(messages, temperature, max_tokens) + if structured_result: + json_program, raw_response = structured_result + return GenerationResult(json_program=json_program, raw_response=raw_response, success=True) + + try: + raw_response = self._call_chat_completion(messages, max_tokens) + except Exception as exc: + logger.error(f"Error generating program with feedback: {exc}") return GenerationResult( json_program=None, raw_response="", success=False, - error=str(e), + error=str(exc), + ) + + json_program = self._extract_json(raw_response) + if json_program: + return GenerationResult(json_program=json_program, raw_response=raw_response, success=True) + + logger.debug(f"Raw LLM feedback response:\n{raw_response[:1000]}...") + return GenerationResult( + json_program=None, + raw_response=raw_response, + success=False, + error="Failed to extract valid JSON from feedback response", + ) + + def _call_structured_api( + self, + messages: list[dict[str, str]], + temperature: float, + max_tokens: int, + ) -> tuple[dict[str, Any], str] | None: + """Attempt to call the Responses API with structured outputs.""" + + responses_api = getattr(self.llm_client, "responses", None) + parse_fn = getattr(responses_api, "parse", None) if responses_api else None + if parse_fn is None: + return None + + try: + response = parse_fn( + model=self.model, + input=messages, + temperature=temperature, + max_output_tokens=max_tokens, + text_format=StructuredProgram, + ) + except Exception as exc: + logger.warning( + "Structured output request failed, falling back to chat completions: %s", + exc, ) + return None + + parsed_program: StructuredProgram | None = getattr(response, "output_parsed", None) + if parsed_program is None: + logger.warning("Structured response did not contain parsed output; falling back") + return None + + json_text = parsed_program.model_dump_json(indent=2, exclude_none=True) + json_program = parsed_program.to_dsl_dict() + return json_program, json_text + + def _call_chat_completion( + self, messages: list[dict[str, str]], max_tokens: int + ) -> str: + """Call legacy chat completions API and return raw assistant content.""" + + response = self.llm_client.chat.completions.create( + model=self.model, + messages=messages, + max_completion_tokens=max_tokens, + ) + return response.choices[0].message.content + + def _initial_messages(self, prompt: str) -> list[dict[str, str]]: + """Construct the initial message sequence used for generation.""" + + return [ + {"role": "system", "content": STRUCTURED_SYSTEM_PROMPT}, + {"role": "user", "content": prompt}, + ] + + def _feedback_messages( + self, prompt: str, previous_response: str, feedback_message: str + ) -> list[dict[str, str]]: + """Construct message sequence when providing feedback to the model.""" + + return [ + {"role": "system", "content": STRUCTURED_SYSTEM_PROMPT}, + {"role": "user", "content": prompt}, + {"role": "assistant", "content": previous_response}, + {"role": "user", "content": feedback_message}, + ] def _extract_json(self, markdown_content: str) -> dict[str, Any] | None: """Extract JSON from markdown code block. From fc4b433f4e9ec3d35154e747b5f4b5f9a2bb4157 Mon Sep 17 00:00:00 2001 From: Ken Simpson Date: Mon, 6 Oct 2025 07:29:21 -0700 Subject: [PATCH 8/8] Adding a couple of files I neglected to commit --- examples/with_second_llm_verification.py | 272 +++++++++++++++++++++++ setup_venv.sh | 43 ++++ z3dsl/reasoning/structured_program.py | 210 +++++++++++++++++ 3 files changed, 525 insertions(+) create mode 100644 examples/with_second_llm_verification.py create mode 100755 setup_venv.sh create mode 100644 z3dsl/reasoning/structured_program.py diff --git a/examples/with_second_llm_verification.py b/examples/with_second_llm_verification.py new file mode 100644 index 0000000..b0ac321 --- /dev/null +++ b/examples/with_second_llm_verification.py @@ -0,0 +1,272 @@ +#!/usr/bin/env python3 +""" +Example: Implementing an Audited Reasoning Workflow with ProofOfThought. + +This program demonstrates a "defense-in-depth" strategy for using the +ProofOfThought system in a large-scale, automated environment. It addresses +the challenge of ensuring that the LLM's logical decompositions are free +from bias and fallacies, which is impossible to do manually at scale. + +The script defines an `AuditedProofOfThought` class that wraps the core +components of the `z3dsl` library. The workflow is as follows: + +1. **Programmer LLM**: A primary LLM is prompted with advanced instructions + to act as a neutral logician and generate a Z3 DSL program from a question. +2. **Auditor LLM**: A second LLM, with a different set of instructions, + acts as a critical specialist. It reviews the generated program for + bias, fallacies, and emotionally loaded terms. +3. **Audit & Self-Correction**: + - If the Auditor deems the program safe, it proceeds to verification. + - If the Auditor flags the program as high-risk, its feedback is + sent back to the Programmer LLM, which attempts to generate a corrected, + more neutral program. +4. **Verification**: Only after a program passes the audit is it sent to the + Z3Verifier for a final, logically sound True/False answer. + +This example requires the `openai` and `z3-solver` packages to be installed, +and an `OPENAI_API_KEY` to be set in your environment variables. +""" + +import os +import json +import logging +import tempfile +from typing import Any, Dict, Optional, Tuple + +from pydantic import BaseModel, ConfigDict + +# Assume 'z3dsl' is installed and available in the Python path. +# In a real repository, these imports would work directly. +from z3dsl.reasoning import Z3ProgramGenerator, Z3Verifier, VerificationResult, GenerationResult + +# For this example, we'll use the OpenAI client, but any client compatible +# with the library's LLM interface would work. +from openai import OpenAI + +# --- Configuration --- +# Configure logging for clear output +logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") + +# Load OpenAI API key from environment variables +api_key = os.getenv("OPENAI_API_KEY") +if not api_key: + raise ValueError("OPENAI_API_KEY environment variable not set. Please set it to your API key.") + +# We can use the same model for both roles or different ones. +# Using a powerful model for both is recommended for best results. +PROGRAMMER_MODEL = "gpt-5" +AUDITOR_MODEL = "gpt-5" + + +class AuditReport(BaseModel): + """Structured response returned by the auditor model.""" + + risk_score: int + is_safe: bool + justification: str + + model_config = ConfigDict(extra="forbid") + + +class AuditedProofOfThought: + """ + An enhanced reasoning system that adds a layer of automated auditing + and self-correction to the ProofOfThought process. + """ + + def __init__(self, llm_client: Any, risk_threshold: int = 5): + """ + Initializes the audited reasoning system. + + Args: + llm_client: An OpenAI-compatible client. + risk_threshold: A score (1-10) above which a program is considered biased. + """ + self.programmer_generator = Z3ProgramGenerator(llm_client=llm_client, model=PROGRAMMER_MODEL) + self.auditor_llm = llm_client + self.verifier = Z3Verifier() + self.risk_threshold = risk_threshold + logging.info("AuditedProofOfThought system initialized.") + + def _get_programmer_prompt(self, question: str) -> str: + """Applies advanced prompt engineering for neutrality.""" + # This function would ideally import the `build_prompt` from the library + # and prepend these instructions. For this standalone example, we create it here. + from z3dsl.reasoning.prompt_template import DSL_INSTRUCTIONS + + neutrality_instructions = """ + **CRITICAL Directives for Neutrality and Accuracy**: + 1. **Principle of Charity**: You are a neutral logician. Represent all concepts in their most objective, verifiable, and quantitative form. + 2. **Avoid Subjectivity**: Do not use emotionally loaded or subjective terms (e.g., 'good', 'bad', 'chaotic'). Instead, model the world using measurable metrics (e.g., 'air_quality_score', 'cost_in_usd'). + 3. **Fallacy Check**: Before outputting the JSON, you must review your logic for common fallacies (ad hominem, straw man, false dichotomy, overgeneralization). Your primary goal is logical soundness, not persuasive rhetoric. + --- + """ + return neutrality_instructions + DSL_INSTRUCTIONS + f"\nQuestion: {question}" + + def _get_auditor_prompt(self, json_program_str: str, original_question: str) -> str: + """Creates the prompt for the Auditor LLM.""" + return f""" + You are a specialist in detecting logical fallacies, cognitive biases, and stereotypes in structured reasoning programs. Your sole purpose is to ensure neutrality. + + Analyze the following JSON program, which was generated to answer the question: "{original_question}" + + Program: + ```json + {json_program_str} + ``` + + Your Task: + 1. Scrutinize the 'knowledge_base' for biased assumptions, overgeneralizations, or subjective opinions presented as objective facts. + 2. Examine all names (functions, constants, variables) for emotionally loaded or non-neutral language. + 3. Assess if the 'verifications' section neutrally and accurately represents the core of the original question. + 4. Provide a JSON response with exactly three fields: + - "risk_score": An integer from 1 (completely neutral) to 10 (highly biased or fallacious). + - "is_safe": A boolean (true if risk_score is below {self.risk_threshold}, otherwise false). + - "justification": A concise string explaining your reasoning, especially if the program is unsafe. This justification will be used as feedback. + """ + + def _audit_program(self, json_program: Dict, original_question: str) -> Tuple[bool, str]: + """Uses the Auditor LLM to check the generated program for bias.""" + logging.info("Auditing generated program for bias and fallacies...") + json_program_str = json.dumps(json_program, indent=2) + auditor_prompt = self._get_auditor_prompt(json_program_str, original_question) + + responses_api = getattr(self.auditor_llm, "responses", None) + parse_fn = getattr(responses_api, "parse", None) if responses_api else None + + if parse_fn is not None: + try: + structured_response = parse_fn( + model=AUDITOR_MODEL, + input=[ + { + "role": "system", + "content": ( + "You review DSL programs for bias. Respond using the provided schema." + ), + }, + {"role": "user", "content": auditor_prompt}, + ], + text_format=AuditReport, + ) + audit_report: Optional[AuditReport] = getattr( + structured_response, "output_parsed", None + ) + if audit_report is not None: + risk_score = audit_report.risk_score + is_safe = audit_report.is_safe and risk_score < self.risk_threshold + justification = audit_report.justification + + logging.info(f"Audit Complete. Risk Score: {risk_score}. Safe: {is_safe}.") + if not is_safe: + logging.warning(f"Auditor's Justification: {justification}") + + return is_safe, justification + except Exception as exc: + logging.warning( + "Structured audit request failed, falling back to legacy parsing: %s", exc + ) + + response = self.auditor_llm.chat.completions.create( + model=AUDITOR_MODEL, + messages=[{"role": "user", "content": auditor_prompt}], + response_format={"type": "json_object"}, + ) + + try: + audit_result = json.loads(response.choices[0].message.content) + risk_score = audit_result.get("risk_score", 10) + is_safe = bool(risk_score < self.risk_threshold) + justification = audit_result.get("justification", "No justification provided.") + + logging.info(f"Audit Complete. Risk Score: {risk_score}. Safe: {is_safe}.") + if not is_safe: + logging.warning(f"Auditor's Justification: {justification}") + + return is_safe, justification + except (json.JSONDecodeError, KeyError) as e: + logging.error(f"Audit failed: Could not parse auditor LLM response. Error: {e}") + return False, "Auditor LLM response was malformed." + + def query(self, question: str) -> Optional[VerificationResult]: + """Orchestrates the generate-audit-verify workflow with self-correction.""" + logging.info(f"--- Processing New Query: '{question}' ---") + + # --- Generation Attempt 1 --- + logging.info("Step 1: Generating initial logical program...") + programmer_prompt = self._get_programmer_prompt(question) + gen_result: GenerationResult = self.programmer_generator.generate(question=question) # The library builds the prompt internally + + if not gen_result.success or not gen_result.json_program: + logging.error("Initial program generation failed.") + return None + + program = gen_result.json_program + + # --- Auditing Attempt 1 --- + is_safe, justification = self._audit_program(program, question) + + if not is_safe: + logging.warning("Step 2: Audit FAILED. Attempting self-correction with feedback...") + + # --- Self-Correction Attempt --- + gen_result = self.programmer_generator.generate_with_feedback( + question=question, + error_trace=f"The initial program was rejected by a logical audit. Reason: {justification}. Please generate a new, more neutral program based on objective, verifiable metrics.", + previous_response=gen_result.raw_response + ) + + if not gen_result.success or not gen_result.json_program: + logging.error("Self-correction generation failed.") + return None + + program = gen_result.json_program + + # --- Auditing Attempt 2 (on the corrected program) --- + logging.info("Re-auditing the corrected program...") + is_safe, justification = self._audit_program(program, question) + + if not is_safe: + logging.error("Self-correction FAILED audit. Escalating for human review.") + return None + + logging.info("Step 2: Audit PASSED. Program is safe.") + + # --- Final Verification Step --- + logging.info("Step 3: Executing with Z3 Verifier...") + with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix=".json") as tmp_file: + json.dump(program, tmp_file, indent=2) + json_path = tmp_file.name + + try: + verification_result = self.verifier.verify(json_path) + logging.info(f"Step 4: Z3 Verification Complete. Final Answer: {verification_result.answer}") + return verification_result + finally: + os.remove(json_path) + + +def main(): + """Main function to run the audited reasoning examples.""" + llm_client = OpenAI(api_key=api_key) + + audited_pot = AuditedProofOfThought( + llm_client=llm_client, + risk_threshold=6 + ) + + print("\n" + "="*80) + print("SCENARIO 1: A straightforward, objective question that should pass the audit.") + print("="*80) + objective_question = "Is a Boeing 737's cost covered by the box office receipts of the film 'Wonder Woman' (2017)?" + audited_pot.query(objective_question) + + print("\n" + "="*80) + print("SCENARIO 2: A subjective, high-risk question that should fail the first audit and self-correct.") + print("="*80) + subjective_question = "Is it better to live in the city or the country?" + audited_pot.query(subjective_question) + + +if __name__ == "__main__": + main() diff --git a/setup_venv.sh b/setup_venv.sh new file mode 100755 index 0000000..229d9ca --- /dev/null +++ b/setup_venv.sh @@ -0,0 +1,43 @@ +#!/usr/bin/env bash +set -euo pipefail + +PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$PROJECT_ROOT" + +if [[ ! -f "pyproject.toml" ]] && [[ ! -f "requirements.txt" ]]; then + echo "Error: Please run this script from the project root containing pyproject.toml or requirements.txt." >&2 + exit 1 +fi + +PYTHON_BIN="${PYTHON:-python3}" +if ! command -v "$PYTHON_BIN" >/dev/null 2>&1; then + echo "Error: '$PYTHON_BIN' not found. Install Python 3 and try again." >&2 + exit 1 +fi + +VENV_DIR="${VENV_DIR:-$PROJECT_ROOT/.venv}" + +if [[ -d "$VENV_DIR" ]]; then + echo "Virtual environment already exists at $VENV_DIR" +else + echo "Creating virtual environment at $VENV_DIR" + "$PYTHON_BIN" -m venv "$VENV_DIR" +fi + +# shellcheck source=/dev/null +source "$VENV_DIR/bin/activate" + +if [[ -f "requirements.txt" ]]; then + echo "Installing dependencies from requirements.txt" + pip install --upgrade pip >/dev/null + pip install -r requirements.txt +elif [[ -f "pyproject.toml" ]]; then + echo "Installing project with pip" + pip install --upgrade pip >/dev/null + pip install -e . +fi + +echo +read -r -p "Have you added your LLM API keys to the .env file? Press Enter to confirm. " + +echo "Setup complete. Activate the environment with 'source $VENV_DIR/bin/activate'." diff --git a/z3dsl/reasoning/structured_program.py b/z3dsl/reasoning/structured_program.py new file mode 100644 index 0000000..06cd93c --- /dev/null +++ b/z3dsl/reasoning/structured_program.py @@ -0,0 +1,210 @@ +"""Structured output schema for Z3 DSL generation.""" + +from __future__ import annotations + +from typing import Any, Dict, List, Optional, Union +from typing import Literal + +from pydantic import BaseModel, ConfigDict, Field + + +class SortDefinition(BaseModel): + """Definition of a sort in the DSL.""" + + name: str + type: str + values: Optional[List[str]] = None + description: Optional[str] = None + + model_config = ConfigDict(extra="forbid") + + +class FunctionDefinition(BaseModel): + """Definition of a function in the DSL.""" + + name: str + domain: List[str] + range: str + description: Optional[str] = None + + model_config = ConfigDict(extra="forbid") + + +class VariableDefinition(BaseModel): + """Definition of a variable available to the DSL program.""" + + name: str + sort: str + description: Optional[str] = None + + model_config = ConfigDict(extra="forbid") + + +class KnowledgeAssertion(BaseModel): + """Knowledge base entry with an explicit truth value.""" + + assertion: str + value: bool = True + description: Optional[str] = None + + model_config = ConfigDict(extra="forbid") + + +KnowledgeEntry = Union[str, KnowledgeAssertion] + + +class QuantifiedVariable(BaseModel): + """Variable definition used in quantifiers.""" + + name: str + sort: str + + model_config = ConfigDict(extra="forbid") + + +class RuleImplication(BaseModel): + """Implication used inside rules.""" + + antecedent: str + consequent: str + + model_config = ConfigDict(extra="forbid") + + +class RuleDefinition(BaseModel): + """Logical rule definition.""" + + name: Optional[str] = None + forall: Optional[List[QuantifiedVariable]] = None + implies: Optional[RuleImplication] = None + constraint: Optional[str] = None + description: Optional[str] = None + + model_config = ConfigDict(extra="forbid") + + +class VerificationImplication(BaseModel): + """Implication used for universal verifications.""" + + antecedent: str + consequent: str + + model_config = ConfigDict(extra="forbid") + + +class VerificationDefinition(BaseModel): + """Definition of a verification condition.""" + + name: Optional[str] = None + constraint: Optional[str] = None + exists: Optional[List[QuantifiedVariable]] = None + forall: Optional[List[QuantifiedVariable]] = None + implies: Optional[VerificationImplication] = None + description: Optional[str] = None + + model_config = ConfigDict(extra="forbid") + + +class ObjectiveDefinition(BaseModel): + """Optimization objective.""" + + type: Literal["maximize", "minimize"] + expression: str + + model_config = ConfigDict(extra="forbid") + + +class OptimizationConfig(BaseModel): + """Optimization configuration.""" + + variables: List[VariableDefinition] = Field(default_factory=list) + constraints: List[str] = Field(default_factory=list) + objectives: List[ObjectiveDefinition] = Field(default_factory=list) + description: Optional[str] = None + + model_config = ConfigDict(extra="forbid") + + +class ConstantDefinition(BaseModel): + """Definition of a named constant group.""" + + sort: str + members: Union[List[str], Dict[str, str]] + description: Optional[str] = None + + model_config = ConfigDict(extra="forbid") + + +class StructuredProgram(BaseModel): + """Structured representation of a Z3 DSL program.""" + + sorts: List[SortDefinition] = Field(default_factory=list) + functions: List[FunctionDefinition] = Field(default_factory=list) + constants: Dict[str, ConstantDefinition] = Field(default_factory=dict) + variables: List[VariableDefinition] = Field(default_factory=list) + knowledge_base: List[KnowledgeEntry] = Field(default_factory=list) + rules: List[RuleDefinition] = Field(default_factory=list) + verifications: List[VerificationDefinition] = Field(default_factory=list) + optimization: Optional[OptimizationConfig] = None + actions: List[str] = Field(default_factory=list) + + model_config = ConfigDict(extra="forbid") + + def to_dsl_dict(self) -> Dict[str, Any]: + """Convert structured program into the interpreter JSON format.""" + + def dump_model(obj: BaseModel) -> Dict[str, Any]: + return obj.model_dump(mode="python", exclude_none=True) + + sorts_data = [dump_model(sort) for sort in self.sorts] + functions_data = [dump_model(func) for func in self.functions] + variables_data = [dump_model(var) for var in self.variables] + + knowledge_entries: List[Any] = [] + for entry in self.knowledge_base: + if isinstance(entry, BaseModel): + knowledge_entries.append(dump_model(entry)) + else: + knowledge_entries.append(entry) + + rules_data = [dump_model(rule) for rule in self.rules] + verifications_data = [dump_model(verification) for verification in self.verifications] + + actions = list(self.actions) if self.actions else ["verify_conditions"] + + program: Dict[str, Any] = { + "sorts": sorts_data, + "functions": functions_data, + "constants": { + name: dump_model(constant) + for name, constant in self.constants.items() + }, + "variables": variables_data, + "knowledge_base": knowledge_entries, + "rules": rules_data, + "verifications": verifications_data, + "actions": actions, + } + + if self.optimization: + program["optimization"] = dump_model(self.optimization) + + return program + + +__all__ = [ + "ConstantDefinition", + "FunctionDefinition", + "KnowledgeAssertion", + "KnowledgeEntry", + "OptimizationConfig", + "ObjectiveDefinition", + "QuantifiedVariable", + "RuleDefinition", + "RuleImplication", + "SortDefinition", + "StructuredProgram", + "VariableDefinition", + "VerificationDefinition", + "VerificationImplication", +]