How to Build a Fully Self-Verifying Data Operations AI Agent Using Local Hugging Face Models for Automated Planning, Execution, and Testing
These articles are AI-generated summaries. Please check the original sources for full details.
How to Build a Fully Self-Verifying Data Operations AI Agent Using Local Hugging Face Models for Automated Planning, Execution, and Testing
Asif Razzaq’s tutorial demonstrates a self-verifying DataOps AI agent that runs Microsoft’s Phi-2 model locally. The agent automates planning, execution, and testing with Hugging Face Transformers, keeping the data on the local machine for privacy and avoiding cloud API calls for efficiency.
Why This Matters
Automated data operations often fail because generated steps are executed without being validated. This agent introduces a self-verifying workflow with planning, execution, and testing phases to ensure accuracy, reducing reliance on manual checks and minimizing errors in data pipelines.
Key Insights
- “The Phi-2 model is used for local inference in the DataOps agent (2025)”
- “DataOpsAgent with Planner, Executor, Tester roles for self-verification”
- “Local execution avoids cloud API dependencies, enhancing privacy”
Working Example
!pip install -q transformers accelerate bitsandbytes scipy
import json, pandas as pd, numpy as np, torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
# Default Hugging Face model id loaded by LocalLLM.
MODEL_NAME = "microsoft/phi-2"


class LocalLLM:
    """Thin wrapper around a local Hugging Face text-generation pipeline.

    Loads a tokenizer and causal LM once, then exposes ``generate`` which
    formats prompts in the ``Instruct: ... Output:`` layout and returns only
    the newly generated text.
    """

    def __init__(self, model_name=MODEL_NAME, use_8bit=False):
        """Load the tokenizer, model and generation pipeline.

        Args:
            model_name: Hugging Face model id or local path to load.
            use_8bit: Request 8-bit quantization via bitsandbytes. Only
                honoured when CUDA is available; otherwise the model is
                loaded in float32 (CPU) or float16 (GPU).
        """
        print(f"Loading model: {model_name}")
        # NOTE(review): trust_remote_code=True lets the model repository run
        # arbitrary Python at load time; confirm it is still required for the
        # chosen model before using this with other model ids.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        if self.tokenizer.pad_token is None:
            # Reuse EOS as the padding token when the tokenizer defines none.
            self.tokenizer.pad_token = self.tokenizer.eos_token

        has_cuda = torch.cuda.is_available()
        model_kwargs = {"device_map": "auto", "trust_remote_code": True}
        if use_8bit and has_cuda:
            model_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True)
        else:
            model_kwargs["torch_dtype"] = torch.float16 if has_cuda else torch.float32

        self.model = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs)
        self.pipe = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            max_new_tokens=512,
            do_sample=True,
            temperature=0.3,
            top_p=0.9,
            pad_token_id=self.tokenizer.eos_token_id,
        )
        print("✓ Model loaded successfully!\n")

    def generate(self, prompt, system_prompt="", temperature=0.3):
        """Generate a completion for ``prompt``.

        Args:
            prompt: The user request.
            system_prompt: Optional instructions placed before the prompt.
            temperature: Sampling temperature. A value of ``None`` or any
                value <= 0 selects greedy decoding.

        Returns:
            The generated text, stripped, and truncated at the first
            ``"Instruct:"`` marker the model may have emitted. Returns an
            empty string if the pipeline produced no output.
        """
        if system_prompt:
            full_prompt = f"Instruct: {system_prompt}\n\n{prompt}\nOutput:"
        else:
            full_prompt = f"Instruct: {prompt}\nOutput:"

        sampling = temperature is not None and temperature > 0
        gen_kwargs = {
            "do_sample": sampling,
            "return_full_text": False,
            "eos_token_id": self.tokenizer.eos_token_id,
        }
        if sampling:
            # temperature is a sampling-only flag and must be strictly
            # positive, so it is forwarded only when sampling is enabled.
            gen_kwargs["temperature"] = temperature

        output = self.pipe(full_prompt, **gen_kwargs)
        if not output:
            return ""
        result = output[0]['generated_text'].strip()
        # Drop anything after a follow-up "Instruct:" turn started by the model.
        return result.split("Instruct:")[0].strip()
# System prompt for the planning phase (passed to the LLM by DataOpsAgent.plan).
# Asks for a JSON object with "steps", "expected_output" and
# "validation_criteria" keys and no surrounding text.
PLANNER_PROMPT = """You are a Data Operations Planner. Create a detailed execution plan as valid JSON.
Return ONLY a JSON object (no other text) with this structure:
{"steps": ["step 1","step 2"],"expected_output":"description","validation_criteria":["criteria 1","criteria 2"]}"""
# System prompt for the execution phase (passed to the LLM by DataOpsAgent.execute).
# Asks for bare pandas/numpy code that leaves its output in a variable
# named 'result'.
EXECUTOR_PROMPT = """You are a Data Operations Executor. Write Python code using pandas.
Requirements:
- Use pandas (imported as pd) and numpy (imported as np)
- Store final result in variable 'result'
- Return ONLY Python code, no explanations or markdown"""
# System prompt for the verification phase (passed to the LLM by DataOpsAgent.test).
# Asks for a JSON verdict with "passed", "issues" and "recommendations" keys.
TESTER_PROMPT = """You are a Data Operations Tester. Verify execution results.
Return ONLY a JSON object (no other text) with this structure:
{"passed":true,"issues":["any issues found"],"recommendations":["suggestions"]}"""
class DataOpsAgent:
    """Self-verifying data-operations agent with plan, execute and test phases.

    Each phase prompts the LLM with a dedicated system prompt. The raw model
    reply of every phase is appended to ``history`` as ``(role, text)``.
    """

    def __init__(self, llm=None):
        """Create the agent.

        Args:
            llm: Object exposing ``generate(prompt, system_prompt, temperature=...)``.
                A ``LocalLLM`` is constructed when omitted.
        """
        self.llm = llm or LocalLLM()
        self.history = []

    @staticmethod
    def _as_list(value):
        """Coerce a JSON field to a list so a bare string is not iterated per character."""
        if value is None:
            return []
        if isinstance(value, (list, tuple)):
            return list(value)
        return [value]

    @staticmethod
    def _as_bool(value):
        """Interpret a JSON 'passed' field, accepting string spellings such as "false"."""
        if isinstance(value, str):
            return value.strip().lower() in ("true", "yes", "1", "pass", "passed")
        return bool(value)

    def _extract_json(self, text):
        """Return the first JSON object found in ``text``, or ``None``.

        Tries the whole text first, then every ``{`` position in turn, so
        prose before or after the object (even prose containing braces) does
        not prevent extraction. Non-object JSON (lists, numbers, strings) is
        rejected because callers use ``.get`` on the result.
        """
        if not isinstance(text, str):
            return None
        try:
            whole = json.loads(text)
        except ValueError:
            whole = None
        if isinstance(whole, dict):
            return whole

        decoder = json.JSONDecoder()
        start = text.find('{')
        while start != -1:
            try:
                candidate, _ = decoder.raw_decode(text, start)
            except ValueError:
                candidate = None
            if isinstance(candidate, dict):
                return candidate
            start = text.find('{', start + 1)
        return None

    def plan(self, task, data_info):
        """Ask the LLM for an execution plan.

        Args:
            task: Natural-language description of the work to do.
            data_info: Text describing the available data.

        Returns:
            A dict with ``steps`` (non-empty list), ``expected_output`` and
            ``validation_criteria`` (list). A default single-step plan is
            used when the reply contains no JSON object.
        """
        print("\n" + "="*60)
        print("PHASE 1: PLANNING")
        print("="*60)
        prompt = f"Task: {task}\n\nData Information:\n{data_info}\n\nCreate an execution plan as JSON with steps, expected_output, and validation_criteria."
        plan_text = self.llm.generate(prompt, PLANNER_PROMPT, temperature=0.2)
        self.history.append(("PLANNER", plan_text))
        plan = self._extract_json(plan_text) or {"steps":[task],"expected_output":"Processed data","validation_criteria":["Result generated","No errors"]}
        # Normalise list fields; fall back to the task itself if no steps came back.
        plan['steps'] = self._as_list(plan.get('steps')) or [task]
        plan['validation_criteria'] = self._as_list(plan.get('validation_criteria'))
        print("\n📋 Plan Created:")
        print(f" Steps: {len(plan['steps'])}")
        for i, step in enumerate(plan['steps'], 1):
            print(f" {i}. {step}")
        print(f" Expected: {plan.get('expected_output', 'N/A')}")
        return plan

    def execute(self, plan, data_context):
        """Ask the LLM for Python code implementing the plan.

        Args:
            plan: Plan dict as returned by ``plan``.
            data_context: Text describing the DataFrame available as ``df``.

        Returns:
            The generated source with markdown fences, blank lines and
            comment lines removed (comment lines mentioning 'import' are kept).
            The code is only returned here; ``run`` executes it.
        """
        print("\n" + "="*60)
        print("PHASE 2: EXECUTION")
        print("="*60)
        steps_text = '\n'.join(f"{i}. {s}" for i, s in enumerate(self._as_list(plan.get('steps')), 1))
        prompt = f"Task Steps:\n{steps_text}\n\nData available: DataFrame 'df'\n{data_context}\n\nWrite Python code to execute these steps. Store final result in 'result' variable."
        code = self.llm.generate(prompt, EXECUTOR_PROMPT, temperature=0.1)
        self.history.append(("EXECUTOR", code))
        if "```python" in code:
            code = code.split("```python")[1].split("```")[0]
        elif "```" in code:
            code = code.split("```")[1].split("```")[0]
        kept = []
        for line in code.split('\n'):
            s = line.strip()
            if s and (not s.startswith('#') or 'import' in s):
                kept.append(line)
        code = '\n'.join(kept).strip()
        code_lines = code.split('\n')
        print("\n💻 Generated Code:\n" + "-"*60)
        for i, line in enumerate(code_lines[:15], 1):
            print(f"{i:2}. {line}")
        # Computed outside the f-string: a backslash inside an f-string
        # expression is a SyntaxError before Python 3.12.
        hidden = len(code_lines) - 15
        if hidden > 0:
            print(f" ... ({hidden} more lines)")
        print("-"*60)
        return code

    def test(self, plan, result, execution_error=None):
        """Ask the LLM to verify the result against the plan's criteria.

        Args:
            plan: Plan dict as returned by ``plan``.
            result: Value produced by the executed code (may be ``None``).
            execution_error: Error text if execution raised, else ``None``.

        Returns:
            A dict with ``passed`` (bool), ``issues`` (list) and
            ``recommendations`` (list). ``passed`` is always ``False`` when
            ``execution_error`` is not ``None``, regardless of the LLM verdict.
        """
        print("\n" + "="*60)
        print("PHASE 3: TESTING & VERIFICATION")
        print("="*60)
        # Compare against None: an empty error string still means a failure.
        failed = execution_error is not None
        if failed:
            result_desc = f"EXECUTION ERROR: {execution_error}"
        else:
            result_desc = f"Result type: {type(result).__name__}\n"
            if isinstance(result, pd.DataFrame):
                result_desc += f"Shape: {result.shape}\nColumns: {list(result.columns)}\nSample:\n{result.head(3).to_string()}"
            elif isinstance(result, (int,float,str)):
                result_desc += f"Value: {result}"
            else:
                result_desc += f"Value: {str(result)[:200]}"
        criteria_text = '\n'.join(f"- {c}" for c in self._as_list(plan.get('validation_criteria')))
        prompt = f"Validation Criteria:\n{criteria_text}\n\nExpected: {plan.get('expected_output', 'N/A')}\n\nActual Result:\n{result_desc}\n\nEvaluate if result meets criteria. Return JSON with passed (true/false), issues, and recommendations."
        test_result = self.llm.generate(prompt, TESTER_PROMPT, temperature=0.2)
        self.history.append(("TESTER", test_result))
        test_json = self._extract_json(test_result) or {"passed":execution_error is None,"issues":["Could not parse test result"],"recommendations":["Review manually"]}
        test_json['issues'] = self._as_list(test_json.get('issues'))
        test_json['recommendations'] = self._as_list(test_json.get('recommendations'))
        # A run whose code raised can never pass, whatever the model claims.
        test_json['passed'] = self._as_bool(test_json.get('passed')) and not failed
        if failed:
            note = f"Execution failed: {execution_error}"
            if note not in test_json['issues']:
                test_json['issues'].insert(0, note)
        print(f"\n✓ Test Results:\n Status: {'✅ PASSED' if test_json.get('passed') else '❌ FAILED'}")
        if test_json['issues']:
            print(" Issues:")
            for issue in test_json['issues'][:3]:
                print(f" • {issue}")
        if test_json['recommendations']:
            print(" Recommendations:")
            for rec in test_json['recommendations'][:3]:
                print(f" • {rec}")
        return test_json

    def run(self, task, df=None, data_info=None):
        """Run the full plan -> execute -> test cycle for ``task``.

        Args:
            task: Natural-language description of the work to do.
            df: Optional DataFrame exposed to the generated code as ``df``.
                The code receives a copy, so the caller's object is not mutated.
            data_info: Optional text describing the data; derived from ``df``
                when omitted.

        Returns:
            A dict with keys ``plan``, ``code``, ``result``, ``test`` and
            ``history``.
        """
        print("\n🤖 SELF-VERIFYING DATA-OPS AGENT (Local HF Model)")
        print(f"Task: {task}\n")
        if data_info is None and df is not None:
            data_info = f"Shape: {df.shape}\nColumns: {list(df.columns)}\nSample:\n{df.head(2).to_string()}"
        plan = self.plan(task, data_info)
        code = self.execute(plan, data_info)
        result, error = None, None
        working_df = df.copy() if hasattr(df, 'copy') else df
        namespace = {'pd': pd, 'np': np, 'df': working_df}
        try:
            # SECURITY: this executes model-generated code with the full
            # privileges of the current process. Only run it in a sandboxed
            # or otherwise disposable environment.
            exec(code, namespace)
            result = namespace.get('result')
        except Exception as e:
            # Exceptions raised without a message stringify to ""; fall back
            # to the class name so the failure is never mistaken for success.
            error = str(e) or type(e).__name__
            print(f"\n⚠️ Execution Error: {error}")
        test_result = self.test(plan, result, error)
        return {'plan': plan,'code': code,'result': result,'test': test_result,'history': self.history}
Practical Applications
- Use Case: “Sales data aggregation using automated planning and execution”
- Pitfall: “Over-reliance on LLM-generated code without manual review leading to uncaught errors”
References:
Continue reading
Next article
Fixing the gradlew: command not found Error in Linux
Related Content
How to Build a Fully Autonomous Local Fleet-Maintenance Analysis Agent Using SmolAgents and Qwen Model
Build a fleet maintenance agent with SmolAgents and Qwen, achieving fully autonomous analysis and visualization without external API calls.
Building Production-Ready Agentic Workflows with AgentScope and ReAct Agents
Learn to build production-ready AgentScope workflows using ReAct agents, custom toolkits, and Pydantic for structured outputs. This tutorial demonstrates how to orchestrate multi-agent debates and concurrent analysis pipelines using OpenAI models to achieve high-fidelity reasoning and automated tool execution for enterprise-grade AI applications.
Designing a Multi-Tool Research Agent: Integrating Web Search, PDF Vision, and Automated Reporting
Build a Swiss Army Knife research agent that automates multi-step problems using tool-calling AI, vision-based chart analysis, and PDF ingestion to generate professional Markdown and DOCX reports.