LLM Code Understanding Evaluation - amosproj/amos2025ss04-ai-driven-testing GitHub Wiki
🎯 Goal
As a user, I want to know how well an LLM understands my code, so I can decide which LLM best fits my use case.
🚀 Result
LLMs demonstrate strong capabilities in generating code from descriptions; however, their ability to accurately describe code remains limited. This limitation becomes more pronounced in smaller models, which tend to produce lower-quality descriptions with less detail and accuracy.
✅ Summary of Test Results
🧠 Mistral
- ❌ 2/2 tests failed
- Error:
TypeError: Rule.__init__() missing 1 required positional argument: 'priority'
- Analysis:
Mistral used correct syntax and understood basic usage, but its tests did not supply every required constructor argument (`priority` is missing).
➤ Partial understanding
# Rule as shown in the Mistral section (presumably model output — confirm).
# NOTE(review): indentation was lost on this page, so statement nesting below
# cannot be read off the layout.
class Rule:
# NOTE(review): `priority` has no default, so it is a required argument; the
# setUp methods of RuleTest/RuleEngineTest further down pass only three
# arguments, which matches the TypeError quoted in the Mistral summary.
def __init__(self, name, condition, action, priority, max_executions=None):
self.name = name
self.condition = condition
self.action = action
self.priority = priority
# A missing max_executions is stored as infinity, i.e. no cap.
self.max_executions = max_executions if max_executions is not None else float('inf')
self.executions = 0
# True when the condition holds and the execution counter is below the cap.
def is_applicable(self, context):
return self.condition(context) and self.executions < self.max_executions
def apply(self, context, log):
if self.is_applicable(context):
self.action(context)
self.executions += 1
# NOTE(review): the log entry is a dict, while test_apply below compares
# log[0] against a formatted string.
log.append({'rule': self.name, 'context': context})
# RuleEngine as shown in the Mistral section (presumably model output — confirm).
class RuleEngine:
def __init__(self, rules, sort_by_priority=True):
# The rules are always sorted by priority; sort_by_priority only selects the
# direction (True -> descending, False -> ascending).
self.rules = [rule for rule in sorted(rules, key=lambda r: r.priority, reverse=sort_by_priority)]
self.log = []
def evaluate(self, context, max_iterations=None):
# With no explicit cap, the number of rules is used as the cap.
if max_iterations is None:
max_iterations = len(self.rules)
for rule in self.rules:
if rule.is_applicable(context):
rule.apply(context, self.log)
# The cap is compared with the length of self.log, which is created once in
# __init__ and therefore keeps growing across evaluate() calls.
if len(self.log) >= max_iterations:
break
else:
continue
# NOTE(review): returns the log list, not the context; test_evaluate below
# compares the result with {}.
return self.log
# Returns the internal list object itself (no copy is made).
def get_log(self):
return self.log
import unittest
# Unit tests for Rule as shown in the Mistral section.
class RuleTest(unittest.TestCase):
def setUp(self):
# NOTE(review): only name, condition and action are passed — no `priority`.
self.rule = Rule("test_rule", lambda ctx: True, lambda ctx: None)
def test_is_applicable(self):
# Testing is_applicable method
self.assertTrue(self.rule.is_applicable({}))
# Move the counter past the cap so the rule should report "not applicable".
self.rule.executions = self.rule.max_executions + 1
self.assertFalse(self.rule.is_applicable({}))
def test_apply(self):
# Testing apply method
context = {}
log = []
self.rule.apply(context, log)
self.assertEqual(len(log), 1)
# Expects the single log entry to be exactly this string.
self.assertEqual(log[0], f"Rule '{self.rule.name}' executed")
self.assertEqual(self.rule.executions, 1)
# Unit tests for RuleEngine as shown in the Mistral section.
class RuleEngineTest(unittest.TestCase):
def setUp(self):
# NOTE(review): both Rule(...) calls pass three arguments only — no `priority`.
self.rules = [
Rule("test_rule", lambda ctx: True, lambda ctx: None),
Rule("another_rule", lambda ctx: False, lambda ctx: None)
]
self.engine = RuleEngine(self.rules)
def test_evaluate(self):
# Testing evaluate method
context = {}
result = self.engine.evaluate(context)
# NOTE(review): "test_rule" has an always-true condition, yet an empty log and
# an empty-dict result are expected here.
self.assertEqual(result, {}) # No rule was applied since none were applicable
self.assertCountEqual(self.engine.get_log(), [])
context = {"some_key": "some_value"}
result = self.engine.evaluate(context)
# NOTE(review): the result is again compared with {}, although the context
# passed in is non-empty.
self.assertEqual(result, {}) # Only test_rule was applied since it's the highest priority
self.assertCountEqual(self.engine.get_log(), ["Rule 'test_rule' executed"])
if __name__ == "__main__":
unittest.main()
🧠 Phi4
- ✅ 1/6 tests passed, ❌ 5 failed
- Errors:
- Used `Rule` without the `action` parameter → TypeError
- Accessed the undefined `execute_count` attribute → AttributeError
- Used
- Analysis:
Phi4 attempted more complex logic and integration, but missed key attributes and constructor arguments.
➤ Deeper attempt, but flawed internal understanding
# Rule as shown in the Phi4 section (presumably model output — confirm).
# NOTE(review): indentation was lost on this page, so statement nesting below
# cannot be read off the layout.
class Rule:
def __init__(self, name, condition, action, priority=0, max_executions=1):
self.name = name
self.condition = condition
self.action = action
self.priority = priority
self.max_executions = max_executions
self.executions = 0
def is_applicable(self, context):
# NOTE(review): reads `self.execute_count`, but __init__ only sets
# `self.executions` — this matches the AttributeError listed in the Phi4 summary.
return (self.execute_count < self.max_executions and
self.condition(context))
def apply(self, context, log=None):
# `not` binds looser than `>=`, so this is: not (executions >= max_executions).
if not getattr(self, 'executions', 0) >= self.max_executions:
# NOTE(review): the action is called without the context, and nothing in this
# class increments `executions`.
result = self.action()
if log is None: # initialize first time
global_log = []
self.log = lambda x: (global_log.append(x))
else:
# NOTE(review): `self.log` is only assigned in the `log is None` branch above;
# the `log` argument itself is never appended to.
self.log(f"Rule {self.name} executed with context {context}")
return True, result
# RuleEngine as shown in the Phi4 section (presumably model output — confirm).
# NOTE(review): no get_log method is defined in this class, although
# test_rule_engine_logging below calls engine.get_log().
class RuleEngine:
def __init__(self, rules):
# Sorting on the negated priority puts the highest priority first.
sorted_rules = [rule for rule in sorted(rules, key=lambda r: -r.priority)]
self.rules = sorted_rules
self.log = []
def evaluate(self, context, max_iterations=10):
iteration_count = 0
# NOTE(review): the comprehension below yields one True/False per rule, so its
# length equals len(self.rules) regardless of the execution counters.
while (iteration_count < max_iterations and
len([True if not getattr(rule, 'executions', None) >= rule.max_executions else False for rule in
self.rules]) > 0):
applicable_rules = [rule for rule in self.rules
if rule.is_applicable(context)]
executed_this_iteration = []
while (len(applicable_rules)) and (
len(executed_this_iteration) < max_iterations):
selected_rule = applicable_rules.pop(0)
# NOTE(review): execute_count is assigned here but not read again in this method.
execute_count = getattr(selected_rule, 'executions', 0)
try:
# NOTE(review): as laid out here, `applied_successfully,` stands alone as an
# expression and only `result` is assigned — possibly one tuple-unpacking line
# that was split on this page; confirm against the model output.
applied_successfully,
result = selected_rule.apply(context, self.log)
if not applied_successfully: # rule fails to apply so just continue with next iteration
continue
# NOTE(review): rule names (strings) are collected here, but the log line
# below reads `r.name` from each collected entry.
executed_this_iteration.append(selected_rule.name)
except Exception as e:
print(e.__str__())
application_log_entry = f"Iteration {iteration_count}: Executed rules {[r.name for r in executed_this_iteration]}."
self.log += [application_log_entry]
iteration_count += 1
# NOTE(review): returns a bool, whereas the tests below store the result as
# `result_context` and one of them asserts it is a dict.
return len([True if not getattr(rule, 'executions', None) >= rule.max_executions else False for rule in
self.rules]) == 0
# Checks that the constructor stores the name and that condition/action are callables.
def test_rule_initialization():
# All of name, condition, action and priority are supplied here.
rule = Rule(name='TestRule',
condition=lambda context: True,
action=lambda context: print("Action executed"),
priority=1)
assert rule.name == 'TestRule'
assert callable(rule.condition)
assert callable(rule.action)
# Checks that is_applicable returns a bool for an always-true condition.
def test_rule_applicability():
context = {}
# action=None is passed; only is_applicable is called, so the action is never run here.
rule = Rule(name='AlwaysTrueRule',
condition=lambda ctx: True,
action=None)
assert isinstance(rule.is_applicable(context), bool) == True
# Runs an engine with one applicable and one non-applicable rule.
def test_rule_engine_single_rules():
context = {}
r1 = Rule(name='Applicable', condition=lambda ctx: True,
action=lambda ctx: None)
# NOTE(review): r2 is built without `action`, which every Rule.__init__ on this
# page requires — see "Used Rule without action parameter" in the Phi4 summary.
r2 = Rule(name='NotApplicable', condition=lambda ctx: False,
max_executions=0)
engine = RuleEngine(rules=[r1, r2])
result_context = engine.evaluate(context)
assert isinstance(result_context, dict) # Assuming context is a dictionary that can change
# Builds a high- and a low-priority rule and evaluates them on an empty context.
# NOTE(review): the function contains no assertion.
def test_rule_engine_multiple_rules():
def rule1_action(ctx):
ctx['rule1'] = True
def rule2_condition(ctx):
return 'condition_met' in ctx and not ctx.get('rule1', False)
context = {}
r1 = Rule(name='HighPriorityRule',
condition=lambda ctx: 'priority_high' in ctx,
action=rule1_action, priority=10)
# NOTE(review): r2 is built without `action`.
r2 = Rule(name='LowPriorityRule',
condition=rule2_condition,
max_executions=5)
engine = RuleEngine(rules=[r1, r2])
result_context = engine.evaluate(context)
# Compares the engine log with ten repeated "executed" messages.
def test_rule_engine_logging():
context = {}
def rule_action(ctx):
# NOTE(review): the context starts empty, so 'executed' is not present when
# this increment first runs.
ctx['executed'] += 1
# NOTE(review): max_executions is left at its default, while ten log entries
# are expected below.
r = Rule(name='LoggingRule',
condition=lambda ctx: True,
action=rule_action, priority=5)
engine = RuleEngine(rules=[r])
result_context = engine.evaluate(context)
log_entries = ["Rule 'LoggingRule' executed"] * 10
# The log is joined with '' on the left and with '\n' (plus a trailing '\n') on the right.
assert ''.join(engine.get_log()) == '\n'.join(log_entries) + "\n"
# Evaluates a rule capped at three executions four times and checks the counter in the context.
def test_rule_engine_max_executions():
context = {}
def action(ctx):
# NOTE(review): the context starts empty, so 'executed' is not present when
# this increment first runs.
ctx['executed'] += 1
r = Rule(name='MaxExcutedRule',
condition=lambda ctx: True,
action=action, priority=5, max_executions=3)
engine = RuleEngine(rules=[r])
for _ in range(4): # Attempting to execute the rule more times than allowed
result_context = engine.evaluate(context)
assert context['executed'] == 3
🧠 TINYLLAMA
- ❌ Syntax Error
- Error:
Rule(name='rule2', condition=lambda x: x < 5)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
SyntaxError: invalid syntax. Perhaps you forgot a comma?
# Define Rule and RuleEngine classes as before
from rule import Rule, RuleEngine
# NOTE(review): every Rule(...) below omits `action`, and the conditions compare
# their argument with numbers although the context defined below is a dict.
rules = [Rule(name='rule1', condition=lambda x: x > 0),
# NOTE(review): no comma after the next element — this is the line quoted in
# the SyntaxError of the TinyLlama summary.
Rule(name='rule2', condition=lambda x: x < 5)
Rule(name='rule3', condition=lambda x: x % 2 == 0)]
engine = RuleEngine(rules) # Create new instance of RuleEngine class
context = {
'x': 1,
}
log = []
# NOTE(review): iterates the engine's log entries but treats each one as a rule
# (`rule.name` below).
for rule in engine.get_log():
try:
# NOTE(review): the RuleEngine given in the prompt defines only evaluate() and
# get_log(); there is no engine.apply there.
context = engine.apply(context, log, log_level=10)
# NOTE(review): NotApplicable is neither defined nor imported in this snippet.
except NotApplicable:
print(f"Rule '{rule.name}' not applicable at context {context}")
print(f"Log after evaluating all rules:\n{log}")
from unittest.mock import Mock
# Fixture class holding two rules built from Mock conditions and no-op actions.
# NOTE(review): `fixtures` is not imported anywhere in the visible snippet.
class RuleFixture(fixtures.Fixture):
def __init__(self, rules):
# The `rules` argument is ignored; two hard-coded rules are stored instead.
self.rules = [Rule('name1', Mock(), lambda x: None), Rule('name2', Mock(), lambda x: None)]
# pytest fixture that returns a RuleEngine built from a RuleFixture.
# NOTE(review): `pytest` is not imported in the visible snippet, and `rules` is
# not defined in this snippet.
@pytest.fixture()
def rule_engine():
# NOTE(review): a RuleFixture object is passed where the prompt's RuleEngine
# calls sorted() on its argument.
return RuleEngine(RuleFixture(rules))
from unittest.mock import Mock
# Variant of the rule_engine fixture that passes the two rules in explicitly.
# NOTE(review): `pytest` is not imported in the visible snippet.
@pytest.fixture()
def rule_engine():
# NOTE(review): a RuleFixture object is passed where the prompt's RuleEngine
# calls sorted() on its argument.
return RuleEngine(RuleFixture([Rule('name1', Mock(), lambda x: None), Rule('name2', Mock(), lambda x: None)]))
Prompts:
Generate unit tests
Generate tests for the following code:
class Rule:
    """A named condition/action pair with a priority and an execution cap.

    Args:
        name: Label used in the log message written by ``apply``.
        condition: Callable ``context -> bool``; consulted by ``is_applicable``.
        action: Callable ``context -> None``; run by ``apply``.
        priority: Value RuleEngine sorts on (higher first). Defaults to 0.
        max_executions: How many times the rule may run. Defaults to 1.
    """

    def __init__(self, name, condition, action, priority=0, max_executions=1):
        self.name = name
        self.condition = condition  # function: context -> bool
        self.action = action  # function: context -> None
        self.priority = priority
        self.max_executions = max_executions
        self.executions = 0  # number of times apply() has run

    def is_applicable(self, context):
        """Return True while the cap is not reached and the condition holds.

        The cap is checked first, so an exhausted rule never calls its condition.
        """
        return self.executions < self.max_executions and self.condition(context)

    def apply(self, context, log):
        """Run the action on ``context``, count the execution, and append to ``log``.

        No applicability check is made here; callers are expected to call
        ``is_applicable`` first.
        """
        self.action(context)
        self.executions += 1
        log.append(f"Rule '{self.name}' executed")
class RuleEngine:
    """Apply rules to a context, highest priority first, and record what ran.

    Args:
        rules: Iterable of objects exposing ``priority``, ``is_applicable(context)``
            and ``apply(context, log)``. The input is not modified; a sorted copy
            is kept.
    """

    def __init__(self, rules):
        # sorted() is stable, so rules with equal priority keep their given order.
        self.rules = sorted(rules, key=lambda r: r.priority, reverse=True)
        self.log = []

    def evaluate(self, context, max_iterations=10):
        """Repeatedly apply the first applicable rule until none applies.

        Each iteration applies at most one rule and then restarts the scan from
        the highest-priority rule. The loop stops when no rule is applicable or
        after ``max_iterations`` applications.

        Returns:
            The same ``context`` object that was passed in.
        """
        iteration = 0
        while iteration < max_iterations:
            applied = False
            for rule in self.rules:
                if rule.is_applicable(context):
                    rule.apply(context, self.log)
                    applied = True
                    break  # Re-evaluate from the top due to possible context changes
            if not applied:
                break
            iteration += 1
        return context

    def get_log(self):
        """Return a copy of the log so callers cannot mutate the engine's own list."""
        return list(self.log)
Describe Code
The following code defines two classes. Please describe their purpose, what each method does, and how they work together.
class Rule:
    """A named condition/action pair with a priority and an execution cap.

    Args:
        name: Label used in the log message written by ``apply``.
        condition: Callable ``context -> bool``; consulted by ``is_applicable``.
        action: Callable ``context -> None``; run by ``apply``.
        priority: Value RuleEngine sorts on (higher first). Defaults to 0.
        max_executions: How many times the rule may run. Defaults to 1.
    """

    def __init__(self, name, condition, action, priority=0, max_executions=1):
        self.name = name
        self.condition = condition  # function: context -> bool
        self.action = action  # function: context -> None
        self.priority = priority
        self.max_executions = max_executions
        self.executions = 0  # number of times apply() has run

    def is_applicable(self, context):
        """Return True while the cap is not reached and the condition holds.

        The cap is checked first, so an exhausted rule never calls its condition.
        """
        return self.executions < self.max_executions and self.condition(context)

    def apply(self, context, log):
        """Run the action on ``context``, count the execution, and append to ``log``.

        No applicability check is made here; callers are expected to call
        ``is_applicable`` first.
        """
        self.action(context)
        self.executions += 1
        log.append(f"Rule '{self.name}' executed")
class RuleEngine:
    """Apply rules to a context, highest priority first, and record what ran.

    Args:
        rules: Iterable of objects exposing ``priority``, ``is_applicable(context)``
            and ``apply(context, log)``. The input is not modified; a sorted copy
            is kept.
    """

    def __init__(self, rules):
        # sorted() is stable, so rules with equal priority keep their given order.
        self.rules = sorted(rules, key=lambda r: r.priority, reverse=True)
        self.log = []

    def evaluate(self, context, max_iterations=10):
        """Repeatedly apply the first applicable rule until none applies.

        Each iteration applies at most one rule and then restarts the scan from
        the highest-priority rule. The loop stops when no rule is applicable or
        after ``max_iterations`` applications.

        Returns:
            The same ``context`` object that was passed in.
        """
        iteration = 0
        while iteration < max_iterations:
            applied = False
            for rule in self.rules:
                if rule.is_applicable(context):
                    rule.apply(context, self.log)
                    applied = True
                    break  # Re-evaluate from the top due to possible context changes
            if not applied:
                break
            iteration += 1
        return context

    def get_log(self):
        """Return a copy of the log so callers cannot mutate the engine's own list."""
        return list(self.log)
Code from description
give me the full code from this description