LLM Code Understanding Evaluation - amosproj/amos2025ss04-ai-driven-testing GitHub Wiki

🎯 Goal

As a user, I want to know how well an LLM understands my code, so I can decide which LLM best fits my use case.

🚀 Result

LLMs demonstrate strong capabilities in generating code from descriptions; however, their ability to accurately describe code remains limited. This limitation becomes more pronounced in smaller models, which tend to produce lower-quality descriptions with less detail and accuracy.

✅ Summary of Test Results

🧠 Mistral

  • 2/2 tests failed
  • Error:
    TypeError: Rule.__init__() missing 1 required positional argument: 'priority'
  • Analysis:
    Mistral used correct syntax and understood basic usage, but missed full constructor requirements.
    Verdict: partial understanding.
class Rule:
    """A named condition/action pair with a priority and an execution cap."""
    def __init__(self, name, condition, action, priority, max_executions=None):
        """Store the rule's configuration and start the execution counter at 0.

        `priority` has no default in this signature, so every caller must
        supply it.
        """
        self.name = name
        self.condition = condition
        self.action = action
        self.priority = priority
        # None means "no cap": it is replaced by infinity, so the `<`
        # comparison in is_applicable() holds for any finite counter.
        self.max_executions = max_executions if max_executions is not None else float('inf')
        self.executions = 0

    def is_applicable(self, context):
        """Return a truthy value when the condition accepts `context` and the cap is not reached."""
        # The condition is evaluated first; the cap is only compared when the
        # condition returned something truthy.
        return self.condition(context) and self.executions < self.max_executions

    def apply(self, context, log):
        """Run the action if the rule is applicable, then count and log it.

        Does nothing when is_applicable() is falsy.
        """
        if self.is_applicable(context):
            self.action(context)
            self.executions += 1
            # The entry is a dict that references the same `context` object
            # (no copy is made).
            log.append({'rule': self.name, 'context': context})

class RuleEngine:
    """Holds a priority-sorted list of rules and a log list handed to each rule's apply()."""
    def __init__(self, rules, sort_by_priority=True):
        """Sort `rules` by their `priority` attribute and start with an empty log."""
        # `sort_by_priority` is passed straight through as `reverse`: True
        # puts the highest priority first, False puts the lowest first. The
        # rules are sorted in both cases.
        self.rules = [rule for rule in sorted(rules, key=lambda r: r.priority, reverse=sort_by_priority)]
        self.log = []

    def evaluate(self, context, max_iterations=None):
        """Make a single pass over the rules, applying each applicable one.

        `max_iterations` defaults to the number of rules. Returns the
        engine's log list.
        """
        if max_iterations is None:
            max_iterations = len(self.rules)

        for rule in self.rules:
            if rule.is_applicable(context):
                rule.apply(context, self.log)
                # The limit is compared with the total log length. The log is
                # never cleared in this class, so entries from earlier
                # evaluate() calls count towards the limit as well.
                if len(self.log) >= max_iterations:
                    break
            else:
                continue
        return self.log

    def get_log(self):
        """Return the log list itself (the same object, not a copy)."""
        return self.log
import unittest

class RuleTest(unittest.TestCase):
    """Unit tests for Rule.is_applicable() and Rule.apply()."""
    def setUp(self):
        # NOTE(review): only name, condition and action are passed. If the
        # Rule under test requires `priority`, this call raises the kind of
        # TypeError reported for these tests -- confirm which Rule
        # implementation they were run against.
        self.rule = Rule("test_rule", lambda ctx: True, lambda ctx: None)

    def test_is_applicable(self):
        # Testing is_applicable method
        self.assertTrue(self.rule.is_applicable({}))
        # Push the counter past the cap; the rule is then expected to report
        # that it is no longer applicable.
        self.rule.executions = self.rule.max_executions + 1
        self.assertFalse(self.rule.is_applicable({}))

    def test_apply(self):
        # Testing apply method
        context = {}
        log = []
        self.rule.apply(context, log)
        # Expects exactly one log entry, equal to the string
        # "Rule '<name>' executed", and an execution counter of 1.
        self.assertEqual(len(log), 1)
        self.assertEqual(log[0], f"Rule '{self.rule.name}' executed")
        self.assertEqual(self.rule.executions, 1)

class RuleEngineTest(unittest.TestCase):
    """Unit tests for RuleEngine.evaluate() and RuleEngine.get_log()."""
    def setUp(self):
        # One rule whose condition is always true and one whose condition is
        # always false; neither is given a `priority`.
        self.rules = [
            Rule("test_rule", lambda ctx: True, lambda ctx: None),
            Rule("another_rule", lambda ctx: False, lambda ctx: None)
        ]
        self.engine = RuleEngine(self.rules)

    def test_evaluate(self):
        # Testing evaluate method
        context = {}
        result = self.engine.evaluate(context)
        # NOTE(review): "test_rule" in setUp has an always-true condition, so
        # the "none were applicable" remark on the next line does not match
        # the fixture, and the empty-log assertion that follows it conflicts
        # with the non-empty log expected further down.
        self.assertEqual(result, {})  # No rule was applied since none were applicable
        self.assertCountEqual(self.engine.get_log(), [])

        context = {"some_key": "some_value"}
        result = self.engine.evaluate(context)
        # NOTE(review): the context passed in is non-empty here, yet the
        # result is still compared with an empty dict.
        self.assertEqual(result, {})  # Only test_rule was applied since it's the highest priority
        self.assertCountEqual(self.engine.get_log(), ["Rule 'test_rule' executed"])

# Run the unittest test cases when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()

🧠 Phi4

  • 1/6 tests passed, ❌ 5 failed
  • Errors:
    • Used Rule without action parameter → TypeError
    • Accessed undefined execute_count attribute → AttributeError
  • Analysis:
    Phi attempted more complex logic and integration, but missed key attributes and constructor arguments.
    Verdict: a deeper attempt, but with a flawed understanding of the internals.
class Rule:
    """A named condition/action pair with a priority and an execution cap."""
    def __init__(self, name, condition, action, priority=0, max_executions=1):
        """Store the rule's configuration and start the `executions` counter at 0."""
        self.name = name
        self.condition = condition
        self.action = action
        self.priority = priority
        self.max_executions = max_executions
        self.executions = 0

    def is_applicable(self, context):
        """Intended to report whether the rule may still run for `context`."""
        # NOTE(review): this reads `self.execute_count`, but __init__ only
        # sets `self.executions`; nothing in this class assigns
        # `execute_count`, so this lookup raises AttributeError as written.
        return (self.execute_count < self.max_executions and
                self.condition(context))

    def apply(self, context, log=None):
        """Call the action while the cap is not reached and return `(True, result)`.

        Falls through and implicitly returns None once
        `executions >= max_executions`.
        """
        if not getattr(self, 'executions', 0) >= self.max_executions:
            # NOTE(review): the action is called without arguments, and
            # `self.executions` is never incremented in this method.
            result = self.action()
            if log is None:  # initialize first time
                global_log = []
                self.log = lambda x: (global_log.append(x))
            else:
                # NOTE(review): `self.log` is only assigned in the branch
                # above; if the first call already passes a `log`, this
                # attribute does not exist yet. The `log` argument itself is
                # not written to.
                self.log(f"Rule {self.name} executed with context {context}")
            return True, result
class RuleEngine:
    """Engine that sorts rules by descending priority and loops over them while logging."""
    # NOTE(review): this class defines no get_log() method.
    def __init__(self, rules):
        """Sort `rules` so the highest priority comes first and create an empty log list."""
        sorted_rules = [rule for rule in sorted(rules, key=lambda r: -r.priority)]
        self.rules = sorted_rules
        self.log = []

    def evaluate(self, context, max_iterations=10):
        """Run up to `max_iterations` rounds over the currently applicable rules.

        Returns a bool (the result of a `len(...) == 0` comparison), not the
        context.
        """
        iteration_count = 0

        # NOTE(review): the list comprehension in the condition yields one
        # True/False per rule, so its length equals len(self.rules); the
        # `> 0` test therefore holds whenever there is at least one rule,
        # whatever the execution counters are.
        while (iteration_count < max_iterations and
               len([True if not getattr(rule, 'executions', None) >= rule.max_executions else False for rule in
                    self.rules]) > 0):

            applicable_rules = [rule for rule in self.rules
                                if rule.is_applicable(context)]

            executed_this_iteration = []

            while (len(applicable_rules)) and (
                    len(executed_this_iteration) < max_iterations):
                selected_rule = applicable_rules.pop(0)
                # NOTE(review): `execute_count` is assigned but not used afterwards.
                execute_count = getattr(selected_rule, 'executions', 0)

                try:
                    # NOTE(review): as written, the next line is a standalone
                    # expression that reads `applied_successfully` before it
                    # is ever assigned (a NameError, caught and printed by the
                    # handler below). It looks like a tuple unpacking that was
                    # split across two lines -- confirm against the model's
                    # raw output.
                    applied_successfully,
                    result = selected_rule.apply(context, self.log)

                    if not applied_successfully:  # rule fails to apply so just continue with next iteration
                        continue

                    # Rule names (strings) are collected here.
                    executed_this_iteration.append(selected_rule.name)
                except Exception as e:
                    print(e.__str__())

            # NOTE(review): the elements of `executed_this_iteration` are rule
            # names, yet `.name` is read from each element here.
            application_log_entry = f"Iteration {iteration_count}: Executed rules {[r.name for r in executed_this_iteration]}."
            self.log += [application_log_entry]

            iteration_count += 1

        # NOTE(review): same list as in the loop condition; its length is 0
        # only when there are no rules at all.
        return len([True if not getattr(rule, 'executions', None) >= rule.max_executions else False for rule in
                    self.rules]) == 0
def test_rule_initialization():
    """Check that Rule stores the given name and that condition and action are callable."""
    rule = Rule(name='TestRule',
                condition=lambda context: True,
                action=lambda context: print("Action executed"),
                priority=1)

    assert rule.name == 'TestRule'
    assert callable(rule.condition)
    assert callable(rule.action)


def test_rule_applicability():
    """Check that is_applicable() returns a bool for a rule with an always-true condition."""
    context = {}
    # `action` is None; this test never calls apply().
    rule = Rule(name='AlwaysTrueRule',
                condition=lambda ctx: True,
                action=None)

    # Only the type of the result is checked, not its value.
    assert isinstance(rule.is_applicable(context), bool) == True


def test_rule_engine_single_rules():
    """Evaluate an engine with one applicable and one non-applicable rule."""
    context = {}
    r1 = Rule(name='Applicable', condition=lambda ctx: True,
              action=lambda ctx: None)
    # NOTE(review): `r2` is built without an `action` argument; if Rule
    # requires `action`, this call raises TypeError before the engine is
    # created.
    r2 = Rule(name='NotApplicable', condition=lambda ctx: False,
              max_executions=0)

    engine = RuleEngine(rules=[r1, r2])
    result_context = engine.evaluate(context)

    assert isinstance(result_context, dict)  # Assuming context is a dictionary that can change


def test_rule_engine_multiple_rules():
    """Build an engine from a high- and a low-priority rule and evaluate it.

    NOTE(review): the test contains no assertions; it only passes or fails
    on whether an exception is raised.
    """
    def rule1_action(ctx):
        ctx['rule1'] = True

    def rule2_condition(ctx):
        return 'condition_met' in ctx and not ctx.get('rule1', False)

    # The context is empty, so it contains neither 'priority_high' nor
    # 'condition_met'.
    context = {}

    r1 = Rule(name='HighPriorityRule',
              condition=lambda ctx: 'priority_high' in ctx,
              action=rule1_action, priority=10)

    # NOTE(review): `r2` is built without an `action` argument; if Rule
    # requires `action`, this call raises TypeError.
    r2 = Rule(name='LowPriorityRule',
              condition=rule2_condition,
              max_executions=5)

    engine = RuleEngine(rules=[r1, r2])

    result_context = engine.evaluate(context)


def test_rule_engine_logging():
    """Compare the engine's log with ten "Rule 'LoggingRule' executed" entries."""
    context = {}

    def rule_action(ctx):
        # NOTE(review): `context` starts empty and 'executed' is never
        # initialised, so this `+=` raises KeyError if the action is called
        # with that context.
        ctx['executed'] += 1

    r = Rule(name='LoggingRule',
             condition=lambda ctx: True,
             action=rule_action, priority=5)

    engine = RuleEngine(rules=[r])

    result_context = engine.evaluate(context)

    # The log entries are joined without a separator and compared with the
    # expected entries joined by newlines plus a trailing newline.
    log_entries = ["Rule 'LoggingRule' executed"] * 10
    assert ''.join(engine.get_log()) == '\n'.join(log_entries) + "\n"


def test_rule_engine_max_executions():
    """Check that a rule capped at three executions runs exactly three times."""
    context = {}

    def action(ctx):
        # NOTE(review): `context` starts empty and 'executed' is never
        # initialised, so this `+=` raises KeyError if the action is called
        # with that context.
        ctx['executed'] += 1

    r = Rule(name='MaxExcutedRule',
             condition=lambda ctx: True,
             action=action, priority=5, max_executions=3)

    engine = RuleEngine(rules=[r])

    for _ in range(4):  # Attempting to execute the rule more times than allowed
        result_context = engine.evaluate(context)

    assert context['executed'] == 3

🧠 TINYLLAMA

  • Syntax Error
  • Error:
    Rule(name='rule2', condition=lambda x: x < 5)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    SyntaxError: invalid syntax. Perhaps you forgot a comma?
    
# Define Rule and RuleEngine classes as before
from rule import Rule, RuleEngine

# NOTE(review): there is no comma between the 'rule2' and 'rule3' entries of
# the list below, which is the syntax error reported for this snippet. Every
# Rule is also built from `name` and `condition` only.
rules = [Rule(name='rule1', condition=lambda x: x > 0),
          Rule(name='rule2', condition=lambda x: x < 5)
          Rule(name='rule3', condition=lambda x: x % 2 == 0)]

engine = RuleEngine(rules) # Create new instance of RuleEngine class
context = {
     'x': 1,
}
log = []
# NOTE(review): the loop iterates over engine.get_log() and calls
# engine.apply(...) with a `log_level` keyword; `NotApplicable` is neither
# defined nor imported in this snippet -- verify all three against the
# `rule` module.
for rule in engine.get_log():
     try:
         context = engine.apply(context, log, log_level=10)
     except NotApplicable:
         print(f"Rule '{rule.name}' not applicable at context {context}")

print(f"Log after evaluating all rules:\n{log}")
from unittest.mock import Mock

# NOTE(review): `fixtures` is not imported in this snippet.
class RuleFixture(fixtures.Fixture):
    def __init__(self, rules):
        # The `rules` argument is not used; two fixed rules with Mock()
        # conditions and no-op actions are stored instead.
        self.rules = [Rule('name1', Mock(), lambda x: None), Rule('name2', Mock(), lambda x: None)]

# NOTE(review): `pytest` is not imported and `rules` is not defined in this
# snippet -- verify where both are meant to come from. The engine receives a
# RuleFixture object rather than a list of rules.
@pytest.fixture()
def rule_engine():
    """Return a RuleEngine constructed from a RuleFixture."""
    return RuleEngine(RuleFixture(rules))

from unittest.mock import Mock

# NOTE(review): `pytest` is not imported in this snippet. The engine receives
# a RuleFixture object rather than a list of rules.
@pytest.fixture()
def rule_engine():
    """Return a RuleEngine constructed from a RuleFixture wrapping two mock-based rules."""
    return RuleEngine(RuleFixture([Rule('name1', Mock(), lambda x: None), Rule('name2', Mock(), lambda x: None)]))

Prompts:

Generate unit tests

Generate tests for the following code:

class Rule:
    """A named rule: a condition and an action with a priority and a cap on executions."""
    def __init__(self, name, condition, action, priority=0, max_executions=1):
        """Store the rule's settings; the execution counter starts at 0."""
        self.name = name
        self.condition = condition  # function: context -> bool
        self.action = action        # function: context -> None
        self.priority = priority
        self.max_executions = max_executions
        self.executions = 0

    def is_applicable(self, context):
        """Return a truthy value while the cap is not reached and the condition accepts `context`."""
        # The cap is checked first, so the condition is not called once the
        # rule has used up its executions.
        return self.executions < self.max_executions and self.condition(context)

    def apply(self, context, log):
        """Run the action on `context`, count the execution and append a message to `log`."""
        # No applicability check happens here; RuleEngine.evaluate() calls
        # is_applicable() before calling this method.
        self.action(context)
        self.executions += 1
        log.append(f"Rule '{self.name}' executed")

class RuleEngine:
    """Applies rules to a context, highest priority first, and keeps a log of executions."""
    def __init__(self, rules):
        """Sort `rules` by descending priority and start with an empty log."""
        self.rules = sorted(rules, key=lambda r: r.priority, reverse=True)
        self.log = []

    def evaluate(self, context, max_iterations=10):
        """Repeatedly apply the first applicable rule until none applies or the limit is hit.

        Each iteration applies at most one rule and then restarts from the
        top of the priority-sorted list. Returns the same `context` object
        that was passed in.
        """
        iteration = 0
        while iteration < max_iterations:
            applied = False
            for rule in self.rules:
                if rule.is_applicable(context):
                    rule.apply(context, self.log)
                    applied = True
                    break  # Re-evaluate from the top due to possible context changes
            # Stop as soon as a full pass finds no applicable rule.
            if not applied:
                break
            iteration += 1
        return context

    def get_log(self):
        """Return a copy of the log as a new list."""
        return list(self.log)

Describe Code

The following code defines two classes. Please describe their purpose, what each method does, and how they work together.

class Rule:
    """A named rule: a condition and an action with a priority and a cap on executions."""
    def __init__(self, name, condition, action, priority=0, max_executions=1):
        """Store the rule's settings; the execution counter starts at 0."""
        self.name = name
        self.condition = condition  # function: context -> bool
        self.action = action        # function: context -> None
        self.priority = priority
        self.max_executions = max_executions
        self.executions = 0

    def is_applicable(self, context):
        """Return a truthy value while the cap is not reached and the condition accepts `context`."""
        # The cap is checked first, so the condition is not called once the
        # rule has used up its executions.
        return self.executions < self.max_executions and self.condition(context)

    def apply(self, context, log):
        """Run the action on `context`, count the execution and append a message to `log`."""
        # No applicability check happens here; RuleEngine.evaluate() calls
        # is_applicable() before calling this method.
        self.action(context)
        self.executions += 1
        log.append(f"Rule '{self.name}' executed")

class RuleEngine:
    """Applies rules to a context, highest priority first, and keeps a log of executions."""
    def __init__(self, rules):
        """Sort `rules` by descending priority and start with an empty log."""
        self.rules = sorted(rules, key=lambda r: r.priority, reverse=True)
        self.log = []

    def evaluate(self, context, max_iterations=10):
        """Repeatedly apply the first applicable rule until none applies or the limit is hit.

        Each iteration applies at most one rule and then restarts from the
        top of the priority-sorted list. Returns the same `context` object
        that was passed in.
        """
        iteration = 0
        while iteration < max_iterations:
            applied = False
            for rule in self.rules:
                if rule.is_applicable(context):
                    rule.apply(context, self.log)
                    applied = True
                    break  # Re-evaluate from the top due to possible context changes
            # Stop as soon as a full pass finds no applicable rule.
            if not applied:
                break
            iteration += 1
        return context

    def get_log(self):
        """Return a copy of the log as a new list."""
        return list(self.log)

Code from description

give me the full code from this description