diff --git a/classes/local/parser.php b/classes/local/parser.php
index c198b480..eec95421 100644
--- a/classes/local/parser.php
+++ b/classes/local/parser.php
@@ -33,7 +33,7 @@ class parser {
     protected array $tokenlist;
 
     /** @var int number of (raw) tokens */
-    private int $count;
+    protected int $count;
 
     /** @var int position w.r.t. list of (raw) tokens */
     private int $position = -1;
@@ -98,7 +98,7 @@ private function parse_the_right_thing(token $token) {
      *
      * @return void
      */
-    private function check_unbalanced_parens(): void {
+    protected function check_unbalanced_parens(): void {
         $parenstack = [];
         foreach ($this->tokenlist as $token) {
             $type = $token->type;
@@ -399,7 +399,7 @@ private function peek(int $skip = 0): ?token {
      *
      * @return token|null
      */
-    private function read_next(): ?token {
+    protected function read_next(): ?token {
         $nexttoken = $this->peek();
         if ($nexttoken !== self::EOF) {
             $this->position++;
diff --git a/classes/local/shunting_yard.php b/classes/local/shunting_yard.php
index da309966..2ffd79dc 100644
--- a/classes/local/shunting_yard.php
+++ b/classes/local/shunting_yard.php
@@ -528,6 +528,143 @@ public static function infix_to_rpn(array $tokens): array {
         return $output;
     }
 
+    /**
+     * Translate unit expression from infix into RPN notation via Dijkstra's shunting yard algorithm,
+     * because this makes evaluation much easier.
+     *
+     * Implicit multiplication (e. g. m kg) is made explicit and binds more tightly than division, so
+     * J / m K is read as J / (m K). A unary minus is folded into the number that follows it; note that
+     * this changes the value of the given NUMBER token in place. Syntax errors are reported via self::die().
+     *
+     * @param array $tokens the tokens forming the expression that is to be translated
+     * @return array the tokens in RPN order
+     */
+    public static function unit_infix_to_rpn(array $tokens): array {
+        $output = [];
+        $opstack = [];
+
+        $lasttoken = null;
+        $lasttype = null;
+        $lastvalue = null;
+        foreach ($tokens as $token) {
+            $type = $token->type;
+            $value = $token->value;
+
+            if (!is_null($lasttoken)) {
+                $lasttype = $lasttoken->type;
+                $lastvalue = $lasttoken->value;
+            }
+
+            // Insert implicit multiplication sign between two consecutive UNIT tokens, between a UNIT
+            // token and a parenthesis or between a closing and an opening parenthesis. For accurate
+            // error reporting, the row and column number of the implicit multiplication token are
+            // copied over from the current token which triggered the multiplication.
+            $unitunit = ($lasttype === token::UNIT && $type === token::UNIT);
+            $unitparen = ($lasttype === token::UNIT && $type === token::OPENING_PAREN);
+            $parenunit = ($lasttype === token::CLOSING_PAREN && $type === token::UNIT);
+            $parenparen = ($lasttype === token::CLOSING_PAREN && $type === token::OPENING_PAREN);
+            if ($unitunit || $unitparen || $parenunit || $parenparen) {
+                // For backwards compatibility, division will have a lower precedence than multiplication,
+                // in order for J / m K to be interpreted as J / (m K). Instead of introducing a special
+                // 'unit multiplication' pseudo-operator, we simply increase the multiplication's precedence
+                // by one when flushing operators from the opstack.
+                self::flush_higher_precedence($opstack, self::get_precedence('*') + 1, $output);
+                $opstack[] = new token(token::OPERATOR, '*', $token->row, $token->column);
+            }
+
+            // Two consecutive operators are only possible if the unary minus follows exponentiation.
+            // Note: We do not have to check whether the first of them is exponentiation, because we
+            // only allow - in the exponent anyway.
+            if ($type === token::OPERATOR && $lasttype === token::OPERATOR && $value !== '-') {
+                self::die(get_string('error_unexpectedtoken', 'qtype_formulas', $value), $token);
+            }
+
+            // The unary minus must be immediately followed by the number it negates. Otherwise,
+            // input like m^--2 or m^-(2) would leave a stray minus on the operator stack.
+            if ($lasttype === token::OPERATOR && $lastvalue === '-' && $type !== token::NUMBER) {
+                self::die(get_string('error_unexpectedtoken', 'qtype_formulas', $value), $token);
+            }
+
+            switch ($type) {
+                // UNIT tokens go straight to the output queue.
+                case token::UNIT:
+                    $output[] = $token;
+                    break;
+
+                // Numbers go to the output queue.
+                case token::NUMBER:
+                    // If the last token was the unary minus, we multiply the number by -1 before
+                    // sending it to the output queue. Afterwards, we can remove the minus from the opstack.
+                    if ($lasttype === token::OPERATOR && $lastvalue === '-') {
+                        $token->value = -$token->value;
+                        array_pop($opstack);
+                    }
+                    $output[] = $token;
+                    break;
+
+                // Opening parentheses go straight to the operator stack.
+                case token::OPENING_PAREN:
+                    $opstack[] = $token;
+                    break;
+
+                // A closing parenthesis means we flush all operators until we get to the
+                // matching opening parenthesis.
+                case token::CLOSING_PAREN:
+                    // A closing parenthesis must not occur immediately after an operator.
+                    if ($lasttype === token::OPERATOR) {
+                        self::die(get_string('error_unexpectedtoken', 'qtype_formulas', $value), $token);
+                    }
+                    self::flush_until_paren($opstack, token::OPENING_PAREN, $output);
+                    break;
+
+                // Deal with all the possible operators...
+                case token::OPERATOR:
+                    // Expressions must not start with an operator.
+                    if (is_null($lasttoken)) {
+                        self::die(get_string('error_unexpectedtoken', 'qtype_formulas', $value), $token);
+                    }
+                    // Operators must not follow an opening parenthesis, except for the unary minus.
+                    if ($lasttype === token::OPENING_PAREN && $value !== '-') {
+                        self::die(get_string('error_unexpectedtoken', 'qtype_formulas', $value), $token);
+                    }
+                    // Before fetching the precedence, we must translate ^ (caret) into **, because
+                    // the ^ operator normally has a different meaning with lower precedence.
+                    if ($value === '^') {
+                        $value = '**';
+                    }
+                    $thisprecedence = self::get_precedence($value);
+                    // We artificially increase the precedence of the multiplication operator, because
+                    // legacy versions used implicit parens around the denominator, e. g.
+                    // the expression J / m K would be interpreted as J / (m * K). This is consistent
+                    // with what tools like Wolfram Alpha do, even though e. g. 1 / 2 3 would be read
+                    // as 3/2 both by Formulas Question and Wolfram Alpha. And even if it were not, it
+                    // is not possible to change that, because it could break existing questions.
+                    if ($value === '*') {
+                        $thisprecedence++;
+                    }
+                    // Flush operators with higher precedence, unless we have a unary minus, because
+                    // it is not left-associative.
+                    if ($value !== '-') {
+                        self::flush_higher_precedence($opstack, $thisprecedence, $output);
+                    }
+                    // Put the operator on the stack.
+                    $opstack[] = $token;
+                    break;
+
+                // If we still haven't dealt with the token, there must be a problem with the input.
+                default:
+                    self::die(get_string('error_unexpectedtoken', 'qtype_formulas', $value), $token);
+
+            }
+
+            $lasttoken = $token;
+        }
+        // After last token, flush opstack. Last token must be either a number (in exponent),
+        // a closing parenthesis or a unit.
+        self::flush_all($opstack, $output);
+        return $output;
+    }
+
     /**
      * Stop processing and indicate the human readable position (row/column) where the error occurred.
      *
diff --git a/classes/local/token.php b/classes/local/token.php
index 4e34c8c0..922dec20 100644
--- a/classes/local/token.php
+++ b/classes/local/token.php
@@ -123,6 +123,9 @@ class token {
     /** @var int used to designate a token storing an end-of-group marker (closing brace) */
     const END_GROUP = 4194304;
 
+    /** @var int used to designate a token storing a unit */
+    const UNIT = 8388608;
+
     /** @var mixed the token's content, will be the name for identifiers */
     public $value;
 
diff --git a/classes/local/unit_parser.php b/classes/local/unit_parser.php
new file mode 100644
index 00000000..0b170817
--- /dev/null
+++ b/classes/local/unit_parser.php
@@ -0,0 +1,256 @@
+.
+
+namespace qtype_formulas\local;
+
+/*
+
+Notes about current implementation:
+
+- only 2 operators allowed: ^ for exponentiation and / for division
+- only one / allowed
+- no * allowed
+- no parens allowed, except in exponent or around the *entire* denominator
+- right side of / is considered in parens, even if not written, e.g. 
J/m*K --> J / (m*K)
+- only units, no numbers except for exponents
+- positive or negative exponents allowed
+- negative exponents allowed with or without parens
+- same unit not allowed more than once
+
+Future implementation, must be 100% backwards compatible
+
+- allow parens everywhere
+- allow * for explicit multiplication of units
+- still only allow one /
+- still not allow same unit more than once
+- if * is used after /, assume implicit parens, e. g. J / m * K --> J / (m * K)
+- do not allow operators other than *, / and ^ as well as unary - (in exponents only)
+- allow ** instead of ^
+
+*/
+
+
+/**
+ * Parser for units for qtype_formulas
+ *
+ * The constructor validates the unit expression, reclassifies all identifiers as UNIT tokens and
+ * stores the expression in RPN notation as the first statement. The result can be translated back
+ * into the legacy unit syntax with get_legacy_unit_string().
+ *
+ * @package qtype_formulas
+ * @copyright 2025 Philipp Imhof
+ * @license https://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
+ */
+class unit_parser extends parser {
+
+    /** @var array list of used units */
+    private array $unitlist = [];
+
+    /**
+     * Create a unit parser class and have it parse a given input. The input can be given as a string, in
+     * which case it will first be sent to the lexer. If that step has already been made, the constructor
+     * also accepts a list of tokens.
+     *
+     * @param string|array $tokenlist list of tokens as returned from the lexer or input string
+     */
+    public function __construct($tokenlist) {
+        // If the input is given as a string, run it through the lexer first.
+        if (is_string($tokenlist)) {
+            $lexer = new lexer($tokenlist);
+            $tokenlist = $lexer->get_tokens();
+        }
+        $this->tokenlist = $tokenlist;
+        $this->count = count($tokenlist);
+
+        // Check for unbalanced / mismatched parentheses.
+        $this->check_parens();
+
+        // Track whether we have already seen a slash or a unit and whether we are inside an exponent.
+        $seenslash = false;
+        $seenunit = false;
+        $inexponent = false;
+        foreach ($tokenlist as $token) {
+            // The use of functions is not permitted in units, so all identifiers will be classified
+            // as UNIT tokens.
+            if ($token->type === token::IDENTIFIER) {
+                // If inside an exponent, only numbers (and maybe the unary minus) are allowed.
+                if ($inexponent) {
+                    $this->die(get_string('error_unexpectedtoken', 'qtype_formulas', $token->value), $token);
+                }
+                // The same unit must not be used more than once.
+                // NOTE(review): unlike the other messages, this one is not localised via get_string().
+                if ($this->has_unit_been_used($token)) {
+                    $this->die('Unit already used: ' . $token->value, $token);
+                }
+                $this->unitlist[] = $token->value;
+                $token->type = token::UNIT;
+                $seenunit = true;
+                continue;
+            }
+
+            // Do various syntax checks for operators.
+            if ($token->type === token::OPERATOR) {
+                // We can only accept an operator if there has been at least one unit before.
+                if (!$seenunit) {
+                    $this->die(get_string('error_unexpectedtoken', 'qtype_formulas', $token->value), $token);
+                }
+                // The only operators allowed are exponentiation, multiplication, division and the unary minus.
+                // Note that the caret (^) always means exponentiation in the context of units.
+                if (!in_array($token->value, ['^', '**', '/', '*', '-'], true)) {
+                    $this->die(get_string('error_unexpectedtoken', 'qtype_formulas', $token->value), $token);
+                }
+                // The unary minus is only allowed inside an exponent.
+                if ($token->value === '-' && !$inexponent) {
+                    $this->die(get_string('error_unexpectedtoken', 'qtype_formulas', $token->value), $token);
+                }
+                // Only the unary minus is allowed inside an exponent.
+                if ($inexponent && $token->value !== '-') {
+                    $this->die(get_string('error_unexpectedtoken', 'qtype_formulas', $token->value), $token);
+                }
+                if ($token->value === '^' || $token->value === '**') {
+                    $inexponent = true;
+                }
+                if ($token->value === '/') {
+                    if ($seenslash) {
+                        $this->die(get_string('error_unexpectedtoken', 'qtype_formulas', $token->value), $token);
+                    }
+                    $seenslash = true;
+                }
+                continue;
+            }
+
+            // Numbers can only be used as exponents and exponents must always be integers.
+            if ($token->type === token::NUMBER) {
+                if (!$inexponent) {
+                    $this->die(get_string('error_unexpectedtoken', 'qtype_formulas', $token->value), $token);
+                }
+                if (intval($token->value) != $token->value) {
+                    $this->die(get_string('error_integerexpected', 'qtype_formulas', $token->value), $token);
+                }
+                // Only one number is allowed in an exponent, so after the number the
+                // exponent must be finished.
+                $inexponent = false;
+                continue;
+            }
+
+            // Parentheses are allowed, but we don't have to do anything with them now.
+            if (in_array($token->type, [token::OPENING_PAREN, token::CLOSING_PAREN], true)) {
+                continue;
+            }
+
+            // All other tokens are not allowed.
+            $this->die(get_string('error_unexpectedtoken', 'qtype_formulas', $token->value), $token);
+        }
+
+        // The last token must be a number, a unit or a closing parenthesis. For an empty token list,
+        // end() returns false; we map that to null, so the check does not depend on the loop variable
+        // (which would be undefined in that case) and does not read a property from a boolean.
+        $finaltoken = end($tokenlist) ?: null;
+        $finaltype = is_null($finaltoken) ? null : $finaltoken->type;
+        if (!in_array($finaltype, [token::UNIT, token::NUMBER, token::CLOSING_PAREN], true)) {
+            $this->die(get_string('error_unexpectedtoken', 'qtype_formulas', $finaltoken->value ?? ''), $finaltoken);
+        }
+
+        $this->statements[] = shunting_yard::unit_infix_to_rpn($this->tokenlist);
+    }
+
+    /**
+     * Check whether a given unit has already been used.
+     *
+     * @param token $token token containing the unit
+     * @return bool
+     */
+    protected function has_unit_been_used(token $token): bool {
+        return in_array($token->value, $this->unitlist, true);
+    }
+
+    /**
+     * Check whether all parentheses are balanced and whether only round parens are used.
+     * Otherwise, stop all further processing and output an error message.
+     *
+     * @return void
+     */
+    protected function check_parens(): void {
+        $parenstack = [];
+        foreach ($this->tokenlist as $token) {
+            $type = $token->type;
+            // We only allow round parens.
+            if (($type & token::ANY_PAREN) && !($type & token::OPEN_OR_CLOSE_PAREN)) {
+                $this->die(get_string('error_unexpectedtoken', 'qtype_formulas', $token->value), $token);
+            }
+            if ($type === token::OPENING_PAREN) {
+                $parenstack[] = $token;
+            }
+            if ($type === token::CLOSING_PAREN) {
+                $top = end($parenstack);
+                // If stack is empty, we have a stray closing paren.
+                if (!($top instanceof token)) {
+                    $this->die(get_string('error_strayparen', 'qtype_formulas', $token->value), $token);
+                }
+                array_pop($parenstack);
+            }
+        }
+        // If the stack of parentheses is not empty now, we have an unmatched opening parenthesis.
+        if (!empty($parenstack)) {
+            $unmatched = end($parenstack);
+            $this->die(get_string('error_parennotclosed', 'qtype_formulas', $unmatched->value), $unmatched);
+        }
+    }
+
+    /**
+     * Translate the given input into a string that can be understood by the legacy unit parser, i. e.
+     * following all syntax rules. This allows keeping the old unit conversion system in place until
+     * we are ready to eventually replace it.
+     *
+     * @return string
+     */
+    public function get_legacy_unit_string(): string {
+        $stack = [];
+
+        foreach ($this->statements[0] as $token) {
+            // Write numbers and units to the stack.
+            if (in_array($token->type, [token::UNIT, token::NUMBER], true)) {
+                $value = $token->value;
+                if (is_numeric($value) && $value < 0) {
+                    $value = '(' . strval($value) . ')';
+                }
+                $stack[] = $value;
+            }
+
+            // Operators take arguments from stack and stick them together in the appropriate way.
+            if ($token->type === token::OPERATOR) {
+                $op = $token->value;
+                if ($op === '**') {
+                    $op = '^';
+                }
+                if ($op === '*') {
+                    $op = ' ';
+                }
+                $second = array_pop($stack);
+                $first = array_pop($stack);
+                // With the new syntax, it is possible to write e.g. (m/s^2)*kg. In older versions,
+                // everything coming after the / operator will be considered a part of the denominator,
+                // so the only way to get the kg into the numerator is to reorder the units and
+                // write them as kg*m/s^2. Long story short: if there is a division, it must come last.
+                // Note that the syntax currently does not allow more than one /, so we do not need
+                // a more sophisticated solution.
+                // NOTE(review): this swap is also applied when the operator is ^, so (m/s)^2 seems to end
+                // up as 2^m/s; confirm whether an exponent on a quotient should be rejected earlier.
+                if (strpos($first, '/') !== false) {
+                    [$second, $first] = [$first, $second];
+                }
+                // Legacy syntax allowed parens around the entire denominator, so we do that unless the
+                // denominator is just one unit.
+                if ($op === '/' && !preg_match('/^[A-Za-z]+$/', $second)) {
+                    $second = '(' . $second . ')';
+                }
+                $stack[] = $first . $op . $second;
+            }
+        }
+
+        return implode('', $stack);
+    }
+}
diff --git a/lang/en/qtype_formulas.php b/lang/en/qtype_formulas.php
index 029f9afc..d7dd165e 100644
--- a/lang/en/qtype_formulas.php
+++ b/lang/en/qtype_formulas.php
@@ -165,6 +165,7 @@
 $string['error_import_missing_field'] = 'Import error. Missing field: {$a} ';
 $string['error_in_answer'] = 'Error in answer #{$a->answerno}: {$a->message}';
 $string['error_indexoutofrange'] = 'Evaluation error: index {$a} out of range.';
+$string['error_integerexpected'] = 'Syntax error: integer expected, found {$a} instead.';
 $string['error_inv_consec'] = 'When using inv(), the numbers in the list must be consecutive.';
 $string['error_inv_integers'] = 'inv() expects all elements of the list to be integers; floats will be truncated.';
 $string['error_inv_list'] = 'inv() expects a list.';
diff --git a/tests/unit_parser_test.php b/tests/unit_parser_test.php
new file mode 100644
index 00000000..cbc09031
--- /dev/null
+++ b/tests/unit_parser_test.php
@@ -0,0 +1,176 @@
+.
+
+namespace qtype_formulas;
+
+defined('MOODLE_INTERNAL') || die();
+
+global $CFG;
+require_once($CFG->dirroot . 
'/question/type/formulas/questiontype.php');
+
+use Exception;
+use qtype_formulas\local\unit_parser;
+
+/**
+ * Unit tests for the unit_parser class.
+ *
+ * @package qtype_formulas
+ * @category test
+ * @copyright 2025 Philipp Imhof
+ * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
+ *
+ * @covers \qtype_formulas\local\unit_parser
+ */
+final class unit_parser_test extends \advanced_testcase {
+
+    /**
+     * Temporary scratch test, to be removed before release as its name says. The statements after
+     * the early return are never executed; they only serve for manual debugging.
+     */
+    public function test_parse_unit_FIXME_REMOVE_WHEN_FINISHED() {
+        self::assertTrue(true);
+        return;
+        $input = '(m/s)^2';
+        $parser = new unit_parser($input);
+        var_dump($parser->get_statements()[0]);
+
+        echo $parser->get_legacy_unit_string();
+    }
+
+    /**
+     * Test parsing of unit inputs.
+     *
+     * @param string $expected legacy unit string or, if prefixed with an exclamation mark, the error message
+     * @param string $input unit expression to be parsed
+     *
+     * @dataProvider provide_units
+     */
+    public function test_parse_unit($expected, $input): void {
+        $e = null;
+        $error = '';
+        try {
+            new unit_parser($input);
+        } catch (Exception $e) {
+            $error = $e->getMessage();
+        }
+
+        // If we are expecting an error message, the exception object should not be null and
+        // the message should match, without checking row and column number.
+        if ($expected[0] === '!') {
+            self::assertNotNull($e);
+            self::assertStringEndsWith(substr($expected, 1), $error);
+        } else {
+            self::assertNull($e);
+        }
+    }
+
+    /**
+     * Test conversion of unit inputs to legacy input format.
+     *
+     * @param string $expected legacy unit string or, if prefixed with an exclamation mark, the error message
+     * @param string $input unit expression to be parsed
+     *
+     * @dataProvider provide_units
+     */
+    public function test_get_legacy_unit_string($expected, $input): void {
+        $e = null;
+        $parser = null;
+        try {
+            $parser = new unit_parser($input);
+        } catch (Exception $e) {
+            // Whether or not an exception was expected is checked below.
+            $parser = null;
+        }
+
+        // If we are expecting an error, it is enough to verify that parsing has failed.
+        if ($expected[0] === '!') {
+            self::assertNotNull($e);
+            return;
+        }
+
+        // Otherwise, parsing must have succeeded; asserting this first gives a proper test failure
+        // rather than an error from calling a method on a parser that does not exist.
+        self::assertNull($e, 'Unexpected error for input "' . $input . '"');
+        self::assertEquals($expected, $parser->get_legacy_unit_string());
+    }
+
+    /**
+     * Data provider for the test functions. For simplicity, we use the same provider
+     * for valid and invalid expressions. In case of invalid expressions, we put an
+     * exclamation mark (!) at the start of the error message.
+     *
+     * @return array
+     */
+    public static function provide_units(): array {
+        return [
+            ['J/(m K)', 'J / m K'],
+            ['J/(m K)', 'J / m*K'],
+            ['J/(m K)', 'J / (m K)'],
+            ['J/(m K)', 'J / (m*K)'],
+            ['m kg/(s^2)', 'm kg/s^2'],
+            ['m kg/(s^2)', 'm kg / s^2'],
+            ['m kg/(s^2)', 'm*kg / s^2'],
+            ['m kg/(s^2)', 'm*(kg / s^2)'],
+            ['kg m/(s^2)', '(m/s^2)*kg'],
+            ['kg m/(s^2)', '(m/s^2) kg'],
+            ['m kg/(s^2)', '(m (kg / s^(2)))'],
+            ['m K kg/s', 'm (kg / s) K'],
+            ['s^(-1)', 's^-1'],
+            ['s^2', 's**2'],
+            ['s^(-1)', 's**-1'],
+            ['s^(-1)', 's^(-1)'],
+            ['s^(-1)', 's**(-1)'],
+            ['s^(-1)/(m^(-1))', 's**-1 / m**-1'],
+            ['m', 'm'],
+            ['m', '(m)'],
+            ['km', 'km'],
+            ['m^2', 'm^2'],
+            ['m^2', 'm^(2)'],
+            ['m^2', '(m^2)'],
+            ['m^2', 'm**2'],
+            ['m^2', '(m**2)'],
+            ['m^2', 'm**(2)'],
+            ['m^2', 'm ^ 2'],
+            ['m^2', 'm ^ (2)'],
+            ['m^2', 'm ** 2'],
+            ['m^2', 'm ** (2)'],
+            ['m^(-2)', 'm^-2'],
+            ['m^(-2)', '(m^-2)'],
+            ['m^(-2)', 'm^(-2)'],
+            ['m^(-2)', 'm ^ -2'],
+            ['m^(-2)', 'm ^ (-2)'],
+            ['m/s', 'm/s'],
+            ['m/s', '(m)/(s)'],
+            ['m/s', '(m/s)'],
+            ['m s^(-1)', 'm s^-1'],
+            ['m s^(-1)', 'm (s^-1)'],
+            ['m s^(-1)', 'm (s^(-1))'],
+            ['m s^(-1)', 'm s^(-1)'],
+            ['m/(s^(-1))', 'm / (s^(-1))'],
+            ['m/(s^(-1))', 'm / ((s^(-1)))'],
+            ['kg m/s', 'kg m/s'],
+            ['kg m/s', 'kg (m/s)'],
+            ['kg m/s', 'kg*(m/s)'],
+            ['kg m/s', 'kg*m/s'],
+            ['kg m/s', '(kg m)/s'],
+            ['kg m/s', '(kg*m)/s'],
+            ['kg m s^(-1)', 'kg m s^-1'],
+            ['kg m^2', 'kg m^2'],
+            ['kg m^2', 'kg m ^ 2'],
+            ['kg m s^(-1)', 'kg m s ^ - 1'],
+            ['!Unit already used: m', 'm kg / m'],
+            ['!Unexpected token: 1', 'm 1/s'],
+            ['!Unexpected token: 1', '1/s'],
+            ['!Unexpected token: 1', '1 m/s'],
+            ['!Unexpected token: 2', '2/s'],
+            ['!Unexpected token: 2.1', '2.1'],
+            ['!Unexpected token: ^', '^2'],
+            ['!Unexpected token: *', '*s'],
+            ['!Unexpected token: *', 'm* *kg'],
+            ['!Unexpected token: /', '/s'],
+            ['!Unexpected token: *', 'm*'],
+            ['!Unexpected token: /', 'm/'],
+            ['!Unexpected token: ^', 'm^'],
+            ['!Unexpected token: /', 'm^(/2)'],
+            ['!Unexpected token: +', 'm^+2'],
+            ["!Unexpected input: '@'", '@'],
+        ];
+    }
+}
diff --git a/version.php b/version.php
index 7d5ff3a9..d55c6d91 100644
--- a/version.php
+++ b/version.php
@@ -25,7 +25,7 @@
 defined('MOODLE_INTERNAL') || die();
 
 $plugin->component = 'qtype_formulas';
-$plugin->version = 2025021400;
+$plugin->version = 2025021402;
 $plugin->cron = 0;
 $plugin->requires = 2022112800;