Skip to content

Commit d7df1b6

Browse files
committed
first step for unit parser
1 parent 0db3c1b commit d7df1b6

File tree

7 files changed

+567
-4
lines changed

7 files changed

+567
-4
lines changed

classes/local/parser.php

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ class parser {
3333
protected array $tokenlist;
3434

3535
/** @var int number of (raw) tokens */
36-
private int $count;
36+
protected int $count;
3737

3838
/** @var int position w.r.t. list of (raw) tokens */
3939
private int $position = -1;
@@ -98,7 +98,7 @@ private function parse_the_right_thing(token $token) {
9898
*
9999
* @return void
100100
*/
101-
private function check_unbalanced_parens(): void {
101+
protected function check_unbalanced_parens(): void {
102102
$parenstack = [];
103103
foreach ($this->tokenlist as $token) {
104104
$type = $token->type;
@@ -400,7 +400,7 @@ private function peek(int $skip = 0): ?token {
400400
*
401401
* @return token|null
402402
*/
403-
private function read_next(): ?token {
403+
protected function read_next(): ?token {
404404
$nexttoken = $this->peek();
405405
if ($nexttoken !== self::EOF) {
406406
$this->position++;

classes/local/shunting_yard.php

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -528,6 +528,133 @@ public static function infix_to_rpn(array $tokens): array {
528528
return $output;
529529
}
530530

531+
/**
 * Translate unit expression from infix into RPN notation via Dijkstra's shunting yard algorithm,
 * because this makes evaluation much easier.
 *
 * An implicit multiplication is inserted between two consecutive units, between a unit and a
 * parenthesis and between a closing and an opening parenthesis. Multiplication (implicit or
 * explicit) is given a higher precedence than division, so J / m K is read as J / (m K).
 * Invalid input is reported via self::die(). The tokens passed in are not modified.
 *
 * @param array $tokens the tokens forming the expression that is to be translated
 * @return array the tokens in RPN order
 */
public static function unit_infix_to_rpn(array $tokens): array {
    $output = [];
    $opstack = [];

    $lasttoken = null;
    $lasttype = null;
    $lastvalue = null;
    foreach ($tokens as $token) {
        $type = $token->type;
        $value = $token->value;

        if (!is_null($lasttoken)) {
            $lasttype = $lasttoken->type;
            $lastvalue = $lasttoken->value;
        }

        // Insert implicit multiplication sign between two consecutive UNIT tokens, between a UNIT
        // token and a parenthesis or between a closing and an opening parenthesis.
        // For accurate error reporting, the row and column number of the implicit
        // multiplication token are copied over from the current token which triggered
        // the multiplication.
        $unitunit = ($lasttype === token::UNIT && $type === token::UNIT);
        $unitparen = ($lasttype === token::UNIT && $type === token::OPENING_PAREN);
        $parenunit = ($lasttype === token::CLOSING_PAREN && $type === token::UNIT);
        $parenparen = ($lasttype === token::CLOSING_PAREN && $type === token::OPENING_PAREN);
        if ($unitunit || $unitparen || $parenunit || $parenparen) {
            // For backwards compatibility, division will have a lower precedence than multiplication,
            // in order for J / m K to be interpreted as J / (m K). Instead of introducing a special
            // 'unit multiplication' pseudo-operator, we simply increase the multiplication's precedence
            // by one when flushing operators from the opstack.
            self::flush_higher_precedence($opstack, self::get_precedence('*') + 1, $output);
            $opstack[] = new token(token::OPERATOR, '*', $token->row, $token->column);
        }

        // Two consecutive operators are only possible if the unary minus follows exponentiation.
        // Note: We do not have to check whether the first of them is exponentiation, because we
        // only allow - in the exponent anyway.
        if ($type === token::OPERATOR && $lasttype === token::OPERATOR && $value !== '-') {
            self::die(get_string('error_unexpectedtoken', 'qtype_formulas', $value), $token);
        }

        switch ($type) {
            // UNIT tokens go straight to the output queue.
            case token::UNIT:
                $output[] = $token;
                break;

            // Numbers go to the output queue.
            case token::NUMBER:
                // If the last token was the unary minus, we multiply the number by -1 before
                // sending it to the output queue. Afterwards, we can remove the minus from the opstack.
                if ($lasttype === token::OPERATOR && $lastvalue === '-') {
                    // Negate a copy rather than the caller's token object; otherwise translating
                    // the same token list a second time would flip the sign back.
                    $token = clone $token;
                    $token->value = -$token->value;
                    array_pop($opstack);
                }
                $output[] = $token;
                break;

            // Opening parentheses go straight to the operator stack.
            case token::OPENING_PAREN:
                $opstack[] = $token;
                break;

            // A closing parenthesis means we flush all operators until we get to the
            // matching opening parenthesis.
            case token::CLOSING_PAREN:
                // A closing parenthesis must not occur immediately after an operator.
                if ($lasttype === token::OPERATOR) {
                    self::die(get_string('error_unexpectedtoken', 'qtype_formulas', $value), $token);
                }
                self::flush_until_paren($opstack, token::OPENING_PAREN, $output);
                break;

            // Deal with all the possible operators...
            case token::OPERATOR:
                // Expressions must not start with an operator.
                if (is_null($lasttoken)) {
                    self::die(get_string('error_unexpectedtoken', 'qtype_formulas', $value), $token);
                }
                // Operators must not follow an opening parenthesis, except for the unary minus.
                if ($lasttype === token::OPENING_PAREN && $value !== '-') {
                    self::die(get_string('error_unexpectedtoken', 'qtype_formulas', $value), $token);
                }
                // Before fetching the precedence, we must translate ^ (caret) into **, because
                // the ^ operator normally has a different meaning with lower precedence.
                if ($value === '^') {
                    $value = '**';
                }
                $thisprecedence = self::get_precedence($value);
                // We artificially increase the precedence of the multiplication operator (which
                // effectively lowers the one of the division), because legacy versions used
                // implicit parens around the denominator, e. g. the expression J / m K would be
                // interpreted as J / (m * K). This is consistent with what tools like Wolfram Alpha
                // do, even though e. g. 1 / 2 3 would be read as 3/2 both by Formulas Question and
                // Wolfram Alpha. And even if it were not, it is not possible to change that,
                // because it could break existing questions.
                if ($value === '*') {
                    $thisprecedence++;
                }
                // Flush operators with higher precedence, unless we have a unary minus, because
                // it is not left-associative.
                if ($value !== '-') {
                    self::flush_higher_precedence($opstack, $thisprecedence, $output);
                }
                // Put the operator on the stack.
                $opstack[] = $token;
                break;

            // If we still haven't dealt with the token, there must be a problem with the input.
            default:
                self::die(get_string('error_unexpectedtoken', 'qtype_formulas', $value), $token);

        }

        $lasttoken = $token;
    }
    // After last token, flush opstack. Last token must be either a number (in exponent),
    // a closing parenthesis or a unit.
    // NOTE(review): this condition is stated but not verified here; presumably a trailing
    // operator is rejected elsewhere -- confirm.
    self::flush_all($opstack, $output);
    return $output;
}
657+
531658
/**
532659
* Stop processing and indicate the human readable position (row/column) where the error occurred.
533660
*

classes/local/token.php

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,9 @@ class token {
123123
/** @var int used to designate a token storing an end-of-group marker (closing brace) */
124124
const END_GROUP = 4194304;
125125

126+
/** @var int used to designate a token storing a unit */
127+
const UNIT = 8388608;
128+
126129
/** @var mixed the token's content, will be the name for identifiers */
127130
public $value;
128131

0 commit comments

Comments
 (0)