
Nice to have a lexertl14::debug::dump(lrules) like parsertl14::debug::dump(grules) #10

Open
mingodad opened this issue Jul 30, 2023 · 5 comments

@mingodad
I started a possible implementation of lexertl14::debug::dump(lrules), but I'm having difficulty understanding the data structures well enough to dump them.

Any help is welcome!

        static void dump(const rules& rules_, ostream& stream_, bool asEbnfRR = false)
        {
            // Exclusive start states, flex style: "%x STATE1 STATE2"
            if(rules_.statemap().size() > 0)
            {
                stream_ << "%x";
                for (auto it = rules_.statemap().begin(); it != rules_.statemap().end(); ++it) {
                    stream_ << " " << it->first;
                }
                stream_ << "\n";
            }

            // Macro definitions: "name<TAB>..."
            if(rules_.macrosmap().size() > 0)
            {
                stream_ << "%%\n";
                for (auto it = rules_.macrosmap().begin(); it != rules_.macrosmap().end(); ++it) {
                    stream_ << it->first << "\t";
                    // TODO: dump the macro's token sequence; for now just
                    // print a counter per token as a placeholder.
                    size_t i = 0;
                    for (auto itt = it->second.begin(); itt != it->second.end(); ++itt) {
                        stream_ << ++i << " ";
                    }
                    stream_ << "\n";
                }
                stream_ << "\n%%\n";
            }

            // Rules section: placeholder, prints one index per regex.
            if(rules_.regexes().size() > 0)
            {
                size_t i = 0;
                for (auto it = rules_.regexes().begin(); it != rules_.regexes().end(); ++it) {
                    stream_ << " " << ++i << "\n";
                }
                stream_ << "\n";
            }
        }
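
To see the intended output shape without building against lexertl, the `%x` start-state line can be exercised standalone. The sketch below uses a plain `std::map` as a hypothetical stand-in for `rules_.statemap()` (the real lexertl container differs); it only illustrates the flex-style listing:

```cpp
#include <map>
#include <sstream>
#include <string>

// Hypothetical stand-in for rules_.statemap(): state name -> state id.
using state_map = std::map<std::string, std::size_t>;

// Emit the exclusive-state line ("%x A B") that dump() writes first,
// or an empty string when there are no user-defined states.
std::string dump_states(const state_map& states)
{
    std::ostringstream out;

    if (!states.empty())
    {
        out << "%x";

        for (const auto& [name, id] : states)
            out << ' ' << name;

        out << '\n';
    }

    return out.str();
}
```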
@mingodad (Author)

mingodad commented Aug 3, 2023

This is what I've got so far:

        static void dumpRegex(ostream& stream_, const typename rules::token_vector& tokens)
        {
            for (const auto& token : tokens) {
                using tok_type = lexertl::detail::token_type;
                switch(token._type)
                {
                    case tok_type::BEGIN: ; break;
                    case tok_type::REGEX: stream_ << "REGEX "; break;
                    case tok_type::OREXP: stream_ << "OREXP "; break;
                    case tok_type::SEQUENCE: stream_ << "SEQUENCE "; break;
                    case tok_type::SUB: stream_ << "SUB "; break;
                    case tok_type::EXPRESSION: stream_ << "EXPRESSION "; break;
                    case tok_type::REPEAT: stream_ << "REPEAT "; break;
                    case tok_type::DUP: stream_ << "DUP "; break;
                    case tok_type::OR: stream_ << "|"; break;
                    case tok_type::CHARSET:
                    {
                        stream_ << "[";
                        for(const auto& range: token._str._ranges)
                        {
                            stream_ << range.first;
                            if(range.first != range.second)
                            {
                                stream_ << "-" << range.second;
                            }
                        }
                        stream_ << "]";
                        break;
                    }
                    case tok_type::BOL: stream_ << "^"; break;
                    case tok_type::EOL: stream_ << "$"; break;
                    case tok_type::MACRO: stream_ << "{MACRO}"; break;
                    case tok_type::OPENPAREN: stream_ << "("; break;
                    case tok_type::CLOSEPAREN: stream_ << ")"; break;
                    case tok_type::OPT: stream_ << "?"; break;
                    case tok_type::AOPT: stream_ << "??"; break;
                    case tok_type::ZEROORMORE: stream_ << "*"; break;
                    case tok_type::AZEROORMORE: stream_ << "*?"; break;
                    case tok_type::ONEORMORE: stream_ << "+"; break;
                    case tok_type::AONEORMORE: stream_ << "+?"; break;
                    case tok_type::REPEATN: stream_ << "{REPEATN}"; break;
                    case tok_type::AREPEATN: stream_ << "{REPEATN}?"; break;
                    case tok_type::END: ; break;
                    case tok_type::DIFF: stream_ << "DIFF "; break;

                    default:
                         stream_ << " @^-_-^@ ";
                }
            }
        }

        static void dump(const rules& rules_, ostream& stream_, bool asEbnfRR = false)
        {
            if(rules_.statemap().size() > 0)
            {
                stream_ << "%x";
                for (auto it = rules_.statemap().begin(); it != rules_.statemap().end(); ++it) {
                    stream_ << " " << it->first;
                }
                stream_ << "\n";
            }

            if(rules_.macrosmap().size() > 0)
            {
                stream_ << "%%\n";
                for(const auto& [name, tokens]: rules_.macrosmap()) {
                    stream_ << name << "\t";
                    dumpRegex(stream_, tokens);
                    stream_ << "\n";
                }
                stream_ << "\n%%\n";
            }

            stream_ << "Rules.regexes = " << rules_.regexes().size() << "\n%%\n";
            if(rules_.regexes().size() > 0)
            {
                size_t i = 0;
                for(const auto& regex : rules_.regexes())
                {
                    stream_ << "* " << ++i << "\n";
                    size_t i2 = 0;
                    for(const auto& tok_vec : regex)
                    {
                        dumpRegex(stream_, tok_vec);
                        stream_ << "   " << ++i2 << "\n";
                    }
                }
                }
                stream_ << "\n";
            }
        }
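
The CHARSET branch of dumpRegex() above can also be checked in isolation. This sketch mirrors it over a plain vector of character ranges (a hypothetical stand-in for `token._str._ranges`), printing single characters bare and ranges as `first-last`:

```cpp
#include <sstream>
#include <string>
#include <utility>
#include <vector>

// Mirror of the CHARSET case in dumpRegex(): render a set of
// character ranges as a regex character class, e.g. "[a-z0-9]".
std::string dump_charset(const std::vector<std::pair<char, char>>& ranges)
{
    std::ostringstream out;

    out << '[';

    for (const auto& [first, last] : ranges)
    {
        out << first;

        // A single character is printed bare; a real range gets "first-last".
        if (first != last)
            out << '-' << last;
    }

    out << ']';
    return out.str();
}
```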

@BenHanson (Owner)

Current version: debug.txt

This version includes reinstatement of MACROs.

@mingodad (Author)

mingodad commented Oct 9, 2023

Hello @BenHanson, your suggested debug.txt doesn't handle `\\c` properly (it outputs `\\\c`); a possible fix:

        static void dump_charset(const string_token& in_token_, ostream& stream_)
        {
...
                case static_cast<char_type>('/'):
                case static_cast<char_type>('"'):
                    // always escape these inside the charset dump
                    stream_ << static_cast<char_type>('\\');
                    break;
                case static_cast<char_type>('\\'):
                    // escape '\\' only when it is a range endpoint, to avoid
                    // doubling the escape already applied elsewhere
                    if(range_.first != range_.second)
                    {
                        stream_ << static_cast<char_type>('\\');
                    }
                    break;
                default:
                    break;

Also, it doesn't dump DFA states other than INITIAL, and it doesn't dump the push states when they exist.

Have you made any progress on it?

@mingodad (Author)

With the commit mingodad/parsertl-playground@d131c5a I'm getting a dump that includes all lexer states with their exit states, and named terminals instead of `{return \d;}`.

@mingodad (Author)

There is a problem with escaping characters: it is done in several places and the escapes seem to overlap (mainly when calling string_token::escape_char() from dump_charset(const string_token& in_token_, ostream& stream_)).
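
One way to avoid overlapping escapes is to funnel every character through a single helper at the point of output, so no other layer adds backslashes. This is only a minimal sketch of that idea (a hypothetical `escape_for_dump`, not lexertl's `string_token::escape_char()`):

```cpp
#include <string>

// Escape a character exactly once, at output time. The in_charset flag
// exists because '-' and ']' only need escaping inside a character class.
// Hypothetical helper for illustration; not part of lexertl.
std::string escape_for_dump(char c, bool in_charset)
{
    switch (c)
    {
    case '\\': return "\\\\";
    case '\n': return "\\n";
    case '\t': return "\\t";
    case '-':
    case ']':
        // metacharacters only inside [...]
        return in_charset ? std::string("\\") + c : std::string(1, c);
    default:
        return std::string(1, c);
    }
}
```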
