diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 0000000..dd98f9e --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,21 @@ +version: 2 + +jobs: + build: + docker: + - image: swipl:stable + + steps: + - run: + # TODO Build custom image to improve build time + name: Install Deps + command: | + apt update -y + apt install git make -y + + - checkout + + - run: + name: Run tests + command: | + make test diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b25c15b --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*~ diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..08dd184 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,44 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog][keep-a-change-log], and this project +adheres to [Semantic Versioning][semantic-versioning]. + +[keep-a-change-log]: https://keepachangelog.com/en/1.0.0/ +[semantic-versioning]: https://semver.org/spec/v2.0.0.html + +## [unreleased] + +## [1.0.0] + +### Added + +- Support for numbers by [@Annipoo](https://github.com/Anniepoo) #34 +- Support for strings #37 +- Code of Conduct #23 + +### Changed + +- Spaces are now tagged with `space` instead of `spc` #41 +- Tokenization of numbers and strings is enabled by default #40 +- Options are now processed by a more conventional means #39 +- The location for the pack's home is updated + +## [0.1.2] + +Prior to changelog. + +## [0.1.1] + +Prior to changelog. + +## [0.1.0] + +Prior to changelog. + +[unreleased]: https://github.com/shonfeder/tokenize/compare/v1.0.0...HEAD +[1.0.0]: https://github.com/shonfeder/tokenize/compare/v0.1.2...v1.0.0 +[0.1.2]: https://github.com/shonfeder/tokenize/compare/v0.1.1...v0.1.2 +[0.1.1]: https://github.com/shonfeder/tokenize/compare/v0.1.0...v0.1.1 +[0.1.0]: https://github.com/shonfeder/tokenize/releases/tag/v0.1.0 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 87eda1c..d1ae63f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -5,20 +5,74 @@ reports, etc. ## Code of Conduct -Please review and accept to our [code of conduct](CODE_OF_CONDUCT.md) prior to +Please review and accept our [code of conduct](CODE_OF_CONDUCT.md) prior to engaging in the project. +## Overall direction and aims + +Consult the [`design_notes.md`](design_notes.md) to see the latest codified +design philosophy and principles. + ## Setting up Development -TODO +1. Install swi-prolog's [swipl](http://www.swi-prolog.org/download/stable). + - Optionally, you may wish to use [swivm](https://github.com/fnogatz/swivm) to + manage multiple installed versions of swi-prolog. +2. Hack on the source code in `[./prolog](./prolog)`. +3. Run and explore your changes by loading the file in `swipl` (or using your + editors IDE capabilities): + - Example in swipl + + ```prolog + # in ~/oss/tokenize on git:develop x [22:45:02] + $ cd ./prolog + + # in ~/oss/tokenize/prolog on git:develop x [22:45:04] + $ swipl + Welcome to SWI-Prolog (threaded, 64 bits, version 8.0.2) + SWI-Prolog comes with ABSOLUTELY NO WARRANTY. This is free software. + Please run ?- license. for legal details. + + For online help and background, visit http://www.swi-prolog.org + For built-in help, use ?- help(Topic). or ?- apropos(Word). + + % lod the tokenize module + ?- [tokenize]. + true. + + % experiment + ?- tokenize("Foo bar baz", Tokens). + Tokens = [word(foo), space(' '), word(bar), space(' '), word(baz)]. 
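+
+      % you can also pass an options list (illustrative query; cased(true)
+      % preserves the source casing)
+      ?- tokenize("Foo bar baz", Tokens, [cased(true)]).
+      Tokens = [word('Foo'), space(' '), word(bar), space(' '), word(baz)].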
+ + % reload the module when you make changes to the source code + ?- make. + % Updating index for library /usr/local/Cellar/swi-prolog/8.0.2/libexec/lib/swipl/library/ + true. + + % finished + ?- halt. + ``` + +Please ask here or in `##prolog` on [freenode](https://freenode.net/) if you +need any help! :) ## Running tests Tests are located in the [`./test`](./test) directory. To run the test suite, -simply execute the test file: +simply execute make test: ```sh -$ ./test/test.pl +$ make test +% PL-Unit: tokenize .. done +% All 2 tests passed +``` + +If inside the swipl repl, make sure to load the test file and query run_tests. + +```prolog +?- [test/test]. +?- run_tests. % PL-Unit: tokenize .. done % All 2 tests passed +true. ``` diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..044b64f --- /dev/null +++ b/Makefile @@ -0,0 +1,19 @@ +.PHONY: all test clean + +version := $(shell swipl -q -s pack -g 'version(V),writeln(V)' -t halt) +packfile = quickcheck-$(version).tgz + +SWIPL := swipl + +all: test + +version: + echo $(version) + +check: test + +install: + echo "(none)" + +test: + @$(SWIPL) -s test/test.pl -g 'run_tests,halt(0)' -t 'halt(1)' diff --git a/README.md b/README.md index 82ec7d1..47ac380 100644 --- a/README.md +++ b/README.md @@ -1,23 +1,30 @@ -# Synopsis +# `pack(tokenize) :-` + +A modest tokenization library for SWI-Prolog, seeking a balance between +simplicity and flexibility. + +[![CircleCI](https://circleci.com/gh/shonfeder/tokenize.svg?style=svg)](https://circleci.com/gh/shonfeder/tokenize) + +## Synopsis ```prolog ?- tokenize(`\tExample Text.`, Tokens). -Tokens = [cntrl('\t'), word(example), spc(' '), spc(' '), word(text), punct('.')] +Tokens = [cntrl('\t'), word(example), space(' '), space(' '), word(text), punct('.')] ?- tokenize(`\tExample Text.`, Tokens, [cntrl(false), pack(true), cased(true)]). -Tokens = [word('Example', 1), spc(' ', 2), word('Text', 1), punct('.', 1)] +Tokens = [word('Example', 1), space(' ', 2), word('Text', 1), punct('.', 1)] ?- tokenize(`\tExample Text.`, Tokens), untokenize(Tokens, Text), format('~s~n', [Text]). example text. -Tokens = [cntrl('\t'), word(example), spc(' '), spc(' '), word(text), punct('.')], -Text = [9, 101, 120, 97, 109, 112, 108, 101, 32|...] +Tokens = [cntrl('\t'), word(example), space(' '), space(' '), word(text), punct('.')], +Text = [9, 101, 120, 97, 109, 112, 108, 101, 32|...] ``` -# Description +## Description Module `tokenize` aims to provide a straightforward tool for tokenizing text into a simple format. It is the result of a learning exercise, and it is far from perfect. If there is sufficient interest from myself or anyone else, I'll try to improve it. -It is packaged as an SWI-Prolog pack, available [here](http://www.swi-prolog.org/pack/list?p=tokenize). Install it into your SWI-Prolog system with the query +It is packaged as an SWI-Prolog pack, available [here](http://www.swi-prolog.org/pack/list?p=tokenize). Install it into your SWI-Prolog system with the query ```prolog ?- pack_install(tokenize). @@ -25,6 +32,6 @@ It is packaged as an SWI-Prolog pack, available [here](http://www.swi-prolog.org Please [visit the wiki](https://github.com/aBathologist/tokenize/wiki/tokenize.pl-options-and-examples) for more detailed instructions and examples, including a full list of options supported. -# Contributing +## Contributing See [CONTRIBUTING.md](./CONTRIBUTING.md). 
diff --git a/comment-wip/README.md b/comment-wip/README.md new file mode 100644 index 0000000..c1c3fd9 --- /dev/null +++ b/comment-wip/README.md @@ -0,0 +1,4 @@ +WIP code towards tokenization of comments. + +It was extracted here because it's not ready for release, but we want to keep it +available for the author to resume work on it. diff --git a/comment-wip/comment.pl b/comment-wip/comment.pl new file mode 100644 index 0000000..cea7fd6 --- /dev/null +++ b/comment-wip/comment.pl @@ -0,0 +1,115 @@ +:- module(comment, + [comment//2, + comment_rec//2, + comment_token//3, + comment_token_rec//3]). + +/** Tokenizing comments +This module defines matchers for comments used by the tokenize module. (Note +that we will use matcher as a name for dcg rules that match parts of the codes +list). + +@author Stefan Israelsson Tampe +@license LGPL v2 or later + +Interface Note: +Start and End matchers is a matcher (dcg rule) that is either evaluated with no +extra argument (--> call(StartMatcher)) and it will just match it's token or it +can have an extra argument producing the codes matched by the matcher e.g. used +as --> call(StartMatcher,MatchedCodes). The matchers match start and end codes +of the comment, the 2matcher type will represent these kinds of dcg rules or +matchers 2 is because they support two kinds of arguments to the dcg rules. +For examples +see: + + @see tests/test_comments.pl + +The matchers predicates exported and defined are: + + comment(+Start:2matcher,+End:2matcher) + - anonymously match a non recursive comment + + comment_rec(+Start:2matcher,+End:2matcher,2matcher) + - anonymously match a recursive comment + + coment_token(+Start:2matcher,+End:2matcher,-Matched:list(codes)) + - match an unrecursive comment outputs the matched sequence used + for building a resulting comment token + + coment_token_rec(+Start:2matcher,+End:2matcher,-Matched:list(codes)) + - match an recursive comment outputs the matched sequence used + for building a resulting comment token +*/ + + + +%% comment(+Start:2matcher,+End:2matcher) +% non recursive non tokenizing matcher + +comment_body(E) --> call(E),!. +comment_body(E) --> [_],comment_body(E). + +comment(S,E) --> + call(S), + comment_body(E). + +%% comment_token(+Start:2matcher,+End:2matcher,-Matched:list(codes)) +% non recursive tokenizing matcher + +comment_body_token(E,Text) --> + call(E,HE),!, + {append(HE,[],Text)}. + +comment_body_token(E,[X|L]) --> + [X], + comment_body_token(E,L). + +comment_token(S,E,Text) --> + call(S,HS), + {append(HS,T,Text)}, + comment_body_token(E,T). + +%% comment_token_rec(+Start:2matcher,+End:2matcher,-Matched:list(codes)) +% recursive tokenizing matcher + +% Use this as the initial continuation, will just tidy up the matched result +% by ending the list with []. +comment_body_rec_start(_,_,[]). + +comment_body_token_rec(_,E,Cont,Text) --> + call(E,HE),!, + {append(HE,T,Text)}, + call(Cont,T). + +comment_body_token_rec(S,E,Cont,Text) --> + call(S,HS),!, + {append(HS,T,Text)}, + comment_body_token_rec(S,E,comment_body_token_rec(S,E,Cont),T). + +comment_body_token_rec(S,E,Cont,[X|L]) --> + [X], + comment_body_token_rec(S,E,Cont,L). + +comment_token_rec(S,E,Text) --> + call(S,HS), + {append(HS,T,Text)}, + comment_body_token_rec(S,E,comment_body_rec_start,T). + +%% comment_rec(+Start:2matcher,+End:2matcher) +% recursive non tokenizing matcher + +comment_body_rec(_,E) --> + call(E),!. + +comment_body_rec(S,E) --> + call(S),!, + comment_body_rec(S,E), + comment_body_rec(S,E). 
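+
+% In the clause above, a nested start delimiter recurses: the first recursive
+% call consumes the inner comment through its end delimiter, and the second
+% resumes the body of the enclosing comment.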
+ +comment_body_rec(S,E) --> + [_], + comment_body_rec(S,E). + +comment_rec(S,E) --> + call(S), + comment_body_rec(S,E). diff --git a/comment-wip/test_comments.pl b/comment-wip/test_comments.pl new file mode 100644 index 0000000..aa7f907 --- /dev/null +++ b/comment-wip/test_comments.pl @@ -0,0 +1,104 @@ +:- dynamic user:file_search_path/2. +:- multifile user:file_search_path/2. + +% Add the package source files relative to the current file location +:- prolog_load_context(directory, Dir), + atom_concat(Dir, '/../prolog', PackageDir), + asserta(user:file_search_path(package, PackageDir)). + +:- use_module(package(comment)). +:- begin_tests(tokenize_comment). + +id(X) --> {atom_codes(X,XX)},XX. +id(X,XX) --> {atom_codes(X,XX)},XX. + +mytest(Tok,S,U) :- + atom_codes(S,SS), + call_dcg(Tok,SS,U). + +test_comment(S) :- + mytest(comment(id('<'),id('>')),S,[]). + +test_comment_rec(S) :- + mytest(comment_rec(id('<'),id('>')),S,[]). + +test_comment_token(S,T) :- + mytest(comment_token(id('<'),id('>'),TT),S,[]), + atom_codes(T,TT). + +test_comment_token_rec(S,T) :- + mytest(comment_token_rec(id('<'),id('>'),TT),S,[]), + atom_codes(T,TT). + +start(AA) :- + ( + catch(b_getval(a,[N,A]),_,N=0) -> + true; + N=0 + ), + NN is N + 1, + ( + N == 0 -> + AA = _; + AA = A + ), + b_setval(a,[NN,AA]). + +end(A) :- + b_getval(a,[N,A]), + NN is N - 1, + b_setval(a,[NN,A]). + +left(A) --> + {atom_codes(A,AA)}, + AA, + {start(B)}, + [B]. + +left(A,C) --> + {atom_codes(A,AA)}, + AA, + {start(B)}, + [B], + {append(AA,[B],C)}. + +right(A) --> + {end(B)}, + [B], + {atom_codes(A,AA)}, + AA. + +right(A,C) --> + {end(B)}, + [B], + {atom_codes(A,AA)}, + AA, + {append([B],AA,C)}. + +test_adapt(S,T) :- + mytest(comment_token_rec(left('<'),right('>'),TT),S,[]), + atom_codes(T,TT). + + +:- multifile test/2. + +test('Test comment',[true(test_comment(''))]) :- true. +test('Test comment_rec',[true(test_comment_rec('>'))]) :- true. +test('Test comment_token',[true(A == B)]) :- + A='', + test_comment_token(A,B). + +test('Test comment_token_rec',[true(A == B)]) :- + A='>', + test_comment_token(A,B). + +test('Test comment_token_rec advanced 1',[true(A == B)]) :- + A='<1 alla2> <1 balla2> 1>1>', + test_adapt(A,B). + +test('Test comment_token_rec advanced 2',[true(A == B)]) :- + A='<2 alla1> <2 balla1> 2>2>', + test_adapt(A,B). + + +:- end_tests(tokenize_comment). diff --git a/design_notes.md b/design_notes.md new file mode 100644 index 0000000..e84fade --- /dev/null +++ b/design_notes.md @@ -0,0 +1,45 @@ +# Design Notes + +Initially extracted from conversation with +[@Annieppo](https://github.com/Anniepoo) and [@nicoabie](https://github.com/nicoabie) in +##prolog on [freenode](https://freenode.net/). + +The library started as a very simple and lightweight set of predicates for a +common, but very limited, form of lexing. As we extend it, we aim to maintain a +modest scope in order to achieve a sweet spot between ease of use and powerful +flexibility. + +## Scope and Aims + +`tokenize` does not aspire to become an industrial strength lexer generator. We +aim to serve most users needs between raw input and a structured form ready for +parsing by a DCG. + +If a user is parsing a language with keywords such as `class`, `module`, etc., +and wants to distinguish these from variable names, `tokenize` isn't going to +give you this out of the box. But, it should provide an easy means of achieving +this result through a subsequent lexing pass. 
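+
+For example, such a pass could be a plain walk over the token list (an
+illustrative sketch only; `keyword/1` and `keywordize/2` are not part of this
+library):
+
+```prolog
+keyword(class).
+keyword(module).
+
+% Promote word tokens that name keywords; leave all other tokens untouched.
+keywordize([], []).
+keywordize([word(W)|Ts], [keyword(W)|Rs]) :- keyword(W), !, keywordize(Ts, Rs).
+keywordize([T|Ts], [T|Rs]) :- keywordize(Ts, Rs).
+```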
+ +## Some Model Users + +* somebody making a computer language + * needs to be able to distinguish keywords, variables and literals + * needs to be able to identify comments +* somebody making a parser for an interactive fiction game + * needs to handle stuff like "William O. N'mutu-O'Connell went to the market" +* somebody wanting to analyze human texts + * wanting to do some analysis on New York Times articles, they want to first + process the articles into meaningful tokens + +## Design Rules + +* We don't parse. +* Every token generated is callable (i.e., an atom or compound). + * Example of an possible compound token: `space(' ')`. + * Example of a possible atom token: `escape`. + tokenization need to return tokens represented with the same arity) +* Users should be able to determine the kind of token by unification. +* Users should be able to clearly see and specify the precedence for tokenizaton + * E.g., given `"-12.3"`, `numbers, punctuation` should yield `[pnct('-'), + number(12), pnct('.'), number(3)]` while `punctuation, numbers` should yield + `[number(-12.3)]`. diff --git a/pack.pl b/pack.pl index 174019f..68438aa 100644 --- a/pack.pl +++ b/pack.pl @@ -1,10 +1,10 @@ name(tokenize). -title('A nascent tokenization library'). +title('A simple tokenization library'). -version('0.1.2'). -download('https://github.com/aBathologist/tokenize/release/*.zip'). +version('1.0.0'). +download('https://github.com/shonfeder/tokenize/release/*.zip'). author('Shon Feder', 'shon.feder@gmail.com'). packager('Shon Feder', 'shon.feder@gmail.com'). maintainer('Shon Feder', 'shon.feder@gmail.com'). -home('https://github.com/aBathologist/tokenize'). +home('https://github.com/shonfeder/tokenize'). diff --git a/prolog/tokenize.pl b/prolog/tokenize.pl index a177bf9..6923d64 100644 --- a/prolog/tokenize.pl +++ b/prolog/tokenize.pl @@ -25,6 +25,12 @@ */ +:- use_module(library(dcg/basics), [eos//0, number//1]). +:- use_module(tokenize_opts). + +% Ensure we interpret back ticks as enclosing code lists in this module. +:- set_prolog_flag(back_quotes, codes). + %% tokenize(+Text:list(code), -Tokens:list(term)) is semidet. % % @see tokenize/3 is called with an empty list of options: thus, with defaults. @@ -47,23 +53,33 @@ % * a word (contiguous alpha-numeric chars): `word(W)` % * a punctuation mark (determined by `char_type(C, punct)`): `punct(P)` % * a control character (determined by `char_typ(C, cntrl)`): `cntrl(C)` -% * a space ( == ` `): `spc(S)`. +% * a space ( == ` `): `space(S)`. % -% Valid options are: +% Valid options are: % -% * cased(+bool) : Determines whether tokens perserve cases of the source text. -% * spaces(+bool) : Determines whether spaces are represted as tokens or discarded. -% * cntrl(+bool) : Determines whether control characters are represented as tokens or discarded. -% * punct(+bool) : Determines whether punctuation characters are represented as tokens or discarded. -% * to(+on_of([strings,atoms,chars,codes])) : Determines the representation format used for the tokens. +% * cased(+bool) : Determines whether tokens perserve cases of the source +% text. +% * spaces(+bool) : Determines whether spaces are represted as tokens or +% discarded. +% * cntrl(+bool) : Determines whether control characters are represented as +% tokens or discarded. +% * punct(+bool) : Determines whether punctuation characters are represented +% as tokens or discarded. % * pack(+bool) : Determines whether tokens are packed or repeated. 
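+% * numbers(+bool) : Determines whether numeric text is tokenized as
+%   `number(N)` terms (`true` by default).
+% * strings(+bool) : Determines whether double-quoted text is tokenized as
+%   `string(S)` terms (`true` by default).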
+% * to(+one_of([strings,atoms,chars,codes])) : Determines the representation +% format used for the tokens. -% TODO is it possible to achieve the proper semidet without the cut? +% TODO is it possible to achieve the proper semidet without the cut? +% Annie sez some parses are ambiguous, not even sure the cut should be +% there -tokenize(Text, Tokens, Options) :- +tokenize(Text, ProcessedTokens, Options) :- must_be(nonvar, Text), string_codes(Text, Codes), - phrase(process_options, [Options-Codes], [Options-Tokens]), + process_options(Options, PreOpts, TokenOpts, PostOpts), + preprocess(PreOpts, Codes, ProcessedCodes), + phrase(tokens(TokenOpts, Tokens), ProcessedCodes), + postprocess(PostOpts, Tokens, ProcessedTokens), !. %% untokenize(+Tokens:list(term), -Untokens:list(codes)) is semidet. @@ -112,104 +128,62 @@ read_file_to_codes(File, Codes, [encoding(utf8)]), tokenize(Codes, Tokens, Options). -% PROCESSING OPTIONS -% -% NOTE: This way of processing options is probably stupid. -% I will correct/improve/rewrite it if there is ever a good -% reason to. But for now, it works. -% -% TODO: Throw exception if invalid options are passed in. -% At the moment it just fails. - -%% Dispatches dcgs by option-list functors, with default values. -process_options --> - opt(cased, false), - non_opt(tokenize_text), - opt(spaces, true), - opt(cntrl, true), - opt(punct, true), - opt(to, atoms), - opt(pack, false). - -%% opt(+OptionFunctor:atom, DefaultValue:nonvar) -% -% If dcg functor is identical to the option name with 'opt_' prefixed, -% then the dcg functor can be omitted. - -opt(Opt, Default) --> - { atom_concat('opt_', Opt, Opt_DCG) }, - opt(Opt, Default, Opt_DCG). - -%% opt(+OptionFunctor:atom, +DefaultValue:nonvar, +DCGFunctor:atom). -opt(Opt, Default, DCG) --> - state(Opts-Text0, Text0), - { - pad(Opt, Selection, Opt_Selection), - option(Opt_Selection, Opts, Default), - DCG_Selection =.. [DCG, Selection] - }, - DCG_Selection, - state(Text1, Opts-Text1). -%% This ugly bit should be dispensed with... -opt(Opt, Default, _) --> - state(Opts-_), - { - var(Default), \+ option(Opt, Opts), - writeln("Unknown options passed to opt//3: "), - write(Opt) - }. - -%% non_opt(+DCG). -% -% Non optional dcg to dispatch. Passes the object of concern -% without the options list, then recovers option list. - -non_opt(DCG) --> - state(Opts-Text0, Text0), - DCG, - state(Text1, Opts-Text1). - -state(S0), [S0] --> [S0]. -state(S0, S1), [S1] --> [S0]. - -%% Dispatching options: - -opt_cased(true) --> []. -opt_cased(false) --> state(Text, LowerCodes), - { - text_to_string(Text, Str), - string_lower(Str, LowerStr), - string_codes(LowerStr, LowerCodes) - }. - -tokenize_text --> state(Text, Tokenized), - { phrase(tokens(Tokenized), Text) }. - -opt_spaces(true) --> []. -opt_spaces(false) --> state(T0, T1), - { exclude( =(spc(_)), T0, T1) }. - -opt_cntrl(true) --> []. -opt_cntrl(false) --> state(T0, T1), - { exclude( =(cntrl(_)), T0, T1) }. - -opt_punct(true) --> []. -opt_punct(false) --> state(T0, T1), - { exclude( =(punct(_)), T0, T1) }. -opt_to(codes) --> []. -opt_to(Type) --> state(CodeTokens, Tokens), - { maplist(token_to(Type), CodeTokens, Tokens) }. +/*********************************** +* {PRE,POST}-PROCESSING HELPERS * +***********************************/ -opt_pack(false) --> []. -opt_pack(true) --> state(T0, T1), - { phrase(pack_tokens(T1), T0) }. - - - -%% POST PROCESSING - -%% Convert tokens to alternative representations. 
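+
+% preprocess/3 applies code-level transformations (currently just case folding)
+% to the input before tokenization; postprocess/3 then filters, converts, and
+% optionally packs the tokens that tokenization produced.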
+preprocess(PreOpts, Codes, ProcessedCodes) :- + preopts_data(cased, PreOpts, Cased), + DCG_Rules = ( + preprocess_case(Cased) + ), + phrase(process_dcg_rules(DCG_Rules, ProcessedCodes), Codes). + +postprocess(PostOpts, Tokens, ProcessedTokens) :- + postopts_data(spaces, PostOpts, Spaces), + postopts_data(cntrl, PostOpts, Cntrl), + postopts_data(punct, PostOpts, Punct), + postopts_data(to, PostOpts, To), + postopts_data(pack, PostOpts, Pack), + DCG_Rules = ( + keep_token(space(_), Spaces), + keep_token(cntrl(_), Cntrl), + keep_token(punct(_), Punct), + convert_token(To) + ), + phrase(process_dcg_rules(DCG_Rules, PrePackedTokens), Tokens), + (Pack + -> phrase(pack_tokens(ProcessedTokens), PrePackedTokens) + ; ProcessedTokens = PrePackedTokens + ). + + +/*********************************** +* POSTPROCESSING HELPERS * +***********************************/ + +% Process a stream through a pipeline of DCG rules +process_dcg_rules(_, []) --> eos, !. +process_dcg_rules(DCG_Rules, []) --> DCG_Rules, eos, !. +process_dcg_rules(DCG_Rules, [C|Cs]) --> + DCG_Rules, + [C], + process_dcg_rules(DCG_Rules, Cs). + +preprocess_case(true), [C] --> [C]. +preprocess_case(false), [CodeOut] --> [CodeIn], + { to_lower(CodeIn, CodeOut) }. + +keep_token(_, true), [T] --> [T]. +keep_token(Token, false) --> [Token]. +keep_token(Token, false), [T] --> [T], {T \= Token}. + +convert_token(Type), [Converted] --> [Token], + {token_to(Type, Token, Converted)}. + +% Convert tokens to alternative representations. +token_to(_, number(X), number(X)) :- !. token_to(Type, Token, Converted) :- ( Type == strings -> Conversion = inverse(string_codes) ; Type == atoms -> Conversion = inverse(atom_codes) @@ -218,9 +192,7 @@ ), call_into_term(Conversion, Token, Converted). - -%% Packing repeating tokens -% +% Packing repeating tokens pack_tokens([T]) --> pack_token(T). pack_tokens([T|Ts]) --> pack_token(T), pack_tokens(Ts). @@ -228,37 +200,73 @@ pack(X, Count) --> [X], pack(X, 1, Count). -pack(_, Total, Total) --> call(eos). +pack(_, Total, Total) --> eos. pack(X, Total, Total), [Y] --> [Y], { Y \= X }. pack(X, Count, Total) --> [X], { succ(Count, NewCount) }, pack(X, NewCount, Total). +/************************** +* TOKENIZATION * +**************************/ + +tokenize_text --> state(Text, Tokenized), + { phrase(tokens(Tokenized), Text) }. + % PARSING -tokens([T]) --> token(T), call(eos), !. -tokens([T|Ts]) --> token(T), tokens(Ts). +tokens(Opts, [T]) --> token(Opts, T), eos, !. +tokens(Opts, [T|Ts]) --> token(Opts, T), tokens(Opts, Ts). % NOTE for debugging % tokens(_) --> {length(L, 200)}, L, {format(L)}, halt, !. -token(word(W)) --> word(W), call(eos), !. -token(word(W)),` ` --> word(W), ` `. -token(word(W)), C --> word(W), (punct(C) ; cntrl(C) ; nasciis(C)). -token(spc(S)) --> spc(S). -token(punct(P)) --> punct(P). -token(cntrl(C)) --> cntrl(C). -token(other(O)) --> nasciis(O). +token(Opts, string(S)) --> + { tokenopts_data(strings, Opts, true) }, + string(S). + +token(Opts, number(N)) --> + { tokenopts_data(numbers, Opts, true) }, + number(N), !. +token(_Opts, word(W)) --> word(W), eos, !. +token(_Opts, word(W)),` ` --> word(W), ` `. +token(_Opts, word(W)), C --> word(W), (punct(C) ; cntrl(C) ; nasciis(C)). -spc(` `) --> ` `. +token(_Opts, space(S)) --> space(S). +token(_Opts, punct(P)) --> punct(P). +token(_Opts, cntrl(C)) --> cntrl(C). +token(_Opts, other(O)) --> nasciis(O). + +space(` `) --> ` `. sep --> ' '. -sep --> call(eos), !. +sep --> eos, !. word(W) --> csyms(W). 
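+
+% word//1 matches one or more `csym` codes (letters, digits, or the underscore).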
+% TODO Make open and close brackets configurable +string(S) --> string(`"`, `"`, S). +string(OpenBracket, CloseBracket, S) --> string_start(OpenBracket, CloseBracket, S). + +% A string starts when we encounter an OpenBracket +string_start(OpenBracket, CloseBracket, Cs) --> + OpenBracket, string_content(OpenBracket, CloseBracket, Cs). + +% String content is everything up until we hit a CloseBracket +string_content(_OpenBracket, CloseBracket, []) --> CloseBracket, !. +% String content includes a bracket following an escape, but not the escape +string_content(OpenBracket, CloseBracket, [C|Cs]) --> + escape, (CloseBracket | OpenBracket), + {[C] = CloseBracket}, + string_content(OpenBracket, CloseBracket, Cs). +% String content includes any character that isn't a CloseBracket or an escape. +string_content(OpenBracket, CloseBracket, [C|Cs]) --> + [C], + {[C] \= CloseBracket}, + string_content(OpenBracket, CloseBracket, Cs). + csyms([L]) --> csym(L). csyms([L|Ls]) --> csym(L), csyms(Ls). @@ -266,7 +274,7 @@ % non ascii's -nasciis([C]) --> nascii(C), (call(eos), !). +nasciis([C]) --> nascii(C), eos, !. nasciis([C]),[D] --> nascii(C), [D], {D < 127}. nasciis([C|Cs]) --> nascii(C), nasciis(Cs). @@ -275,6 +283,9 @@ ' ' --> space. ' ' --> space, ' '. +escape --> `\\`. + +% Any ... --> []. ... --> [_], ... . @@ -283,8 +294,6 @@ punct([P]) --> [P], {code_type(P, punct)}. cntrl([C]) --> [C], {code_type(C, cntrl)}. -eos([], []). - %% move to general module codes_to_lower([], []). diff --git a/prolog/tokenize_opts.pl b/prolog/tokenize_opts.pl new file mode 100644 index 0000000..688077e --- /dev/null +++ b/prolog/tokenize_opts.pl @@ -0,0 +1,40 @@ +:- module(tokenize_opts, + [process_options/4, + preopts_data/3, + tokenopts_data/3, + postopts_data/3]). + +:- use_module(library(record)). + +% pre-processing options +:- record preopts( + cased:boolean=false + ). + +% tokenization options +:- record tokenopts( + numbers:boolean=true, + strings:boolean=true + ). + +% post-processing options +:- record postopts( + spaces:boolean=true, + cntrl:boolean=true, + punct:boolean=true, + to:oneof([strings,atoms,chars,codes])=atoms, + pack:boolean=false + ). + +%% process_options(+Options:list(term), -PreOpts:term, -PostOpts:term) is semidet. +% +process_options(Options, PreOpts, TokenOpts, PostOpts) :- + make_preopts(Options, PreOpts, Rest0), + make_postopts(Rest0, PostOpts, Rest1), + make_tokenopts(Rest1, TokenOpts, InvalidOpts), + throw_on_invalid_options(InvalidOpts). + +throw_on_invalid_options(InvalidOpts) :- + InvalidOpts \= [] + -> throw(invalid_options_given(InvalidOpts)) + ; true. diff --git a/test/test.pl b/test/test.pl index 49b1857..9e17e36 100755 --- a/test/test.pl +++ b/test/test.pl @@ -1,18 +1,3 @@ -#!/usr/bin/env swipl -/** Unit tests for the tokenize library - * - * To run these tests, execute this file - * - * ./test/test.pl - */ - -:- initialization(main, main). - -main(_Argv) :- - run_tests. - -:- begin_tests(tokenize). - :- dynamic user:file_search_path/2. :- multifile user:file_search_path/2. @@ -22,21 +7,149 @@ asserta(user:file_search_path(package, PackageDir)). :- use_module(package(tokenize)). +:- use_module(package(tokenize_opts)). -% TESTS START HERE +:- begin_tests(tokenize). test('Hello, Tokenize!', [true(Actual == Expected)] ) :- tokenize("Hello, Tokenize!", Actual), - Expected = [word(hello),punct(','),spc(' '),word(tokenize),punct(!)]. + Expected = [word(hello),punct(','),space(' '),word(tokenize),punct(!)]. 
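+
+% Illustrative test of the pack(true) option (cf. the README example): runs of
+% identical tokens collapse into Token(Value, Count) pairs.
+test('Packs repeated tokens',
+     [true(Actual == Expected)]
+    ) :-
+    tokenize("aa  bb", Actual, [pack(true)]),
+    Expected = [word(aa, 1), space(' ', 2), word(bb, 1)].
+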
test('Goodbye, Tokenize!', [true(Actual == Expected)] ) :- - Tokens = [word('Goodbye'),punct(','),spc(' '),word('Tokenize'),punct('!')], + Tokens = [word('Goodbye'),punct(','),space(' '),word('Tokenize'),punct('!')], untokenize(Tokens, Codes), string_codes(Actual, Codes), Expected = "Goodbye, Tokenize!". + +% OPTION PROCESSING + +test('process_options/4 throws on invalid options') :- + catch( + process_options([invalid(true)], _, _, _), + invalid_options_given([invalid(true)]), + true + ). + +test('process_options/4 sets valid options in opt records') :- + Options = [ + cased(false), % non-default preopt + strings(false), % non-default tokenopt + spaces(false) % non-default postopt + ], + process_options(Options, PreOpts, TokenOpts, PostOpts), + % Fetch the options that were set + preopts_data(cased, PreOpts, Cased), + tokenopts_data(strings, TokenOpts, Strings), + postopts_data(spaces, PostOpts, Spaces), + % These compounds are just ensure informative output on failure + assertion(cased:Cased == cased:false), + assertion(strings:Strings == strings:false), + assertion(spaces:Spaces == spaces:false). + +% NUMBERS + +test('tokenize 7.0', + [true(Actual == Expected)] + ) :- + tokenize("7.0", Actual), + Expected = [number(7.0)]. + +test('untokenize 6.3', + [true(Actual == Expected)] + ) :- + untokenize([number(6.3)], Actual), + Expected = `6.3`. + +test('tokenize number in other stuff', + [true(Actual == Expected)] + ) :- + tokenize("hi 7.0 x", Actual), + Expected = [word(hi), space(' '), number(7.0), space(' '), word(x)]. + +test('untokenize 6.3 in other stuff', + [true(Actual == Expected)] + ) :- + untokenize([word(hi), number(6.3)], Actual), + Expected = `hi6.3`. + +test('can disable number tokens', + [true(Actual == Expected)] + ) :- + tokenize("hi 7.0 x", Actual, [numbers(false)]), + Expected = [word(hi), space(' '), word('7'), punct('.'), word('0'), space(' '), word(x)]. + + +% STRINGS + +test('Tokenizing the empty strings', + [true(Actual == Expected)] + ) :- + tokenize(`""`, Actual), + Expected = [string('')]. + +test('Untokenizing an empty string', + [true(Actual == Expected)] + ) :- + untokenize([string('')], Actual), + Expected = `""`. + +test('Tokenizing a string with just two escapes', + [true(Actual == Expected)] + ) :- + tokenize(`"\\\\"`, Actual), + Expected = [string('\\\\')]. + +test('Untokenizing a string with just two characters', + [true(Actual == Expected)] + ) :- + untokenize([string('aa')], Actual), + Expected = `"aa"`. + +test('Extracts a string', + [true(Actual == Expected)] + ) :- + tokenize(`"a string"`, Actual), + Expected = [string('a string')]. + +test('Extracts a string among other stuff', + [true(Actual == Expected)] + ) :- + tokenize(`Some other "a string" stuff`, Actual), + Expected = [word(some),space(' '),word(other),space(' '),string('a string'),space(' '),word(stuff)]. + +test('Extracts a string that includes escaped brackets', + [true(Actual == Expected)] + ) :- + tokenize(`"a \\"string\\""`, Actual), + Expected = [string('a "string"')]. + +test('Tokenization preserves escaped characters', + [true(Actual == Expected)] + ) :- + tokenize(`"\\tLine text\\n"`, Actual), + Expected = [string('\\tline text\\n')]. + +test('Extracts a string that includes a doubly nested string', + [true(Actual == Expected)] + ) :- + tokenize(`"a \\"sub \\\\"string\\\\"\\""`, Actual), + Expected = [string('a "sub \\"string\\""')]. 
+ +test('can disable string tokens', [true(Actual == Expected)] ) :- + tokenize(`some "string".`, Actual, [strings(false)]), + Expected = [word(some), space(' '), punct('"'), word(string), punct('"'), punct('.')]. + +test('Untokenizes string things', [true(Actual == Expected)] ) :- + untokenize([string('some string')], Actual), + Expected = `"some string"`. + :- end_tests(tokenize).