diff --git a/CHANGELOG.md b/CHANGELOG.md index 19d8e22f5..b1518f920 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -48,6 +48,7 @@ also non string parameters (e.g. `Enum.join([1, 2], ",")` can be written directly to a buffer. - Support for `erlang:list_to_integer/2` - Add `externalterm_to_term_copy` that can be safely used from NIFs taking temporary buffers +- Add stub of `erl_features` and copy `erl_scan`, `erl_anno` modules from OTP ### Changed diff --git a/libs/estdlib/src/CMakeLists.txt b/libs/estdlib/src/CMakeLists.txt index ed0c11452..b16b32182 100644 --- a/libs/estdlib/src/CMakeLists.txt +++ b/libs/estdlib/src/CMakeLists.txt @@ -28,6 +28,9 @@ set(ERLANG_MODULES calendar code crypto + erl_anno + erl_features + erl_scan erts_debug gen_event gen_server diff --git a/libs/estdlib/src/erl_anno.erl b/libs/estdlib/src/erl_anno.erl new file mode 100644 index 000000000..46cce9aa0 --- /dev/null +++ b/libs/estdlib/src/erl_anno.erl @@ -0,0 +1,474 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 1996-2024. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%% +%% %CopyrightEnd% +%% + +% +% This is shrinked down version of OTP erl_anno module +% + +-module(erl_anno). + + +-export([new/1, is_anno/1]). +-export([column/1, end_location/1, file/1, generated/1, + line/1, location/1, record/1, text/1]). +-export([set_file/2, set_generated/2, set_line/2, set_location/2, + set_record/2, set_text/2]). + +%% To be used when necessary to avoid Dialyzer warnings. +-export([to_term/1, from_term/1]). + +-export_type([anno/0, line/0, column/0, location/0, text/0]). + +-export_type([anno_term/0]). + +-define(LN(L), is_integer(L), L >= 0). +-define(COL(C), (is_integer(C) andalso C >= 1)). + +%% Location. +-define(LCOLUMN(C), ?COL(C)). +-define(LLINE(L), ?LN(L)). + +%% Debug: define DEBUG to make sure that annotations are handled as an +%% opaque type. Note that all abstract code need to be compiled with +%% DEBUG=true. See also ./erl_pp.erl and ./erl_parse.yrl. + +%-define(DEBUG, true). + +-type annotation() :: {'file', filename()} + | {'generated', generated()} + | {'location', location()} + | {'record', record()} + | {'text', string()}. + +-ifdef(DEBUG). +-opaque anno() :: [annotation(), ...]. +-else. +-doc "A collection of annotations.". +-opaque anno() :: location() | [annotation(), ...]. +-endif. +-doc """ +The term representing a collection of annotations. It is either a `t:location/0` +or a list of key-value pairs. +""". +-type anno_term() :: term(). + +-type column() :: pos_integer(). +-type generated() :: boolean(). +-type filename() :: string(). +-type line() :: non_neg_integer(). +-type location() :: line() | {line(), column()}. +-type record() :: boolean(). +-type text() :: string(). + +-ifdef(DEBUG). +%% Anything 'false' accepted by the compiler. +-define(ALINE(A), is_reference(A)). +-define(ACOLUMN(A), is_reference(A)). +-else. +-define(ALINE(L), ?LN(L)). +-define(ACOLUMN(C), ?COL(C)). +-endif. + +-doc """ +Returns the term representing the annotations Anno. + +See also [from_term()](`from_term/1`). +""". +-doc(#{since => <<"OTP 18.0">>}). +-spec to_term(Anno) -> anno_term() when + Anno :: anno(). + +-ifdef(DEBUG). +to_term(Anno) -> + simplify(Anno). +-else. +to_term(Anno) -> + Anno. +-endif. + +-doc """ +Returns annotations with representation Term. + +See also [to_term()](`to_term/1`). +""". +-doc(#{since => <<"OTP 18.0">>}). +-spec from_term(Term) -> Anno when + Term :: anno_term(), + Anno :: anno(). + +-ifdef(DEBUG). +from_term(Term) when is_list(Term) -> + Term; +from_term(Line) when is_integer(Line), Line < 0 -> % Before OTP 19 + set_generated(true, new(-Line)); +from_term(Term) -> + [{location, Term}]. +-else. +from_term(Line) when is_integer(Line), Line < 0 -> % Before OTP 19 + set_generated(true, new(-Line)); +from_term(Term) -> + Term. +-endif. + +-doc "Creates a new collection of annotations given a location.". +-doc(#{since => <<"OTP 18.0">>}). +-spec new(Location) -> anno() when + Location :: location(). + +new(Line) when ?LLINE(Line) -> + new_location(Line); +new({Line, Column}=Loc) when ?LLINE(Line), ?LCOLUMN(Column) -> + new_location(Loc); +new(Term) -> + erlang:error(badarg, [Term]). + +-ifdef(DEBUG). +new_location(Location) -> + [{location, Location}]. +-else. +new_location(Location) -> + Location. +-endif. + +-doc "Returns `true` if Term is a collection of annotations, otherwise `false`.". +-doc(#{since => <<"OTP 18.0">>}). +-spec is_anno(Term) -> boolean() when + Term :: any(). + +is_anno(Line) when ?ALINE(Line) -> + true; +is_anno({Line, Column}) when ?ALINE(Line), ?ACOLUMN(Column) -> + true; +is_anno(Anno) -> + (Anno =/= [] andalso + is_anno1(Anno) andalso + lists:keymember(location, 1, Anno)). + +is_anno1([{Item, Value}|Anno]) -> + is_anno2(Item, Value) andalso is_anno1(Anno); +is_anno1(A) -> + A =:= []. + +is_anno2(location, Line) when ?LN(Line) -> + true; +is_anno2(location, {Line, Column}) when ?LN(Line), ?COL(Column) -> + true; +is_anno2(generated, true) -> + true; +is_anno2(file, Filename) -> + is_filename(Filename); +is_anno2(record, true) -> + true; +is_anno2(text, Text) -> + is_string(Text); +is_anno2(_, _) -> + false. + +is_filename(T) -> + is_list(T) orelse is_binary(T). + +is_string(T) -> + is_list(T). + +-doc "Returns the column of the annotations Anno.". +-doc(#{since => <<"OTP 18.0">>}). +-spec column(Anno) -> column() | 'undefined' when + Anno :: anno(). + +column({Line, Column}) when ?ALINE(Line), ?ACOLUMN(Column) -> + Column; +column(Line) when ?ALINE(Line) -> + undefined; +column(Anno) -> + case location(Anno) of + {_Line, Column} -> + Column; + _Line -> + undefined + end. + +-doc """ +Returns the end location of the text of the annotations Anno. If there is no +text, `undefined` is returned. +""". +-doc(#{since => <<"OTP 18.0">>}). +-spec end_location(Anno) -> location() | 'undefined' when + Anno :: anno(). + +end_location(Anno) -> + case text(Anno) of + undefined -> + undefined; + Text -> + case location(Anno) of + {Line, Column} -> + end_location(Text, Line, Column); + Line -> + end_location(Text, Line) + end + end. + +-doc """ +Returns the filename of the annotations Anno. If there is no filename, +`undefined` is returned. +""". +-doc(#{since => <<"OTP 18.0">>}). +-spec file(Anno) -> filename() | 'undefined' when + Anno :: anno(). + +file(Line) when ?ALINE(Line) -> + undefined; +file({Line, Column}) when ?ALINE(Line), ?ACOLUMN(Column) -> + undefined; +file(Anno) -> + anno_info(Anno, file). + +-doc """ +Returns `true` if annotations Anno is marked as generated. The default is to +return `false`. +""". +-doc(#{since => <<"OTP 18.0">>}). +-spec generated(Anno) -> generated() when + Anno :: anno(). + +generated(Line) when ?ALINE(Line) -> + false; +generated({Line, Column}) when ?ALINE(Line), ?ACOLUMN(Column) -> + false; +generated(Anno) -> + anno_info(Anno, generated, false). + +-doc "Returns the line of the annotations Anno.". +-doc(#{since => <<"OTP 18.0">>}). +-spec line(Anno) -> line() when + Anno :: anno(). + +line(Anno) -> + case location(Anno) of + {Line, _Column} -> + Line; + Line -> + Line + end. + +-doc "Returns the location of the annotations Anno.". +-doc(#{since => <<"OTP 18.0">>}). +-spec location(Anno) -> location() when + Anno :: anno(). + +location(Line) when ?ALINE(Line) -> + Line; +location({Line, Column}=Location) when ?ALINE(Line), ?ACOLUMN(Column) -> + Location; +location(Anno) -> + anno_info(Anno, location). + +-doc false. +-spec record(Anno) -> record() when + Anno :: anno(). + +record(Line) when ?ALINE(Line) -> + false; +record({Line, Column}) when ?ALINE(Line), ?ACOLUMN(Column) -> + false; +record(Anno) -> + anno_info(Anno, record, false). + +-doc """ +Returns the text of the annotations Anno. If there is no text, `undefined` is +returned. +""". +-doc(#{since => <<"OTP 18.0">>}). +-spec text(Anno) -> text() | 'undefined' when + Anno :: anno(). + +text(Line) when ?ALINE(Line) -> + undefined; +text({Line, Column}) when ?ALINE(Line), ?ACOLUMN(Column) -> + undefined; +text(Anno) -> + anno_info(Anno, text). + +-doc "Modifies the filename of the annotations Anno.". +-doc(#{since => <<"OTP 18.0">>}). +-spec set_file(File, Anno) -> Anno when + File :: filename(), + Anno :: anno(). + +set_file(File, Anno) -> + set(file, File, Anno). + +-doc "Modifies the generated marker of the annotations Anno.". +-doc(#{since => <<"OTP 18.0">>}). +-spec set_generated(Generated, Anno) -> Anno when + Generated :: generated(), + Anno :: anno(). + +set_generated(Generated, Anno) -> + set(generated, Generated, Anno). + +-doc "Modifies the line of the annotations Anno.". +-doc(#{since => <<"OTP 18.0">>}). +-spec set_line(Line, Anno) -> Anno when + Line :: line(), + Anno :: anno(). + +set_line(Line, Anno) -> + case location(Anno) of + {_Line, Column} -> + set_location({Line, Column}, Anno); + _Line -> + set_location(Line, Anno) + end. + +-doc "Modifies the location of the annotations Anno.". +-doc(#{since => <<"OTP 18.0">>}). +-spec set_location(Location, Anno) -> Anno when + Location :: location(), + Anno :: anno(). + +set_location(Line, L) when ?ALINE(L), ?LLINE(Line) -> + new_location(Line); +set_location(Line, {L, Column}) when ?ALINE(L), ?ACOLUMN(Column), + ?LLINE(Line) -> + new_location(Line); +set_location({L, C}=Loc, Line) when ?ALINE(Line), ?LLINE(L), ?LCOLUMN(C) -> + new_location(Loc); +set_location({L, C}=Loc, {Line, Column}) when ?ALINE(Line), ?ACOLUMN(Column), + ?LLINE(L), ?LCOLUMN(C) -> + new_location(Loc); +set_location(Location, Anno) -> + set(location, Location, Anno). + +-doc "Modifies the record marker of the annotations Anno.". +-doc(#{since => <<"OTP 18.0">>}). +-spec set_record(Record, Anno) -> Anno when + Record :: record(), + Anno :: anno(). + +set_record(Record, Anno) -> + set(record, Record, Anno). + +-doc "Modifies the text of the annotations Anno.". +-doc(#{since => <<"OTP 18.0">>}). +-spec set_text(Text, Anno) -> Anno when + Text :: text(), + Anno :: anno(). + +set_text(Text, Anno) -> + set(text, Text, Anno). + +set(Item, Value, Anno) -> + case {is_settable(Item, Value), Anno} of + {true, Line} when ?ALINE(Line) -> + set_anno(Item, Value, [{location, Line}]); + {true, {L, C}=Location} when ?ALINE(L), ?ACOLUMN(C) -> + set_anno(Item, Value, [{location, Location}]); + {true, A} when is_list(A), A =/= [] -> + set_anno(Item, Value, Anno); + _ -> + erlang:error(badarg, [Item, Value, Anno]) + end. + +set_anno(Item, Value, Anno) -> + case default(Item, Value) of + true -> + reset(Anno, Item); + false -> + R = case anno_info(Anno, Item) of + undefined -> + [{Item, Value}|Anno]; + _ -> + lists:keyreplace(Item, 1, Anno, {Item, Value}) + end, + reset_simplify(R) + end. + +reset(Anno, Item) -> + A = lists:keydelete(Item, 1, Anno), + reset_simplify(A). + +-ifdef(DEBUG). +reset_simplify(A) -> + A. +-else. +reset_simplify(A) -> + simplify(A). +-endif. + +simplify([{location, Location}]) -> + Location; +simplify(Anno) -> + Anno. + +anno_info(Anno, Item, Default) -> + try lists:keyfind(Item, 1, Anno) of + false -> + Default; + {Item, Value} -> + Value + catch + _:_ -> + erlang:error(badarg, [Anno, Item, Default]) + end. + +anno_info(Anno, Item) -> + try lists:keyfind(Item, 1, Anno) of + {Item, Value} -> + Value; + false -> + undefined + catch + _:_ -> + erlang:error(badarg, [Anno, Item]) + end. + +end_location("", Line, Column) -> + {Line, Column}; +end_location([$\n|String], Line, _Column) -> + end_location(String, Line+1, 1); +end_location([_|String], Line, Column) -> + end_location(String, Line, Column+1). + +end_location("", Line) -> + Line; +end_location([$\n|String], Line) -> + end_location(String, Line+1); +end_location([_|String], Line) -> + end_location(String, Line). + +is_settable(file, File) -> + is_filename(File); +is_settable(generated, Boolean) when Boolean; not Boolean -> + true; +is_settable(location, Line) when ?LLINE(Line) -> + true; +is_settable(location, {Line, Column}) when ?LLINE(Line), ?LCOLUMN(Column) -> + true; +is_settable(record, Boolean) when Boolean; not Boolean -> + true; +is_settable(text, Text) -> + is_string(Text); +is_settable(_, _) -> + false. + +default(generated, false) -> true; +default(record, false) -> true; +default(_, _) -> false. diff --git a/libs/estdlib/src/erl_features.erl b/libs/estdlib/src/erl_features.erl new file mode 100644 index 000000000..6ae295ad4 --- /dev/null +++ b/libs/estdlib/src/erl_features.erl @@ -0,0 +1,25 @@ +% +% This file is part of AtomVM. +% +% Copyright 2024 Jakub Gonet +% +% Licensed under the Apache License, Version 2.0 (the "License"); +% you may not use this file except in compliance with the License. +% You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +% See the License for the specific language governing permissions and +% limitations under the License. +% +% SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later +% + +-module(erl_features). + +-export([keywords/0]). + +keywords() -> ['maybe', 'else']. diff --git a/libs/estdlib/src/erl_scan.erl b/libs/estdlib/src/erl_scan.erl new file mode 100644 index 000000000..313d89ee1 --- /dev/null +++ b/libs/estdlib/src/erl_scan.erl @@ -0,0 +1,2189 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 1996-2024. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%% +%% %CopyrightEnd% +%% + +%% Erlang token scanning functions of io library. + +%% For handling ISO 8859-1 (Latin-1) we use the following type +%% information: +%% +%% 000 - 037 NUL - US control +%% 040 - 057 SPC - / punctuation +%% 060 - 071 0 - 9 digit +%% 072 - 100 : - @ punctuation +%% 101 - 132 A - Z uppercase +%% 133 - 140 [ - ` punctuation +%% 141 - 172 a - z lowercase +%% 173 - 176 { - ~ punctuation +%% 177 DEL control +%% 200 - 237 control +%% 240 - 277 NBSP - ¿ punctuation +%% 300 - 326 À - Ö uppercase +%% 327 × punctuation +%% 330 - 336 Ø - Þ uppercase +%% 337 - 366 ß - ö lowercase +%% 367 ÷ punctuation +%% 370 - 377 ø - ÿ lowercase +%% +%% Many punctuation characters have special meaning: +%% $\s, $_, $", $$, $%, $', $. +%% DEL is a punctuation. +%% +%% Must watch using × \327, very close to x \170. + +-module(erl_scan). +-moduledoc """ +The Erlang token scanner. + +This module contains functions for tokenizing (scanning) characters into Erlang +tokens. + +## Error Information + +`ErrorInfo` is the standard `ErrorInfo` structure that is returned from all I/O +modules. The format is as follows: + +```erlang +{ErrorLocation, Module, ErrorDescriptor} +``` + +A string describing the error is obtained with the following call: + +```erlang +Module:format_error(ErrorDescriptor) +``` + +## Notes + +The continuation of the first call to the re-entrant input functions must be +`[]`. For a complete description of how the re-entrant input scheme works, see +Armstrong, Virding and Williams: 'Concurrent Programming in Erlang', Chapter 13. + +## See Also + +`m:erl_anno`, `m:erl_parse`, `m:io` +""". + +%%% External exports + +-export([string/1,string/2,string/3,tokens/3,tokens/4, + format_error/1,reserved_word/1, + f_reserved_word/1]). + +-export([column/1,end_location/1,line/1,location/1,text/1, + category/1,symbol/1]). + +%%% Private +-export([continuation_location/1]). + +-export_type([error_info/0, + options/0, + return_cont/0, + token/0, tokens/0, + tokens_result/0]). + +%% Removed functions and types +-removed([{set_attribute,3,"use erl_anno:set_line/2 instead"}, + {attributes_info,'_', + "use erl_anno:{column,line,location,text}/1 instead"}, + {token_info,'_', + "use erl_scan:{category,column,line,location,symbol,text}/1 instead"}]). + +-removed_type([{column,0,"use erl_anno:column() instead"}, + {line,0,"use erl_anno:line() instead"}, + {location,0,"use erl_anno:location() instead"}]). + +%%% +%%% Defines and type definitions +%%% + +-define(COLUMN(C), (is_integer(C) andalso C >= 1)). +%% Line numbers less than zero have always been allowed: +-define(ALINE(L), is_integer(L)). +-define(STRING(S), is_list(S)). +-define(RESWORDFUN(F), is_function(F, 1)). + +-type category() :: atom(). +-type resword_fun() :: fun((atom()) -> boolean()). +-type text_fun() :: fun((atom(), string()) -> boolean()). +-type option() :: 'return' | 'return_white_spaces' | 'return_comments' + | 'text' | {'reserved_word_fun', resword_fun()} + | {'text_fun', text_fun()} | {'compiler_internal', [term()]}. +-type options() :: option() | [option()]. +-type symbol() :: atom() | float() | integer() | string(). +-type token() :: {category(), Anno :: erl_anno:anno(), symbol()} + | {category(), Anno :: erl_anno:anno()}. +-type tokens() :: [token()]. +-type error_description() :: term(). +-type error_info() :: {erl_anno:location(), module(), error_description()}. + +%%% Local record. +-record(erl_scan, + {resword_fun = fun reserved_word/1 :: resword_fun(), + text_fun = fun(_, _) -> false end :: text_fun(), + ws = false :: boolean(), + comment = false :: boolean(), + has_fun = false :: boolean(), + %% True if requested to parse %ssa%-check comments + checks = false :: boolean(), + %% True if we're scanning inside a %ssa%-check comment + in_check = false :: boolean()}). + +%%---------------------------------------------------------------------------- + +-doc """ +Uses an `ErrorDescriptor` and returns a string that describes the error or +warning. This function is usually called implicitly when an `ErrorInfo` +structure is processed (see section +[Error Information](`m:erl_scan#module-error-information`)). +""". +-spec format_error(ErrorDescriptor) -> string() when + ErrorDescriptor :: error_description(). +format_error({unterminated,char}) -> "unterminated character"; +format_error({unterminated,What,Head}) -> + %% The reported position should be the first char in Head + lists:flatten( + ["unterminated ", string_thing(What), " starting with ", + io_lib:write_string(Head, string_quote(What))]); +format_error({illegal,Type}) -> + lists:flatten(io_lib:fwrite("illegal ~w", [Type])); +format_error({base,Base}) -> + lists:flatten(io_lib:fwrite("illegal base '~w'", [Base])); +format_error(indentation) -> + "bad indentation in triple-quoted string"; +format_error(white_space) -> + "not white space after start of triple-quoted string"; +format_error(string_concat) -> + "adjacent string literals without intervening white space"; +format_error(Other) -> + lists:flatten(io_lib:write(Other)). + +-doc(#{equiv => string(String, 1)}). +-spec string(String) -> Return when + String :: string(), + Return :: {'ok', Tokens :: tokens(), EndLocation} + | {'error', ErrorInfo :: error_info(), ErrorLocation}, + EndLocation :: erl_anno:location(), + ErrorLocation :: erl_anno:location(). +string(String) -> + string(String, 1, []). + +-doc(#{equiv => string(String, StartLocation, [])}). +-spec string(String, StartLocation) -> Return when + String :: string(), + Return :: {'ok', Tokens :: tokens(), EndLocation} + | {'error', ErrorInfo :: error_info(), ErrorLocation}, + StartLocation :: erl_anno:location(), + EndLocation :: erl_anno:location(), + ErrorLocation :: erl_anno:location(). +string(String, StartLocation) -> + string(String, StartLocation, []). + +-doc """ +Takes the list of characters `String` and tries to scan (tokenize) them. + +Returns one of the following: + +- **`{ok, Tokens, EndLocation}`** - `Tokens` are the Erlang tokens from + `String`. `EndLocation` is the first location after the last token. + +- **`{error, ErrorInfo, ErrorLocation}`** - An error occurred. `ErrorLocation` + is the first location after the erroneous token. + +`StartLocation` indicates the initial location when scanning starts. If +`StartLocation` is a line, `Anno`, `EndLocation`, and `ErrorLocation` are lines. +If `StartLocation` is a pair of a line and a column, `Anno` takes the form of an +opaque compound data type, and `EndLocation` and `ErrorLocation` are pairs of a +line and a column. The _token annotations_ contain information about the column +and the line where the token begins, as well as the text of the token (if option +`text` is specified), all of which can be accessed by calling `column/1`, +`line/1`, `location/1`, and `text/1`. + +A _token_ is a tuple containing information about syntactic category, the token +annotations, and the terminal symbol. For punctuation characters (such as `;` +and `|`) and reserved words, the category and the symbol coincide, and the token +is represented by a two-tuple. Three-tuples have one of the following forms: + +- `{atom, Anno, atom()}` +- `{char, Anno, char()}` +- `{comment, Anno, string()}` +- `{float, Anno, float()}` +- `{integer, Anno, integer()}` +- `{var, Anno, atom()}` +- `{white_space, Anno, string()}` + +Valid options: + +- **`{reserved_word_fun, reserved_word_fun()}`** - A callback function that is + called when the scanner has found an unquoted atom. If the function returns + `true`, the unquoted atom itself becomes the category of the token. If the + function returns `false`, `atom` becomes the category of the unquoted atom. + +- **`return_comments`** - Return comment tokens. + +- **`return_white_spaces`** - Return white space tokens. By convention, a + newline character, if present, is always the first character of the text + (there cannot be more than one newline in a white space token). + +- **`return`** - Short for `[return_comments, return_white_spaces]`. + +- **`text`{: #text }** - Include the token text in the token annotation. The + text is the part of the input corresponding to the token. See also + [`text_fun`](`m:erl_scan#text_fun`). + +- **`{text_fun, text_fun()}`{: #text_fun }** - A callback function used to + determine whether the full text for the token shall be included in the token + annotation. Arguments of the function are the category of the token and the + full token string. This is only used when [`text`](`m:erl_scan#text`) is not + present. If neither are present the text will not be saved in the token + annotation. + +- **`{compiler_internal, term()}`{: #compiler_interal }** - Pass + compiler-internal options to the scanner. The set of internal options + understood by the scanner should be considered experimental and can thus be + changed at any time without prior warning. + + The following options are currently understood: + + - **`ssa_checks`** - Tokenizes source code annotations used for encoding tests + on the BEAM SSA code produced by the compiler. +""". +-spec string(String, StartLocation, Options) -> Return when + String :: string(), + Options :: options(), + Return :: {'ok', Tokens :: tokens(), EndLocation} + | {'error', ErrorInfo :: error_info(), ErrorLocation}, + StartLocation :: erl_anno:location(), + EndLocation :: erl_anno:location(), + ErrorLocation :: erl_anno:location(). +string(String, Line, Options) when ?STRING(String), ?ALINE(Line) -> + string1(String, options(Options), Line, no_col, []); +string(String, {Line,Column}, Options) when ?STRING(String), + ?ALINE(Line), + ?COLUMN(Column) -> + string1(String, options(Options), Line, Column, []). + +-type char_spec() :: string() | 'eof'. +-type cont_fun() :: fun((char_spec(), #erl_scan{}, + erl_anno:line(), erl_anno:column(), + tokens(), any()) -> any()). +-opaque return_cont() :: {erl_scan_continuation, + string(), erl_anno:column(), tokens(), + erl_anno:line(), + #erl_scan{}, any(), cont_fun()}. +-type tokens_result() :: {'ok', Tokens :: tokens(), + EndLocation :: erl_anno:location()} + | {'eof', EndLocation :: erl_anno:location()} + | {'error', ErrorInfo :: error_info(), + EndLocation :: erl_anno:location()}. + +-doc(#{equiv => tokens(Continuation, CharSpec, StartLocation, [])}). +-spec tokens(Continuation, CharSpec, StartLocation) -> Return when + Continuation :: return_cont() | [], + CharSpec :: char_spec(), + StartLocation :: erl_anno:location(), + Return :: {'done',Result :: tokens_result(),LeftOverChars :: char_spec()} + | {'more', Continuation1 :: return_cont()}. +tokens(Cont, CharSpec, StartLocation) -> + tokens(Cont, CharSpec, StartLocation, []). + +-doc """ +This is the re-entrant scanner, which scans characters until either a _dot_ ('.' +followed by a white space) or `eof` is reached. + +It returns: + +- **`{done, Result, LeftOverChars}`** - Indicates that there is sufficient input + data to get a result. `Result` is: + + - **`{ok, Tokens, EndLocation}`** - The scanning was successful. `Tokens` is + the list of tokens including _dot_. + + - **`{eof, EndLocation}`** - End of file was encountered before any more + tokens. + + - **`{error, ErrorInfo, EndLocation}`** - An error occurred. `LeftOverChars` + is the remaining characters of the input data, starting from `EndLocation`. + +- **`{more, Continuation1}`** - More data is required for building a term. + `Continuation1` must be passed in a new call to `tokens/3,4` when more data is + available. + +The `CharSpec` `eof` signals end of file. `LeftOverChars` then takes the value +`eof` as well. + +For a description of the options, see `string/3`. +""". +-spec tokens(Continuation, CharSpec, StartLocation, Options) -> Return when + Continuation :: return_cont() | [], + CharSpec :: char_spec(), + StartLocation :: erl_anno:location(), + Options :: options(), + Return :: {'done',Result :: tokens_result(),LeftOverChars :: char_spec()} + | {'more', Continuation1 :: return_cont()}. +tokens([], CharSpec, Line, Options) when ?ALINE(Line) -> + tokens1(CharSpec, options(Options), Line, no_col, [], fun scan/6, []); +tokens([], CharSpec, {Line,Column}, Options) when ?ALINE(Line), + ?COLUMN(Column) -> + tokens1(CharSpec, options(Options), Line, Column, [], fun scan/6, []); +tokens({erl_scan_continuation,Cs,Col,Toks,Line,St,Any,Fun}, + CharSpec, _Loc, _Opts) -> + tokens1(Cs++CharSpec, St, Line, Col, Toks, Fun, Any). + +-doc false. +continuation_location({erl_scan_continuation,_,no_col,_,Line,_,_,_}) -> + Line; +continuation_location({erl_scan_continuation,_,Col,_,Line,_,_,_}) -> + {Line,Col}. + +-doc "Returns the column of `Token`'s collection of annotations.". +-doc(#{since => <<"OTP 18.0">>}). +-spec column(Token) -> erl_anno:column() | 'undefined' when + Token :: token(). + +column(Token) -> + erl_anno:column(element(2, Token)). + +-doc """ +Returns the end location of the text of `Token`'s collection of annotations. If +there is no text, `undefined` is returned. +""". +-doc(#{since => <<"OTP 18.0">>}). +-spec end_location(Token) -> erl_anno:location() | 'undefined' when + Token :: token(). + +end_location(Token) -> + erl_anno:end_location(element(2, Token)). + +-doc "Returns the line of `Token`'s collection of annotations.". +-doc(#{since => <<"OTP 18.0">>}). +-spec line(Token) -> erl_anno:line() when + Token :: token(). + +line(Token) -> + erl_anno:line(element(2, Token)). + +-doc "Returns the location of `Token`'s collection of annotations.". +-doc(#{since => <<"OTP 18.0">>}). +-spec location(Token) -> erl_anno:location() when + Token :: token(). + +location(Token) -> + erl_anno:location(element(2, Token)). + +-doc """ +Returns the text of `Token`'s collection of annotations. If there is no text, +`undefined` is returned. +""". +-doc(#{since => <<"OTP 18.0">>}). +-spec text(Token) -> erl_anno:text() | 'undefined' when + Token :: token(). + +text(Token) -> + erl_anno:text(element(2, Token)). + +-doc "Returns the category of `Token`.". +-doc(#{since => <<"OTP 18.0">>}). +-spec category(Token) -> category() when + Token :: token(). + +category({Category,_Anno}) -> + Category; +category({Category,_Anno,_Symbol}) -> + Category; +category(T) -> + erlang:error(badarg, [T]). + +-doc "Returns the symbol of `Token`.". +-doc(#{since => <<"OTP 18.0">>}). +-spec symbol(Token) -> symbol() when + Token :: token(). + +symbol({Category,_Anno}) -> + Category; +symbol({_Category,_Anno,Symbol}) -> + Symbol; +symbol(T) -> + erlang:error(badarg, [T]). + +%%% +%%% Local functions +%%% + +string_thing(What) -> + case What of + atom -> + "atom"; + string -> + "string"; + {string,_N} -> + "triple-quoted string"; + {sigil,Name,Q1,Q2} -> + [$~,atom_to_list(Name),Q1,Q2," sigil string"]; + {sigil,Name,_N} -> + "triple-quoted " ++ [$~,atom_to_list(Name) | " sigil string"] + end. + +string_quote(atom) -> $'; %' +string_quote(_) -> $". %" + +-define(WHITE_SPACE(C), + (is_integer(C) andalso + (C >= $\000 andalso C =< $\s orelse C >= $\200 andalso C =< $\240))). +-define(DIGIT(C), (is_integer(C) andalso $0 =< C andalso C =< $9)). +-define(CHAR(C), (is_integer(C) andalso 0 =< C andalso C < 16#110000)). +-define(UNICODE(C), + (is_integer(C) andalso + (C >= 0 andalso C < 16#D800 orelse + C > 16#DFFF andalso C < 16#FFFE orelse + C > 16#FFFF andalso C =< 16#10FFFF))). + +-define(UNI255(C), (is_integer(C) andalso 0 =< C andalso C =< 16#ff)). + +-define(NO_SIGIL, {}). + +options(Opts0) when is_list(Opts0) -> + Opts = lists:foldr(fun expand_opt/2, [], Opts0), + [RW_fun] = + case opts(Opts, [reserved_word_fun], []) of + badarg -> + erlang:error(badarg, [Opts0]); + R -> + R + end, + Comment = proplists:get_bool(return_comments, Opts), + WS = proplists:get_bool(return_white_spaces, Opts), + Txt = proplists:get_bool(text, Opts), + TxtFunOpt = proplists:get_value(text_fun, Opts, none), + Internal = proplists:get_value(compiler_internal, Opts, []), + Checks = proplists:get_bool(ssa_checks, Internal), + DefTxtFun = fun(_, _) -> Txt end, + {HasFun, TxtFun} = + if + Txt -> {Txt, DefTxtFun}; + TxtFunOpt == none -> {Txt, DefTxtFun}; + true -> {true, TxtFunOpt} + end, + #erl_scan{resword_fun = RW_fun, + comment = Comment, + ws = WS, + text_fun = TxtFun, + has_fun = HasFun, + checks = Checks}; +options(Opt) -> + options([Opt]). + +opts(Options, [Key|Keys], L) -> + V = case lists:keyfind(Key, 1, Options) of + {reserved_word_fun,F} when ?RESWORDFUN(F) -> + {ok,F}; + {Key,_} -> + badarg; + false -> + {ok,default_option(Key)} + end, + case V of + badarg -> + badarg; + {ok,Value} -> + opts(Options, Keys, [Value|L]) + end; +opts(_Options, [], L) -> + lists:reverse(L). + +default_option(reserved_word_fun) -> + fun reserved_word/1. + +expand_opt(return, Os) -> + [return_comments,return_white_spaces|Os]; +expand_opt(O, Os) -> + [O|Os]. + +tokens1(Cs, St, Line, Col, Toks, Fun, Any) when ?STRING(Cs); Cs =:= eof -> + case Fun(Cs, St, Line, Col, Toks, Any) of + {more,{Cs0,Nst,Ncol,Ntoks,Nline,Nany,Nfun}} -> + {more,{erl_scan_continuation,Cs0,Ncol,Ntoks,Nline,Nst,Nany,Nfun}}; + {ok,Toks0,eof,Nline,Ncol} -> + Res = case Toks0 of + [] -> + {eof,location(Nline, Ncol)}; + _ -> + {ok,lists:reverse(Toks0),location(Nline,Ncol)} + end, + {done,Res,eof}; + {ok,Toks0,Rest,Nline,Ncol} -> + {done,{ok,lists:reverse(Toks0),location(Nline, Ncol)},Rest}; + {{error,_,_}=Error,Rest} -> + {done,Error,Rest} + end. + +string1(Cs, St, Line, Col, Toks) -> + case scan1(Cs, St, Line, Col, Toks) of + {more,{Cs0,Nst,Ncol,Ntoks,Nline,Any,Fun}} -> + case Fun(Cs0++eof, Nst, Nline, Ncol, Ntoks, Any) of + {ok,Toks1,_Rest,Line2,Col2} -> + {ok,lists:reverse(Toks1),location(Line2, Col2)}; + {{error,_,_}=Error,_Rest} -> + Error + end; + {ok,Ntoks,[_|_]=Rest,Nline,Ncol} -> + string1(Rest, St, Nline, Ncol, Ntoks); + {ok,Ntoks,_,Nline,Ncol} -> + {ok,lists:reverse(Ntoks),location(Nline, Ncol)}; + {{error,_,_}=Error,_Rest} -> + Error + end. + + +scan(Cs, #erl_scan{}=St, Line, Col, Toks, _) -> + scan1(Cs, St, Line, Col, Toks). + +scan1([$\s|Cs], St, Line, Col, Toks) when St#erl_scan.ws -> + scan_spcs(Cs, St, Line, Col, Toks, 1); +scan1([$\s|Cs], St, Line, Col, Toks) -> + skip_white_space(Cs, St, Line, Col, Toks, 1); +scan1([$\n|Cs], St, Line, Col, Toks) when St#erl_scan.ws -> + scan_newline(Cs, St, Line, Col, Toks); +scan1([$\n|Cs], St, Line, Col, Toks) -> + skip_white_space(Cs, St, Line+1, new_column(Col, 1), Toks, 0); +%% Optimization: some very common punctuation characters: +scan1([$,|Cs], St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, ",", ',', 1); +scan1([$(|Cs], St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, "(", '(', 1); +scan1([$)|Cs], St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, ")", ')', 1); +scan1([${|Cs], St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, "{", '{', 1); +scan1([$}|Cs], St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, "}", '}', 1); +scan1([$[|Cs], St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, "[", '[', 1); +scan1([$]|Cs], St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, "]", ']', 1); +scan1([$;|Cs], St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, ";", ';', 1); +scan1([$_=C|Cs], St, Line, Col, Toks) -> + scan_variable(Cs, St, Line, Col, Toks, [C]); +scan1([$\%=C|Cs], St, Line, Col, Toks) when St#erl_scan.checks -> + scan_check(Cs, St, Line, Col, Toks, [C]); +scan1([$\%|Cs], St, Line, Col, Toks) when not St#erl_scan.comment -> + skip_comment(Cs, St, Line, Col, Toks, 1); +scan1([$\%=C|Cs], St, Line, Col, Toks) -> + scan_comment(Cs, St, Line, Col, Toks, [C]); +%% More punctuation characters below. +scan1([C|_], _St, _Line, _Col0, _Toks) when not ?CHAR(C) -> + error({not_character,C}); +scan1([C|Cs], St, Line, Col, Toks) when C >= $A, C =< $Z -> + scan_variable(Cs, St, Line, Col, Toks, [C]); +scan1([C|Cs], St, Line, Col, Toks) when C >= $a, C =< $z -> + scan_atom(Cs, St, Line, Col, Toks, [C]); +scan1([C|Cs], St, Line, Col, Toks) when ?DIGIT(C) -> + scan_number(Cs, St, Line, Col, Toks, [C], no_underscore); +scan1("..."++Cs, St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, "...", '...', 3); +scan1(".."=Cs, St, Line, Col, Toks) -> + {more,{Cs,St,Col,Toks,Line,[],fun scan/6}}; +scan1(".."++Cs, St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, "..", '..', 2); +scan1("."=Cs, St, Line, Col, Toks) -> + {more,{Cs,St,Col,Toks,Line,[],fun scan/6}}; +scan1([$.=C|Cs], St, Line, Col, Toks) -> + scan_dot(Cs, St, Line, Col, Toks, [C]); +scan1([$'|Cs], St, Line, Col, Toks) -> %' Emacs + scan_qatom(Cs, St, Line, Col, Toks); +scan1([$"|_]=Cs, St, Line, Col, Toks) -> %" Emacs + scan_string(Cs, St, Line, Col, Toks, ?NO_SIGIL); +scan1([$~=C|Cs], St, Line, Col, Toks) -> + scan_sigil_prefix(Cs, St, Line, Col, Toks, [C]); +scan1([$$|Cs], St, Line, Col, Toks) -> + scan_char(Cs, St, Line, Col, Toks); +scan1([$\r|Cs], St, Line, Col, Toks) when St#erl_scan.ws -> + white_space_end(Cs, St, Line, Col, Toks, 1, "\r"); +scan1([C|Cs], St, Line, Col, Toks) when C >= $ß, C =< $ÿ, C =/= $÷ -> + scan_atom(Cs, St, Line, Col, Toks, [C]); +scan1([C|Cs], St, Line, Col, Toks) when C >= $À, C =< $Þ, C /= $× -> + scan_variable(Cs, St, Line, Col, Toks, [C]); +scan1([$\t|Cs], St, Line, Col, Toks) when St#erl_scan.ws -> + scan_tabs(Cs, St, Line, Col, Toks, 1); +scan1([$\t|Cs], St, Line, Col, Toks) -> + skip_white_space(Cs, St, Line, Col, Toks, 1); +scan1([C|Cs], St, Line, Col, Toks) when ?WHITE_SPACE(C) -> + case St#erl_scan.ws of + true -> + scan_white_space(Cs, St, Line, Col, Toks, [C]); + false -> + skip_white_space(Cs, St, Line, Col, Toks, 1) + end; +%% Punctuation characters and operators, first recognise multiples. +%% ?= for the maybe ... else ... end construct +scan1("?="++Cs, St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, "?=", '?=', 2); +scan1("?"=Cs, St, Line, Col, Toks) -> + {more,{Cs,St,Col,Toks,Line,[],fun scan/6}}; +%% << <- <= +scan1("<<"++Cs, St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, "<<", '<<', 2); +scan1("<-"++Cs, St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, "<-", '<-', 2); +scan1("<="++Cs, St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, "<=", '<=', 2); +scan1("<"=Cs, St, Line, Col, Toks) -> + {more,{Cs,St,Col,Toks,Line,[],fun scan/6}}; +%% >> >= +scan1(">>"++Cs, St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, ">>", '>>', 2); +scan1(">="++Cs, St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, ">=", '>=', 2); +scan1(">"=Cs, St, Line, Col, Toks) -> + {more,{Cs,St,Col,Toks,Line,[],fun scan/6}}; +%% -> -- +scan1("->"++Cs, St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, "->", '->', 2); +scan1("--"++Cs, St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, "--", '--', 2); +scan1("-"=Cs, St, Line, Col, Toks) -> + {more,{Cs,St,Col,Toks,Line,[],fun scan/6}}; +%% ++ +scan1("++"++Cs, St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, "++", '++', 2); +scan1("+"=Cs, St, Line, Col, Toks) -> + {more,{Cs,St,Col,Toks,Line,[],fun scan/6}}; +%% =:= =/= =< == => +scan1("=:="++Cs, St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, "=:=", '=:=', 3); +scan1("=:"=Cs, St, Line, Col, Toks) -> + {more,{Cs,St,Col,Toks,Line,[],fun scan/6}}; +scan1("=/="++Cs, St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, "=/=", '=/=', 3); +scan1("=/"=Cs, St, Line, Col, Toks) -> + {more,{Cs,St,Col,Toks,Line,[],fun scan/6}}; +scan1("=<"++Cs, St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, "=<", '=<', 2); +scan1("=>"++Cs, St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, "=>", '=>', 2); +scan1("=="++Cs, St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, "==", '==', 2); +scan1("="=Cs, St, Line, Col, Toks) -> + {more,{Cs,St,Col,Toks,Line,[],fun scan/6}}; +%% /= +scan1("/="++Cs, St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, "/=", '/=', 2); +scan1("/"=Cs, St, Line, Col, Toks) -> + {more,{Cs,St,Col,Toks,Line,[],fun scan/6}}; +%% || +scan1("||"++Cs, St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, "||", '||', 2); +scan1("|"=Cs, St, Line, Col, Toks) -> + {more,{Cs,St,Col,Toks,Line,[],fun scan/6}}; +%% := +scan1(":="++Cs, St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, ":=", ':=', 2); +%% :: for typed records +scan1("::"++Cs, St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, "::", '::', 2); +scan1(":"=Cs, St, Line, Col, Toks) -> + {more,{Cs,St,Col,Toks,Line,[],fun scan/6}}; +%% Optimization: punctuation characters less than 127: +scan1([$=|Cs], St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, "=", '=', 1); +scan1([$:|Cs], St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, ":", ':', 1); +scan1([$||Cs], St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, "|", '|', 1); +scan1([$#|Cs], St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, "#", '#', 1); +scan1([$/|Cs], St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, "/", '/', 1); +scan1([$?|Cs], St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, "?", '?', 1); +scan1([$-|Cs], St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, "-", '-', 1); +scan1([$+|Cs], St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, "+", '+', 1); +scan1([$*|Cs], St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, "*", '*', 1); +scan1([$<|Cs], St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, "<", '<', 1); +scan1([$>|Cs], St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, ">", '>', 1); +scan1([$!|Cs], St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, "!", '!', 1); +scan1([$@|Cs], St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, "@", '@', 1); +scan1([$\\|Cs], St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, "\\", '\\', 1); +scan1([$^|Cs], St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, "^", '^', 1); +scan1([$`|Cs], St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, "`", '`', 1); +scan1([$~|Cs], St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, "~", '~', 1); +scan1([$&|Cs], St, Line, Col, Toks) -> + tok2(Cs, St, Line, Col, Toks, "&", '&', 1); +%% End of optimization. +scan1([C|Cs], St, Line, Col, Toks) when ?UNI255(C) -> + Str = [C], + tok2(Cs, St, Line, Col, Toks, Str, list_to_atom(Str), 1); +scan1([C|Cs], _St, Line, Col, _Toks) when ?CHAR(C) -> + Ncol = incr_column(Col, 1), + scan_error({illegal,character}, Line, Col, Line, Ncol, Cs); +scan1([]=Cs, St, Line, Col, Toks) -> + {more,{Cs,St,Col,Toks,Line,[],fun scan/6}}; +scan1(eof=Cs, _St, Line, Col, Toks) -> + {ok,Toks,Cs,Line,Col}. + +scan_atom_fun(Cs, #erl_scan{}=St, Line, Col, Toks, Ncs) -> + scan_atom(Cs, St, Line, Col, Toks, Ncs). + +scan_atom(Cs0, St, Line, Col, Toks, Ncs0) -> + case scan_name(Cs0, Ncs0) of + {more,Ncs} -> + {more,{[],St,Col,Toks,Line,Ncs,fun scan_atom_fun/6}}; + {WcsR,Cs} -> + Wcs = lists:reverse(WcsR), + try list_to_atom(Wcs) of + Name -> + case (St#erl_scan.resword_fun)(Name) of + true -> + tok2(Cs, St, Line, Col, Toks, Wcs, Name); + false -> + tok3(Cs, St, Line, Col, Toks, atom, Wcs, Name) + end + catch + _:_ -> + Ncol = incr_column(Col, length(Wcs)), + scan_error({illegal,atom}, Line, Col, Line, Ncol, Cs) + end + end. + + +scan_variable_fun(Cs, #erl_scan{}=St, Line, Col, Toks, Ncs) -> + scan_variable(Cs, St, Line, Col, Toks, Ncs). + +scan_variable(Cs0, St, Line, Col, Toks, Ncs0) -> + case scan_name(Cs0, Ncs0) of + {more,Ncs} -> + {more,{[],St,Col,Toks,Line,Ncs,fun scan_variable_fun/6}}; + {WcsR,Cs} -> + Wcs = lists:reverse(WcsR), + try list_to_atom(Wcs) of + Name -> + tok3(Cs, St, Line, Col, Toks, var, Wcs, Name) + catch + _:_ -> + Ncol = incr_column(Col, length(Wcs)), + scan_error({illegal,var}, Line, Col, Line, Ncol, Cs) + end + end. + + +scan_name([C|_]=Cs, Wcs) when not ?CHAR(C) -> + {Wcs,Cs}; +scan_name([C|Cs], Wcs) when C >= $a, C =< $z -> + scan_name(Cs, [C|Wcs]); +scan_name([C|Cs], Wcs) when C >= $A, C =< $Z -> + scan_name(Cs, [C|Wcs]); +scan_name([$_=C|Cs], Wcs) -> + scan_name(Cs, [C|Wcs]); +scan_name([C|Cs], Wcs) when ?DIGIT(C) -> + scan_name(Cs, [C|Wcs]); +scan_name([$@=C|Cs], Wcs) -> + scan_name(Cs, [C|Wcs]); +scan_name([C|Cs], Wcs) when C >= $ß, C =< $ÿ, C =/= $÷ -> + scan_name(Cs, [C|Wcs]); +scan_name([C|Cs], Wcs) when C >= $À, C =< $Þ, C =/= $× -> + scan_name(Cs, [C|Wcs]); +scan_name([], Wcs) -> + {more,Wcs}; +scan_name(Cs, Wcs) -> + {Wcs,Cs}. + + +-define(STR(Cl, St, S), + case ((St)#erl_scan.has_fun) + andalso ((St)#erl_scan.text_fun)((Cl), begin S end) of + true -> begin S end; + false -> [] + end). +-define(STR(Cl, St, Tmp, S), + case ((St)#erl_scan.has_fun) of + true -> + Tmp = begin S end, + case ((St)#erl_scan.text_fun)((Cl), Tmp) of + true -> Tmp; + false -> [] + end; + false -> [] + end). + +scan_dot([C|_]=Cs, St, Line, Col, Toks, Ncs) + when St#erl_scan.in_check, C =/= $. -> + tok2(Cs, St#erl_scan{in_check=false}, Line, Col, Toks, Ncs, '.', 1); +scan_dot([$%|_]=Cs, St, Line, Col, Toks, Ncs) -> + Anno = anno(Line, Col, St, ?STR(dot, St, Ncs)), + {ok,[{dot,Anno}|Toks],Cs,Line,incr_column(Col, 1)}; +scan_dot([$\n=C|Cs], St, Line, Col, Toks, Ncs) -> + Anno = anno(Line, Col, St, ?STR(dot, St, Ncs++[C])), + {ok,[{dot,Anno}|Toks],Cs,Line+1,new_column(Col, 1)}; +scan_dot([C|Cs], St, Line, Col, Toks, Ncs) when ?WHITE_SPACE(C) -> + Anno = anno(Line, Col, St, ?STR(dot, St, Ncs++[C])), + {ok,[{dot,Anno}|Toks],Cs,Line,incr_column(Col, 2)}; +scan_dot(eof=Cs, St, Line, Col, Toks, Ncs) -> + Anno = anno(Line, Col, St, ?STR(dot, St, Ncs)), + {ok,[{dot,Anno}|Toks],Cs,Line,incr_column(Col, 1)}; +scan_dot(Cs, St, Line, Col, Toks, Ncs) -> + tok2(Cs, St, Line, Col, Toks, Ncs, '.', 1). + +%%% White space characters are very common, so it is worthwhile to +%%% scan them fast and store them compactly. (The words "whitespace" +%%% and "white space" usually mean the same thing. The Erlang +%%% specification denotes the characters with ASCII code in the +%%% interval 0 to 32 as "white space".) +%%% +%%% Convention: if there is a white newline ($\n) it will always be +%%% the first character in the text string. As a consequence, there +%%% cannot be more than one newline in a white_space token string. +%%% +%%% Some common combinations are recognized, some are not. Examples +%%% of the latter are tab(s) followed by space(s), like "\t ". +%%% (They will be represented by two (or more) tokens.) +%%% +%%% Note: the character sequence "\r\n" is *not* recognized since it +%%% would violate the property that $\n will always be the first +%%% character. (But since "\r\n\r\n" is common, it pays off to +%%% recognize "\n\r".) + +scan_newline([$\s|Cs], St, Line, Col, Toks) -> + scan_nl_spcs(Cs, St, Line, Col, Toks, 2); +scan_newline([$\t|Cs], St, Line, Col, Toks) -> + scan_nl_tabs(Cs, St, Line, Col, Toks, 2); +scan_newline([$\r|Cs], St, Line, Col, Toks) -> + newline_end(Cs, St, Line, Col, Toks, 2, "\n\r"); +scan_newline([$\f|Cs], St, Line, Col, Toks) -> + newline_end(Cs, St, Line, Col, Toks, 2, "\n\f"); +scan_newline([], St, Line, Col, Toks) -> + {more,{[$\n],St,Col,Toks,Line,[],fun scan/6}}; +scan_newline(Cs, St, Line, Col, Toks) -> + scan_nl_white_space(Cs, St, Line, Col, Toks, "\n"). + +scan_nl_spcs_fun(Cs, #erl_scan{}=St, Line, Col, Toks, N) + when is_integer(N) -> + scan_nl_spcs(Cs, St, Line, Col, Toks, N). + +scan_nl_spcs([$\s|Cs], St, Line, Col, Toks, N) when N < 17 -> + scan_nl_spcs(Cs, St, Line, Col, Toks, N+1); +scan_nl_spcs([]=Cs, St, Line, Col, Toks, N) -> + {more,{Cs,St,Col,Toks,Line,N,fun scan_nl_spcs_fun/6}}; +scan_nl_spcs(Cs, St, Line, Col, Toks, N) -> + newline_end(Cs, St, Line, Col, Toks, N, nl_spcs(N)). + +scan_nl_tabs_fun(Cs, #erl_scan{}=St, Line, Col, Toks, N) + when is_integer(N) -> + scan_nl_tabs(Cs, St, Line, Col, Toks, N). + +scan_nl_tabs([$\t|Cs], St, Line, Col, Toks, N) when N < 11 -> + scan_nl_tabs(Cs, St, Line, Col, Toks, N+1); +scan_nl_tabs([]=Cs, St, Line, Col, Toks, N) -> + {more,{Cs,St,Col,Toks,Line,N,fun scan_nl_tabs_fun/6}}; +scan_nl_tabs(Cs, St, Line, Col, Toks, N) -> + newline_end(Cs, St, Line, Col, Toks, N, nl_tabs(N)). + +scan_nl_white_space_fun(Cs, #erl_scan{}=St, Line, Col, Toks, Ncs) -> + scan_nl_white_space(Cs, St, Line, Col, Toks, Ncs). + +%% Note: returning {more,Cont} is meaningless here; one could just as +%% well return several tokens. But since tokens() scans up to a full +%% stop anyway, nothing is gained by not collecting all white spaces. +scan_nl_white_space([$\n|Cs], #erl_scan{has_fun = false}=St, Line, no_col=Col, + Toks0, Ncs) -> + Toks = [{white_space,anno(Line),lists:reverse(Ncs)}|Toks0], + scan_newline(Cs, St, Line+1, Col, Toks); +scan_nl_white_space([$\n|Cs], St, Line, Col, Toks, Ncs0) -> + Ncs = lists:reverse(Ncs0), + Anno = anno(Line, Col, St, ?STR(white_space, St, Ncs)), + Token = {white_space,Anno,Ncs}, + scan_newline(Cs, St, Line+1, new_column(Col, length(Ncs)), [Token|Toks]); +scan_nl_white_space([C|Cs], St, Line, Col, Toks, Ncs) + when ?WHITE_SPACE(C) -> + scan_nl_white_space(Cs, St, Line, Col, Toks, [C|Ncs]); +scan_nl_white_space([]=Cs, St, Line, Col, Toks, Ncs) -> + {more,{Cs,St,Col,Toks,Line,Ncs,fun scan_nl_white_space_fun/6}}; +scan_nl_white_space(Cs, #erl_scan{has_fun = false}=St, Line, no_col=Col, + Toks, Ncs) -> + Anno = anno(Line), + scan1(Cs, St, Line+1, Col, [{white_space,Anno,lists:reverse(Ncs)}|Toks]); +scan_nl_white_space(Cs, St, Line, Col, Toks, Ncs0) -> + Ncs = lists:reverse(Ncs0), + Anno = anno(Line, Col, St, ?STR(white_space, St, Ncs)), + Token = {white_space,Anno,Ncs}, + scan1(Cs, St, Line+1, new_column(Col, length(Ncs)), [Token|Toks]). + +newline_end(Cs, #erl_scan{has_fun = false}=St, Line, no_col=Col, + Toks, _N, Ncs) -> + scan1(Cs, St, Line+1, Col, [{white_space,anno(Line),Ncs}|Toks]); +newline_end(Cs, #erl_scan{}=St, Line, Col, Toks, N, Ncs) -> + Anno = anno(Line, Col, St, ?STR(white_space, St, Ncs)), + scan1(Cs, St, Line+1, new_column(Col, N), [{white_space,Anno,Ncs}|Toks]). + +scan_spcs_fun(Cs, #erl_scan{}=St, Line, Col, Toks, N) + when is_integer(N), N >= 1 -> + scan_spcs(Cs, St, Line, Col, Toks, N). + +scan_spcs([$\s|Cs], St, Line, Col, Toks, N) when N < 16 -> + scan_spcs(Cs, St, Line, Col, Toks, N+1); +scan_spcs([]=Cs, St, Line, Col, Toks, N) -> + {more,{Cs,St,Col,Toks,Line,N,fun scan_spcs_fun/6}}; +scan_spcs(Cs, St, Line, Col, Toks, N) -> + white_space_end(Cs, St, Line, Col, Toks, N, spcs(N)). + +scan_tabs_fun(Cs, #erl_scan{}=St, Line, Col, Toks, N) + when is_integer(N), N >= 1 -> + scan_tabs(Cs, St, Line, Col, Toks, N). + +scan_tabs([$\t|Cs], St, Line, Col, Toks, N) when N < 10 -> + scan_tabs(Cs, St, Line, Col, Toks, N+1); +scan_tabs([]=Cs, St, Line, Col, Toks, N) -> + {more,{Cs,St,Col,Toks,Line,N,fun scan_tabs_fun/6}}; +scan_tabs(Cs, St, Line, Col, Toks, N) -> + white_space_end(Cs, St, Line, Col, Toks, N, tabs(N)). + +skip_white_space_fun(Cs, #erl_scan{}=St, Line, Col, Toks, N) -> + skip_white_space(Cs, St, Line, Col, Toks, N). + +skip_white_space([$\n|Cs], St, Line, Col, Toks, _N) -> + skip_white_space(Cs, St, Line+1, new_column(Col, 1), Toks, 0); +skip_white_space([C|Cs], St, Line, Col, Toks, N) when ?WHITE_SPACE(C) -> + skip_white_space(Cs, St, Line, Col, Toks, N+1); +skip_white_space([]=Cs, St, Line, Col, Toks, N) -> + {more,{Cs,St,Col,Toks,Line,N,fun skip_white_space_fun/6}}; +skip_white_space(Cs, St, Line, Col, Toks, N) -> + scan1(Cs, St, Line, incr_column(Col, N), Toks). + +scan_white_space_fun(Cs, #erl_scan{}=St, Line, Col, Toks, Ncs) -> + scan_white_space(Cs, St, Line, Col, Toks, Ncs). + +%% Maybe \t and \s should break the loop. +scan_white_space([$\n|_]=Cs, St, Line, Col, Toks, Ncs) -> + white_space_end(Cs, St, Line, Col, Toks, length(Ncs), lists:reverse(Ncs)); +scan_white_space([C|Cs], St, Line, Col, Toks, Ncs) when ?WHITE_SPACE(C) -> + scan_white_space(Cs, St, Line, Col, Toks, [C|Ncs]); +scan_white_space([]=Cs, St, Line, Col, Toks, Ncs) -> + {more,{Cs,St,Col,Toks,Line,Ncs,fun scan_white_space_fun/6}}; +scan_white_space(Cs, St, Line, Col, Toks, Ncs) -> + white_space_end(Cs, St, Line, Col, Toks, length(Ncs), lists:reverse(Ncs)). + +-compile({inline,[white_space_end/7]}). + +white_space_end(Cs, St, Line, Col, Toks, N, Ncs) -> + tok3(Cs, St, Line, Col, Toks, white_space, Ncs, Ncs, N). + +scan_char([$\\|Cs]=Cs0, St, Line, Col, Toks) -> + case scan_escape(Cs, incr_column(Col, 2)) of + more -> + {more,{[$$|Cs0],St,Col,Toks,Line,[],fun scan/6}}; + {error,Ncs,Error,Ncol} -> + scan_error(Error, Line, Col, Line, Ncol, Ncs); + {eof,Ncol} -> + scan_error({unterminated,char}, Line, Col, Line, Ncol, eof); + {nl,Val,Str,Ncs,Ncol} -> + Anno = anno(Line, Col, St, ?STR(char, St, "$\\"++Str)), %" + Ntoks = [{char,Anno,Val}|Toks], + scan1(Ncs, St, Line+1, Ncol, Ntoks); + {Val,Str,Ncs,Ncol} -> + Anno = anno(Line, Col, St, ?STR(char, St, "$\\"++Str)), %" + Ntoks = [{char,Anno,Val}|Toks], + scan1(Ncs, St, Line, Ncol, Ntoks) + end; +scan_char([$\n=C|Cs], St, Line, Col, Toks) -> + Anno = anno(Line, Col, St, ?STR(char, St, [$$,C])), + scan1(Cs, St, Line+1, new_column(Col, 1), [{char,Anno,C}|Toks]); +scan_char([C|Cs], St, Line, Col, Toks) when ?UNICODE(C) -> + Anno = anno(Line, Col, St, ?STR(char, St, [$$,C])), + scan1(Cs, St, Line, incr_column(Col, 2), [{char,Anno,C}|Toks]); +scan_char([C|_Cs], _St, Line, Col, _Toks) when ?CHAR(C) -> + scan_error({illegal,character}, Line, Col, Line, incr_column(Col, 1), eof); +scan_char([], St, Line, Col, Toks) -> + {more,{[$$],St,Col,Toks,Line,[],fun scan/6}}; +scan_char(eof, _St, Line, Col, _Toks) -> + scan_error({unterminated,char}, Line, Col, Line, incr_column(Col, 1), eof). + +%% Sigil Prefix is scanned here and handled in scan_tqstring/6 +%% and scan_qstring/8 where non-verbatim sigils (that handle +%% character escape sequences) are enumerated. +%% Search for SigilType or sigil_type. +%% +%% The bogus SigilType = {} is used to indicate a regular +%% non-sigil string to scan_qstring/8 and scan_sigil_suffix/6. +%% +%% Sigils are also handled in erl_parse:build_sigil/3 that +%% enumerates all sigils, cause an error for unknown ones, +%% and transforms known ones. +%% +scan_sigil_prefix(Cs, St, Line, Col, Toks, Wcs) -> + case scan_name(Cs, Wcs) of + {more, Nwcs} -> + {more, {[],St,Col,Toks,Line,Nwcs,fun scan_sigil_prefix/6}}; + {Nwcs,Ncs} -> + Type = sigil_prefix, + Ncol = incr_column(Col, length(Nwcs)), + SigilCs = lists:reverse(Nwcs), + try list_to_atom(tl(SigilCs)) of + SigilType when is_atom(SigilType) -> + Anno = anno(Line, Col, St, ?STR(Type, St, SigilCs)), + Tok = {Type,Anno,SigilType}, + scan_string(Ncs, St, Line, Ncol, [Tok|Toks], SigilType) + catch _ : _ -> + scan_error({illegal,Type}, Line, Col, Line, Ncol, Ncs) + end + end. + +%% Assuming that a string starts with a delimiter right here +scan_string(Cs, St, Line, Col, Toks, SigilType) -> + case Cs of + [$",$",$"|Ncs] -> %" + scan_tqstring(Ncs, St, Line, Col, Toks, SigilType, 3); + [$",$"] -> + {more,{Cs,St,Col,Toks,Line,SigilType,fun scan_string/6}}; + [$"] -> + {more,{Cs,St,Col,Toks,Line,SigilType,fun scan_string/6}}; + [Q1|Ncs] -> + case string_right_delimiter(Q1) of + undefined -> + scan_error({illegal,string}, Line, Col, Line, Col, Cs); + Q2 when is_integer(Q2) -> + scan_qstring(Ncs, St, Line, Col, Toks, SigilType, Q1, Q2) + end; + [] -> + {more,{Cs,St,Col,Toks,Line,SigilType,fun scan_string/6}}; + eof -> + scan_error({illegal,string}, Line, Col, Line, Col, Cs) + end. + +%% String delimiters are enumerated here. +%% +%% The $" and $' delimiters are also handled in scan1/5 above +%% to recognize regular strings and quoted atoms. +%% For triple-quoted strings it is hardcoded in scan_string/6 +%% and scan_tqstring_lines/8 to look for $". +%% +-compile({inline,[string_right_delimiter/1]}). +string_right_delimiter(C) -> + case C of + $( -> $); + $[ -> $]; + ${ -> $}; + $< -> $>; + _ when + C =:= $/; + C =:= $|; + C =:= $#; + C =:= $`; + C =:= $'; %' + C =:= $" %" + -> + C; + _ -> + undefined + end. + +-record(tqs, % Triple-quoted String state + {line, % Line number of first quote character + col, % Column number - " - + sigil_type, % atom() | {} + qs, % Number of quote characters in delimiter + verbatim, % Ignore escape sequences? + qn = undefined, % Quote character counter | undefined + str = "", % Scanned text (reversed) + content_r = [], % Reverse list of reversed content lines + acc = ""}). % Current line accumulator +%% +%% Triple-quoted string (delimited by at least 3 double quote characters) +%% +%% Start delimiter may be followed only by white space. +%% +%% End delimiter may be preceeded only by white space, which defines +%% the indentation to strip from all content lines. +%% +scan_tqstring(Cs, St, Line, Col, Toks, SigilType, Qs) -> + scan_tqstring(Cs, St, Line, Col, Toks, {SigilType,Qs}). +%% +%% Scan leading $" characters until we have them all, then scan lines +scan_tqstring(Cs, St, Line, Col, Toks, {SigilType,Qs}) -> + case scan_count(Cs, $", Qs) of %" + {[],Nqs} -> + {more, + {[], St, Col, Toks, Line, {SigilType,Nqs}, + fun scan_tqstring/6}}; + {Ncs,Nqs} -> + Verbatim = + if + SigilType =:= 'b'; % binary(), non-verbatim + SigilType =:= 's' -> % string(), non-verbatim + false; + true -> % The rest are verbatim + true + end, + Tqs = + #tqs{ + line = Line, col = Col, + sigil_type = SigilType, qs = Nqs, verbatim = Verbatim, + str = lists_duplicate(Nqs, $", "") }, %" + scan_tqstring_lines(Ncs, St, Line, int_column(Col)+Qs, Toks, Tqs) + end. + +%% Scan off characters that are C and count them +scan_count([C|Cs], C, N) -> scan_count(Cs, C, N + 1); +scan_count(Cs, _, N) -> {Cs,N}. + +scan_tqstring_lines(Cs, St, Line, Col, Toks, Tqs) -> + #tqs{ qn = Qn, str = Str, content_r = ContentR, acc = Acc } = Tqs, + case + scan_tqstring_lines( + Cs, Tqs, Line, int_column(Col), Str, Qn, ContentR, Acc) + of + {ok,Ncs,Nline,Ncol,Nstr,NcontentR,IndentR} -> + %% Last line - post process the content + scan_tqstring_finish( + Ncs, St, Nline, Ncol, Toks, + Tqs#tqs{ str = Nstr, content_r = NcontentR, acc = IndentR }); + {more,Ncs,Nline,Ncol,Nstr,Nqn,NcontentR,Nacc} -> + {more, + {Ncs, St, Ncol, Toks, Nline, + Tqs#tqs{ + qn = Nqn, str = Nstr, content_r = NcontentR, acc = Nacc }, + fun scan_tqstring_lines/6}}; + {error,Ncs,Nline,Ncol,Nline_1,Ncol_1,Error} -> + Col0 = Tqs#tqs.col, + scan_error( + Error, Nline, new_column(Col0, Ncol), + Nline_1, new_column(Col0, Ncol_1), Ncs) + end. + +%% Inner loop that minimizes garbage creation +scan_tqstring_lines(Cs0, Tqs, Line, Col, Str, Qn, ContentR, Acc) -> + case Cs0 of + [$\n=C|Cs] -> + %% New line, restart searching for ending delimiter + scan_tqstring_lines( + Cs, Tqs, Line+1, 1, + [C|Str], Tqs#tqs.qs, [[C|Acc]|ContentR], []); + [C|Cs] -> + if + C =:= $", is_integer(Qn) -> %" + %% Possibly part of ending delimiter + Nstr = [C|Str], + Nacc = [C|Acc], + Ncol = Col + 1, + if + Qn =:= 1 -> + %% Complete ending delimiter + IndentR = lists:nthtail(Tqs#tqs.qs, Nacc), + {ok,Cs,Line,Col+1,Nstr,ContentR,IndentR}; + true -> + %% Collect and count this quote char + scan_tqstring_lines( + Cs, Tqs, Line, Ncol, + Nstr, Qn-1, ContentR, Nacc) + end; + ?WHITE_SPACE(C), is_integer(Qn) -> + %% White space while searching for ending delimiter + Nstr = [C|Str], + Nacc = [C|Acc], + Ncol = Col + 1, + if + Qn =:= Tqs#tqs.qs -> + %% White space before first quote char + %% - just collect + scan_tqstring_lines( + Cs, Tqs, Line, Ncol, + Nstr, Qn, ContentR, Nacc); + true -> + %% White space after too few end quote chars + %% - stop searching for end quote chars + scan_tqstring_lines( + Cs, Tqs, Line, Ncol, + Nstr, undefined, ContentR, Nacc) + end; + C =:= $\\, not Tqs#tqs.verbatim -> + case scan_escape(Cs, Col) of + more -> + {more,Cs,Line,Col,Str,Qn,ContentR,Acc}; + {error,Ncs,Error,Ncol} -> + {error,Ncs,Line,Ncol,Line,Ncol+1,Error}; + {eof,Ncol} -> + scan_tqstring_eof( + [Acc|ContentR], Tqs, Line, Ncol+1); + {nl,Val,ValStr,Ncs,Ncol} -> + %% An escaped newline: "\\\n", + %% counts as just a newline: "\n", + %% see [$\n|_] above + Nstr = lists:reverse(ValStr, [C|Str]), + scan_tqstring_lines( + Ncs, Tqs, Line+1, Ncol, + Nstr, Tqs#tqs.qs, [[Val|Acc]|ContentR], []); + {Val,ValStr,Ncs,Ncol} -> + Nstr = lists:reverse(ValStr, [C|Str]), + scan_tqstring_lines( + Ncs, Tqs, Line, Ncol+1, + Nstr, undefined, ContentR, [Val|Acc]) + end; + ?UNICODE(C) -> + %% Not searching / stop searching for ending delimiter + scan_tqstring_lines( + Cs, Tqs, Line, Col+1, + [C|Str], undefined, ContentR, [C|Acc]); + ?CHAR(C) -> + %% Illegal Unicode character + {error,Cs,Line,Col,Line,Col+1,{illegal,character}} + end; + [] -> + {more,Cs0,Line,Col,Str,Qn,ContentR,Acc}; + eof -> + scan_tqstring_eof([Acc|ContentR], Tqs, Line, Col) + end. + +scan_tqstring_eof(ContentR, Tqs, Line, Col) -> + #tqs{ line = Line0, col = Col0, sigil_type = SigilType, qs = Qs } = Tqs, + {error,eof,Line0,incr_column(Col0, Qs),Line,Col, + {unterminated, + if + SigilType =:= ?NO_SIGIL -> + {string,Qs}; + is_atom(SigilType) -> + {sigil,SigilType,Qs} + end, + string_head(ContentR)}}. + +%% Strip last line newline, +%% get indentation definition from last line, +%% strip indentation from content lines, +%% check white space on first line after start delimiter, +%% create the string token, +%% done +%% +scan_tqstring_finish(Cs, St, Line, Col, Toks, Tqs) -> + %% IndentR :: Indentation characters, reversed + %% + #tqs{ + line = Line0, col = Col0, + content_r = ContentR, acc = IndentR } = Tqs, + NcontentR = strip_last_line_newline_r(ContentR), + %% + %% NcontentR is now the string's content lines + %% including the characters after the opening quote sequence + %% (the 0:th line?), in reversed order, each line reversed. + %% Newline has been stripped from the last content line, + %% all others has newline characters as from the input. + case + tqstring_finish(lists:reverse(IndentR), NcontentR, Line-1) + of + Content when is_list(Content) -> + #tqs{ str = Str, sigil_type = SigilType } = Tqs, + AnnoStr = ?STR(string, St, Text, lists:reverse(Str)), + Tok = {string,anno(Line0, Col0, St, AnnoStr),Content}, + scan_sigil_suffix( + Cs, St, Line, new_column(Col0, Col), [Tok|Toks], SigilType); + {Tag=indentation, ErrorLine, ErrorCol} -> + scan_error( + Tag, ErrorLine, new_column(Col0, ErrorCol), + Line, new_column(Col0, Col), Cs); + {Tag=white_space, N} -> + scan_error( + Tag, Line0, incr_column(Col0, Tqs#tqs.qs+N), + Line, new_column(Col0, Col), Cs) + end. + +%% Strip newline from the last line, but not if it is the only line +%% +strip_last_line_newline_r(ContentR=[_]) -> + ContentR; +strip_last_line_newline_r([LastLineR|ContentR]) -> + [strip_newline_r(LastLineR)|ContentR]. + +strip_newline_r("\n\r"++Rcs) -> Rcs; +strip_newline_r("\n"++Rcs) -> Rcs. + +%% Loop from last to first line and remember the last error, +%% so the last error that is found will be the one reported, +%% that is: the first in the string. +%% +%% Build the string content one line at the time by first +%% prepending the line to Content and then spripping +%% the defined indentation. +%% +%% For the first (0:th) line, strip the newline and then +%% check that it contains only white space +%% +tqstring_finish(Indent, ContentR, Line) -> + tqstring_finish(Indent, ContentR, Line, undefined, ""). +%% +tqstring_finish(_Indent, [FirstLineR], _Line, Error, Content) -> + NfirstLineR = strip_newline_r(FirstLineR), + FirstLine = lists:reverse(NfirstLineR), + %% First line; check that it is all white space + case check_white_space(FirstLine) of + ok -> + if + Error =:= undefined -> + Content; + true -> + Error + end; + N -> + {white_space, N} + end; +tqstring_finish( + Indent, [StringR|StringsR], Line, Error, Content) -> + case strip_indent(Indent, lists:reverse(StringR, Content)) of + Ncontent when is_list(Ncontent) -> + tqstring_finish(Indent, StringsR, Line-1, Error, Ncontent); + ErrorCol when is_integer(ErrorCol) -> + Nerror = {indentation, Line, ErrorCol}, + tqstring_finish(Indent, StringsR, Line-1, Nerror, "") + end. + +%% Strip the defined indentation from the string content +%% +strip_indent(Indent, Cs) -> + case Cs of + %% Allow empty content lines to have no indentation + "\r\n"++_ -> Cs; + "\n"++_ -> Cs; + "" -> Cs; % The last newline is stripped + _ -> + strip_indent(Indent, Cs, 1) + end. +%% +strip_indent([C|Indent], [C|Cs], Col) -> + strip_indent(Indent, Cs, Col+1); % Strip +strip_indent([], Cs, _) -> Cs; % Done +strip_indent(_, _, Col) -> Col. % Incorrect indentation + +%% Check that all characters are white space and return 'ok', +%% or return the number of white space characters +check_white_space(Cs) -> + check_white_space(Cs, 0). +%% +check_white_space([], _) -> + ok; +check_white_space([C|Cs], N) -> + if + ?WHITE_SPACE(C) -> + check_white_space(Cs, N+1); + true -> + N + end. + + +-record(qstring, + { line, col, sigil_type, q1, q2, str = undefined, wcs = "" }). +%% +%% Quoted string +%% +scan_qstring(Cs, St, Line, Col, Toks, SigilType, Q1, Q2) -> + if + SigilType =:= ?NO_SIGIL; + SigilType =:= ''; % The vanilla (default) sigil + SigilType =:= 'b'; % binary() with escape sequences + SigilType =:= 's' -> % string() with escape sequences + Ncol = incr_column(Col, 1), % Quote character + scan_qstring( + Cs, St, Line, Ncol, Toks, + #qstring{ + line = Line, col = Col, + sigil_type = SigilType, q1 = Q1, q2 = Q2 }); + true -> % Verbatim string + scan_vstring(Cs, St, Line, Col, Toks, SigilType, Q1, Q2) + end. + +scan_qstring( + Cs, #erl_scan{}=St, Line, Col, Toks, + #qstring{ q2 = Q2, str = Str, wcs = Wcs } = Qstring) -> + case scan_string0(Cs, St, Line, Col, Q2, Str, Wcs) of + {ok, {Ncs,Nline,Ncol,Nstr,Nwcs}} -> + #qstring{ + line = Line0, col = Col0, + sigil_type = SigilType, q1 = Q1 } = Qstring, + AnnoStr = + ?STR(string, St, Text, [Q1|lists:reverse(Nstr, [Q2])]), + Anno = anno(Line0, Col0, St, AnnoStr), + Tok = {string,Anno,lists:reverse(Nwcs)}, + scan_sigil_suffix(Ncs, St, Nline, Ncol, [Tok|Toks], SigilType); + {more, {Ncs,Nline,Ncol,Nstr,Nwcs}} -> + Nqstring = Qstring#qstring{ str = Nstr, wcs = Nwcs }, + {more, {Ncs,St,Ncol,Toks,Nline,Nqstring,fun scan_qstring/6}}; + {eof, {Ncs,Nline,Ncol,Nwcs}} -> + #qstring{ + line = Line0, col = Col0, + sigil_type = SigilType, q1 = Q1 } = Qstring, + scan_error( + {unterminated, + if + SigilType =:= ?NO_SIGIL -> + string; + is_atom(SigilType) -> + {sigil,SigilType,Q1,Q2} + end, + string_head([Nwcs])}, + Line0, incr_column(Col0, 1), Nline, Ncol, Ncs); + {{error,_,_}, _Ncs} = Error -> + Error + end. + +scan_sigil_suffix(Cs, St, Line, Col, Toks, ?NO_SIGIL) -> + scan_string_concat(Cs, St, Line, Col, Toks, ""); +scan_sigil_suffix(Cs, St, Line, Col, Toks, SigilType) + when is_atom(SigilType) -> % Sigil string - scan suffix + scan_sigil_suffix(Cs, St, Line, Col, Toks, ""); +%% +scan_sigil_suffix(Cs, St, Line, Col, Toks, Wcs) when is_list(Wcs) -> + case scan_name(Cs, Wcs) of + {more, Nwcs} -> + {more, {[],St,Col,Toks,Line,Nwcs,fun scan_sigil_suffix/6}}; + {Nwcs,Ncs} -> + Type = sigil_suffix, + Ncol = incr_column(Col, length(Nwcs)), + Suffix = lists:reverse(Nwcs), + try list_to_atom(Suffix) of + A when is_atom(A) -> + Anno = anno(Line, Col, St, ?STR(Type, St, Suffix)), + Tok = {Type,Anno,Suffix}, + scan_string_concat( + Ncs, St, Line, Ncol, [Tok|Toks], Suffix) + catch _ : _ -> + scan_error({illegal,Type}, Line, Col, Line, Ncol, Ncs) + end + end. + +scan_string_concat(Cs, St, Line, Col, Toks, "" = SigilSuffix) -> + case Cs of + [$"|_] -> + scan_error(string_concat, Line, Col, Line, Col, Cs); + [] -> + {more, + {Cs,St,Col,Toks,Line,SigilSuffix,fun scan_string_concat/6}}; + _ -> + scan1(Cs, St, Line, Col, Toks) + end; +scan_string_concat(Cs, St, Line, Col, Toks, SigilSuffix) + when is_list(SigilSuffix) -> + scan1(Cs, St, Line, Col, Toks). + + +-record(vstring, + { line, col, sigil_type, q1, q2, wcs = "" }). +%% +%% Verbatim quoted string (not triple-quoted) +%% +scan_vstring(Cs, St, Line, Col, Toks, SigilType, Q1, Q2) -> + Vstring = + #vstring{ + line = Line, col = Col, + sigil_type = SigilType, q1 = Q1, q2 = Q2 }, + scan_vstring(Cs, St, Line, incr_column(Col, 1), Toks, Vstring). + +scan_vstring( + Cs, #erl_scan{}=St, Line, Col, Toks, + #vstring{ q2 = Q2, wcs = Wcs } = Vstring) -> + case scan_vstring(Cs, Q2, Line, Col, Wcs) of + {ok, {Ncs,Nline,Ncol,Nwcs}} -> + #vstring{ + line = Line0, col = Col0, + sigil_type = SigilType, q1 = Q1 } = Vstring, + AnnoStr = + ?STR(string, St, Text, [Q1|lists:reverse(Nwcs, [Q2])]), + Anno = anno(Line0, Col0, St, AnnoStr), + Tok = {string,Anno,lists:reverse(Nwcs)}, + scan_sigil_suffix(Ncs, St, Nline, Ncol, [Tok|Toks], SigilType); + {more, {Ncs,Nline,Ncol,Nwcs}} -> + Nvstring = Vstring#vstring{ wcs = Nwcs }, + {more, {Ncs,St,Ncol,Toks,Nline,Nvstring,fun scan_vstring/6}}; + {eof, {Ncs,Nline,Ncol,Nwcs}} -> + #vstring{ + line = Line0, col = Col0, + sigil_type = SigilType, q1 = Q1 } = Vstring, + scan_error( + {unterminated, + if + SigilType =:= ?NO_SIGIL -> + string; + is_atom(SigilType) -> + {sigil,SigilType,Q1,Q2} + end, + string_head([Nwcs])}, + Line0, incr_column(Col0, 1), Nline, Ncol, Ncs); + {{error,_,_}, _Ncs} = Error -> + Error + end. + +scan_vstring(Cs, Q, Line, no_col, Wcs) -> + scan_vstring(Cs, Q, Line, Wcs); +scan_vstring(Cs, Q, Line, Col, Wcs) -> + case Cs of + [Q|Ncs] -> + {ok, {Ncs,Line,Col+1,Wcs}}; + [$\n=C|Ncs] -> + scan_vstring(Ncs, Q, Line+1, 1, [C|Wcs]); + [C|Ncs] when ?UNICODE(C) -> + scan_vstring(Ncs, Q, Line, Col+1, [C|Wcs]); + [C|Ncs] when ?CHAR(C) -> + scan_error( + {illegal,character}, Line, Col, Line, Col+1, Ncs); + [] -> + {more, {Cs,Line,Col,Wcs}}; + eof -> + {eof, {Cs,Line,Col,Wcs}} + end. +%% +%% Duplicated code that optimizes for Col = no_col, which avoids +%% both testing for no_col before every incrementation, +%% and the incrementation itself +%% +scan_vstring(Cs, Q, Line, Wcs) -> + case Cs of + [Q|Ncs] -> + {ok, {Ncs,Line,no_col,Wcs}}; + [$\n=C|Ncs] -> + scan_vstring(Ncs, Q, Line+1, [C|Wcs]); + [C|Ncs] when ?UNICODE(C) -> + scan_vstring(Ncs, Q, Line, [C|Wcs]); + [C|Ncs] when ?CHAR(C) -> + Col = no_col, + scan_error( + {illegal,character}, Line, Col, Line, Col, Ncs); + [] -> + {more, {Cs,Line,no_col,Wcs}}; + eof -> + {eof, {Cs,Line,no_col,Wcs}} + end. + + +-record(qatom, { line, col, str = undefined, wcs = "" }). + +scan_qatom(Cs, St, Line, Col, Toks) -> + Qatom = #qatom{ line = Line, col = Col }, + scan_qatom(Cs, St, Line, incr_column(Col, 1), Toks, Qatom). + +scan_qatom( + Cs, #erl_scan{}=St, Line, Col, Toks, + #qatom{ str = Str, wcs = Wcs } = Qatom) -> + C = $', %' + case scan_string0(Cs, St, Line, Col, C, Str, Wcs) of + {ok, {Ncs,Nline,Ncol,Nstr,Nwcs}} -> + #qatom{ line = Line0, col = Col0 } = Qatom, + try list_to_atom(lists:reverse(Nwcs)) of + A when is_atom(A) -> + AnnoStr = + ?STR(atom, St, Text, [C|lists:reverse(Nstr, [C])]), + Anno = anno(Line0, Col0, St, AnnoStr), + Tok = {atom,Anno,A}, + scan1(Ncs, St, Nline, Ncol, [Tok|Toks]) + catch _ : _ -> + scan_error( + {illegal,atom}, Line0, Col0, Nline, Ncol, Ncs) + end; + {more, {Ncs,Nline,Ncol,Nstr,Nwcs}} -> + Nqatom = Qatom#qatom{ str = Nstr, wcs = Nwcs }, + {more, {Ncs,St,Ncol,Toks,Nline,Nqatom,fun scan_qatom/6}}; + {eof, {Ncs,Nline,Ncol,Nwcs}} -> + #qatom{ line = Line0, col = Col0 } = Qatom, + scan_error( + {unterminated,atom,string_head([Nwcs])}, + Line0, incr_column(Col0, 1), Nline, Ncol, Ncs); + {{error,_,_}, _Ncs} = Error -> + Error + end. + +string_head(ContentR) -> + string:slice(lists_foldl_reverse(ContentR, ""), 0, 16). + + +scan_string0(Cs, #erl_scan{has_fun=true}, Line, Col, Q, Str, Wcs) + when Str =/= undefined -> + scan_string1(Cs, Line, Col, Q, Str, Wcs); +scan_string0(Cs, #erl_scan{}, Line, Col, Q, _Str, Wcs) -> + if + Col =:= no_col -> + scan_string_no_col(Cs, Line, Q, Wcs); + true -> + scan_string_col(Cs, Line, Col, Q, Wcs) + end. + +%% Optimization. Col =:= no_col, non-escaped characters +scan_string_no_col([Q|Cs], Line, Q, Wcs) -> + {ok, {Cs,Line,no_col,Wcs,Wcs}}; +scan_string_no_col([$\n=C|Cs], Line, Q, Wcs) -> + scan_string_no_col(Cs, Line+1, Q, [C|Wcs]); +scan_string_no_col([C|Cs], Line, Q, Wcs) when C =/= $\\, ?UNICODE(C) -> + scan_string_no_col(Cs, Line, Q, [C|Wcs]); +scan_string_no_col(Cs, Line, Q, Wcs) -> + scan_string1(Cs, Line, no_col, Q, Wcs, Wcs). + +%% Optimization. Col =/= no_col, non-escaped characters +scan_string_col([Q|Cs], Line, Col, Q, Wcs) -> + {ok, {Cs,Line,Col+1,Wcs,Wcs}}; +scan_string_col([$\n=C|Cs], Line, _Col, Q, Wcs) -> + scan_string_col(Cs, Line+1, 1, Q, [C|Wcs]); +scan_string_col([C|Cs], Line, Col, Q, Wcs) when C =/= $\\, ?UNICODE(C) -> + scan_string_col(Cs, Line, Col+1, Q, [C|Wcs]); +scan_string_col(Cs, Line, Col, Q, Wcs) -> + scan_string1(Cs, Line, Col, Q, Wcs, Wcs). + +%% Note: in those cases when a 'char_error' tuple is returned below it +%% is tempting to skip over characters up to the first Q character, +%% but then the end location of the error tuple would not correspond +%% to the start location of the returned Rest string. (Maybe the end +%% location could be modified, but that too is ugly.) +scan_string1([Q|Cs], Line, Col, Q, Str, Wcs) -> + {ok, {Cs,Line,incr_column(Col, 1),Str,Wcs}}; +scan_string1([$\n=C|Cs], Line, Col, Q, Str, Wcs) -> + Ncol = new_column(Col, 1), + scan_string1(Cs, Line+1, Ncol, Q, [C|Str], [C|Wcs]); +scan_string1([$\\|Cs]=Cs0, Line, Col, Q, Str, Wcs) -> + case scan_escape(Cs, Col) of + more -> + {more, {Cs0,Line,Col,Str,Wcs}}; + {error,Ncs,Error,Ncol} -> + scan_error(Error, Line, Col, Line, incr_column(Ncol, 1), Ncs); + {eof=Ncs,Ncol} -> + {eof, {Ncs,Line,incr_column(Ncol, 1),lists:reverse(Wcs)}}; + {nl,Val,ValStr,Ncs,Ncol} -> + Nstr = lists:reverse(ValStr, [$\\|Str]), + Nwcs = [Val|Wcs], + scan_string1(Ncs, Line+1, Ncol, Q, Nstr, Nwcs); + {Val,ValStr,Ncs,Ncol} -> + Nstr = lists:reverse(ValStr, [$\\|Str]), + Nwcs = [Val|Wcs], + scan_string1(Ncs, Line, incr_column(Ncol, 1), Q, Nstr, Nwcs) + end; +scan_string1([C|Cs], Line, no_col=Col, Q, Str, Wcs) when ?UNICODE(C) -> + scan_string1(Cs, Line, Col, Q, [C|Str], [C|Wcs]); +scan_string1([C|Cs], Line, Col, Q, Str, Wcs) when ?UNICODE(C) -> + scan_string1(Cs, Line, Col+1, Q, [C|Str], [C|Wcs]); +scan_string1([C|Cs], Line, Col, _Q, _Str, _Wcs) when ?CHAR(C) -> + scan_error({illegal,character}, Line, Col, Line, incr_column(Col, 1), Cs); +scan_string1([]=Cs, Line, Col, _Q, Str, Wcs) -> + {more, {Cs,Line,Col,Str,Wcs}}; +scan_string1(eof=Cs, Line, Col, _Q, _Str, Wcs) -> + {eof, {Cs,Line,Col,Wcs}}. + +-define(OCT(C), (is_integer(C) andalso $0 =< C andalso C =< $7)). +-define(HEX(C), (is_integer(C) andalso + (C >= $0 andalso C =< $9 orelse + C >= $A andalso C =< $F orelse + C >= $a andalso C =< $f))). + +%% \<1-3> octal digits +scan_escape([O1,O2,O3|Cs], Col) when ?OCT(O1), ?OCT(O2), ?OCT(O3) -> + Val = (O1*8 + O2)*8 + O3 - 73*$0, + {Val,[O1,O2,O3],Cs,incr_column(Col, 3)}; +scan_escape([O1,O2], _Col) when ?OCT(O1), ?OCT(O2) -> + more; +scan_escape([O1,O2|Cs], Col) when ?OCT(O1), ?OCT(O2) -> + Val = (O1*8 + O2) - 9*$0, + {Val,[O1,O2],Cs,incr_column(Col, 2)}; +scan_escape([O1], _Col) when ?OCT(O1) -> + more; +scan_escape([O1|Cs], Col) when ?OCT(O1) -> + {O1 - $0,[O1],Cs,incr_column(Col, 1)}; +%% \x{} +scan_escape([$x,${|Cs], Col) -> + scan_hex(Cs, incr_column(Col, 2), []); +scan_escape([$x], _Col) -> + more; +scan_escape([$x|eof], Col) -> + {eof,incr_column(Col, 1)}; +%% \x<2> hexadecimal digits +scan_escape([$x,H1,H2|Cs], Col) when ?HEX(H1), ?HEX(H2) -> + Val = erlang:list_to_integer([H1,H2], 16), + {Val,[$x,H1,H2],Cs,incr_column(Col, 3)}; +scan_escape([$x,H1], _Col) when ?HEX(H1) -> + more; +scan_escape([$x|Cs], Col) -> + {error,Cs,{illegal,character},incr_column(Col, 1)}; +%% \^X -> Ctrl+X +scan_escape([$^=C0,C|Cs], Col) when ?CHAR(C) -> + case caret_char_code(C) of + error -> + {error,[C|Cs],{illegal,character},incr_column(Col, 1)}; + Code -> + {Code,[C0,C],Cs,incr_column(Col, 2)} + end; +scan_escape([$^], _Col) -> + more; +scan_escape([$^|eof], Col) -> + {eof,incr_column(Col, 1)}; +scan_escape([$\n=C|Cs], Col) -> + {nl,C,[C],Cs,new_column(Col, 1)}; +scan_escape([C0|Cs], Col) when ?UNICODE(C0) -> + C = escape_char(C0), + {C,[C0],Cs,incr_column(Col, 1)}; +scan_escape([C|Cs], Col) when ?CHAR(C) -> + {error,Cs,{illegal,character},incr_column(Col, 1)}; +scan_escape([], _Col) -> + more; +scan_escape(eof, Col) -> + {eof,Col}. + +scan_hex([C|Cs], Col, Wcs) when ?HEX(C) -> + scan_hex(Cs, incr_column(Col, 1), [C|Wcs]); +scan_hex(Cs, Col, Wcs) -> + scan_hex_end(Cs, Col, Wcs, "x{"). + +scan_hex_end([$}|Cs], Col, [], _Str) -> + %% Empty escape sequence. + {error,Cs,{illegal,character},incr_column(Col, 1)}; +scan_hex_end([$}|Cs], Col, Wcs0, Str0) -> + Wcs = lists:reverse(Wcs0), + try list_to_integer(Wcs, 16) of + Val when ?UNICODE(Val) -> + {Val,Str0++Wcs++[$}],Cs,incr_column(Col, 1)}; + _Val -> + {error,Cs,{illegal,character},incr_column(Col, 1)} + catch + error:system_limit -> + %% Extremely unlikely to occur in practice. + {error,Cs,{illegal,character},incr_column(Col, 1)} + end; +scan_hex_end([], _Col, _Wcs, _Str0) -> + more; +scan_hex_end(eof, Col, _Wcs, _Str0) -> + {eof,Col}; +scan_hex_end(Cs, Col, _Wcs, _Str0) -> + {error,Cs,{illegal,character},Col}. + +escape_char($n) -> $\n; % \n = LF +escape_char($r) -> $\r; % \r = CR +escape_char($t) -> $\t; % \t = TAB +escape_char($v) -> $\v; % \v = VT +escape_char($b) -> $\b; % \b = BS +escape_char($f) -> $\f; % \f = FF +escape_char($e) -> $\e; % \e = ESC +escape_char($s) -> $\s; % \s = SPC +escape_char($d) -> $\d; % \d = DEL +escape_char(C) -> C. + +caret_char_code($?) -> 16#7f; +caret_char_code(C) when $@ =< C, C =< $_; $a =< C, C =< $z -> C band 16#1f; +caret_char_code(_) -> error. + +scan_number(Cs, #erl_scan{}=St, Line, Col, Toks, {Ncs, Us}) -> + scan_number(Cs, St, Line, Col, Toks, Ncs, Us). + +scan_number([C|Cs], St, Line, Col, Toks, Ncs, Us) when ?DIGIT(C) -> + scan_number(Cs, St, Line, Col, Toks, [C|Ncs], Us); +scan_number([$_,Next|Cs], St, Line, Col, Toks, [Prev|_]=Ncs, _Us) when + ?DIGIT(Next) andalso ?DIGIT(Prev) -> + scan_number(Cs, St, Line, Col, Toks, [Next,$_|Ncs], with_underscore); +scan_number([$_]=Cs, St, Line, Col, Toks, Ncs, Us) -> + {more,{Cs,St,Col,Toks,Line,{Ncs,Us},fun scan_number/6}}; +scan_number([$.,C|Cs], St, Line, Col, Toks, Ncs, Us) when ?DIGIT(C) -> + scan_fraction(Cs, St, Line, Col, Toks, [C,$.|Ncs], Us); +scan_number([$.]=Cs, St, Line, Col, Toks, Ncs, Us) -> + {more,{Cs,St,Col,Toks,Line,{Ncs,Us},fun scan_number/6}}; +scan_number([$#|Cs]=Cs0, St, Line, Col, Toks, Ncs0, Us) -> + Ncs = lists:reverse(Ncs0), + try list_to_integer(remove_digit_separators(Ncs, Us)) of + B when is_integer(B), 2 =< B, B =< 1+$Z-$A+10 -> + Bcs = Ncs++[$#], + scan_based_int(Cs, St, Line, Col, Toks, B, [], Bcs, no_underscore); + B when is_integer(B) -> + Len = length(Ncs), + scan_error({base,B}, Line, Col, Line, incr_column(Col, Len), Cs0) + catch + error:system_limit -> + %% Extremely unlikely to occur in practice. + scan_error({illegal,base}, Line, Col, Line, Col, Cs0) + end; +scan_number([]=Cs, St, Line, Col, Toks, Ncs, Us) -> + {more,{Cs,St,Col,Toks,Line,{Ncs,Us},fun scan_number/6}}; +scan_number(Cs, St, Line, Col, Toks, Ncs0, Us) -> + Ncs = lists:reverse(Ncs0), + try list_to_integer(remove_digit_separators(Ncs, Us), 10) of + N -> + tok3(Cs, St, Line, Col, Toks, integer, Ncs, N) + catch + error:system_limit -> + %% Extremely unlikely to occur in practice. + Ncol = incr_column(Col, length(Ncs)), + scan_error({illegal,integer}, Line, Col, Line, Ncol, Cs) + end. + +remove_digit_separators(Number, no_underscore) -> + Number; +remove_digit_separators(Number, with_underscore) -> + [C || C <- Number, C =/= $_]. + +-define(BASED_DIGIT(C, B), + (is_integer(C) + andalso + ((?DIGIT(C) andalso C < $0 + B) + orelse (C >= $A andalso B > 10 andalso C < $A + B - 10) + orelse (C >= $a andalso B > 10 andalso C < $a + B - 10)))). + +scan_based_int(Cs, #erl_scan{}=St, Line, Col, Toks, {B,NCs,BCs,Us}) + when is_integer(B), 2 =< B, B =< 1+$Z-$A+10 -> + scan_based_int(Cs, St, Line, Col, Toks, B, NCs, BCs, Us). + +scan_based_int([C|Cs], St, Line, Col, Toks, B, Ncs, Bcs, Us) when + ?BASED_DIGIT(C, B) -> + scan_based_int(Cs, St, Line, Col, Toks, B, [C|Ncs], Bcs, Us); +scan_based_int([$_,Next|Cs], St, Line, Col, Toks, B, [Prev|_]=Ncs, Bcs, _Us) + when ?BASED_DIGIT(Next, B) andalso ?BASED_DIGIT(Prev, B) -> + scan_based_int(Cs, St, Line, Col, Toks, B, [Next,$_|Ncs], Bcs, + with_underscore); +scan_based_int([$_]=Cs, St, Line, Col, Toks, B, NCs, BCs, Us) -> + {more,{Cs,St,Col,Toks,Line,{B,NCs,BCs,Us},fun scan_based_int/6}}; +scan_based_int([]=Cs, St, Line, Col, Toks, B, NCs, BCs, Us) -> + {more,{Cs,St,Col,Toks,Line,{B,NCs,BCs,Us},fun scan_based_int/6}}; +scan_based_int(Cs, _St, Line, Col, _Toks, _B, [], Bcs, _Us) -> + %% No actual digits following the base. + Len = length(Bcs), + Ncol = incr_column(Col, Len), + scan_error({illegal,integer}, Line, Col, Line, Ncol, Cs); +scan_based_int(Cs, St, Line, Col, Toks, B, Ncs0, [_|_]=Bcs, Us) -> + Ncs = lists:reverse(Ncs0), + try list_to_integer(remove_digit_separators(Ncs, Us), B) of + N -> + tok3(Cs, St, Line, Col, Toks, integer, Bcs++Ncs, N) + catch + error:system_limit -> + %% Extremely unlikely to occur in practice. + Len = length(Bcs)+length(Ncs), + Ncol = incr_column(Col, Len), + scan_error({illegal,integer}, Line, Col, Line, Ncol, Cs) + end. + +scan_fraction(Cs, #erl_scan{}=St, Line, Col, Toks, {Ncs,Us}) -> + scan_fraction(Cs, St, Line, Col, Toks, Ncs, Us). + +scan_fraction([C|Cs], St, Line, Col, Toks, Ncs, Us) when ?DIGIT(C) -> + scan_fraction(Cs, St, Line, Col, Toks, [C|Ncs], Us); +scan_fraction([$_,Next|Cs], St, Line, Col, Toks, [Prev|_]=Ncs, _Us) when + ?DIGIT(Next) andalso ?DIGIT(Prev) -> + scan_fraction(Cs, St, Line, Col, Toks, [Next,$_|Ncs], with_underscore); +scan_fraction([$_]=Cs, St, Line, Col, Toks, Ncs, Us) -> + {more,{Cs,St,Col,Toks,Line,{Ncs,Us},fun scan_fraction/6}}; +scan_fraction([E|Cs], St, Line, Col, Toks, Ncs, Us) when E =:= $e; E =:= $E -> + scan_exponent_sign(Cs, St, Line, Col, Toks, [E|Ncs], Us); +scan_fraction([]=Cs, St, Line, Col, Toks, Ncs, Us) -> + {more,{Cs,St,Col,Toks,Line,{Ncs,Us},fun scan_fraction/6}}; +scan_fraction(Cs, St, Line, Col, Toks, Ncs, Us) -> + float_end(Cs, St, Line, Col, Toks, Ncs, Us). + +scan_exponent_sign(Cs, #erl_scan{}=St, Line, Col, Toks, {Ncs, Us}) -> + scan_exponent_sign(Cs, St, Line, Col, Toks, Ncs, Us). + +scan_exponent_sign([C|Cs], St, Line, Col, Toks, Ncs, Us) when + C =:= $+; C =:= $- -> + scan_exponent(Cs, St, Line, Col, Toks, [C|Ncs], Us); +scan_exponent_sign([]=Cs, St, Line, Col, Toks, Ncs, Us) -> + {more,{Cs,St,Col,Toks,Line,{Ncs,Us},fun scan_exponent_sign/6}}; +scan_exponent_sign(Cs, St, Line, Col, Toks, Ncs, Us) -> + scan_exponent(Cs, St, Line, Col, Toks, Ncs, Us). + +scan_exponent(Cs, #erl_scan{}=St, Line, Col, Toks, {Ncs, Us}) -> + scan_exponent(Cs, St, Line, Col, Toks, Ncs, Us). + +scan_exponent([C|Cs], St, Line, Col, Toks, Ncs, Us) when ?DIGIT(C) -> + scan_exponent(Cs, St, Line, Col, Toks, [C|Ncs], Us); +scan_exponent([$_,Next|Cs], St, Line, Col, Toks, [Prev|_]=Ncs, _) when + ?DIGIT(Next) andalso ?DIGIT(Prev) -> + scan_exponent(Cs, St, Line, Col, Toks, [Next,$_|Ncs], with_underscore); +scan_exponent([$_]=Cs, St, Line, Col, Toks, Ncs, Us) -> + {more,{Cs,St,Col,Toks,Line,{Ncs,Us},fun scan_exponent/6}}; +scan_exponent([]=Cs, St, Line, Col, Toks, Ncs, Us) -> + {more,{Cs,St,Col,Toks,Line,{Ncs,Us},fun scan_exponent/6}}; +scan_exponent(Cs, St, Line, Col, Toks, Ncs, Us) -> + float_end(Cs, St, Line, Col, Toks, Ncs, Us). + +float_end(Cs, St, Line, Col, Toks, Ncs0, Us) -> + Ncs = lists:reverse(Ncs0), + try list_to_float(remove_digit_separators(Ncs, Us)) of + F -> + tok3(Cs, St, Line, Col, Toks, float, Ncs, F) + catch + _:_ -> + Ncol = incr_column(Col, length(Ncs)), + scan_error({illegal,float}, Line, Col, Line, Ncol, Cs) + end. + +skip_comment_fun(Cs, #erl_scan{}=St, Line, Col, Toks, N) -> + skip_comment(Cs, St, Line, Col, Toks, N). + +skip_comment([C|Cs], St, Line, Col, Toks, N) when C =/= $\n, ?CHAR(C) -> + case ?UNICODE(C) of + true -> + skip_comment(Cs, St, Line, Col, Toks, N+1); + false -> + Ncol = incr_column(Col, N+1), + scan_error({illegal,character}, Line, Col, Line, Ncol, Cs) + end; +skip_comment([]=Cs, St, Line, Col, Toks, N) -> + {more,{Cs,St,Col,Toks,Line,N,fun skip_comment_fun/6}}; +skip_comment(Cs, St, Line, Col, Toks, N) -> + scan1(Cs, St, Line, incr_column(Col, N), Toks). + +scan_comment_fun(Cs, #erl_scan{}=St, Line, Col, Toks, Ncs) -> + scan_comment(Cs, St, Line, Col, Toks, Ncs). + +scan_comment([C|Cs], St, Line, Col, Toks, Ncs) + when C =/= $\n, ?CHAR(C) -> + case ?UNICODE(C) of + true -> + scan_comment(Cs, St, Line, Col, Toks, [C|Ncs]); + false -> + Ncol = incr_column(Col, length(Ncs)+1), + scan_error({illegal,character}, Line, Col, Line, Ncol, Cs) + end; +scan_comment([]=Cs, St, Line, Col, Toks, Ncs) -> + {more,{Cs,St,Col,Toks,Line,Ncs,fun scan_comment_fun/6}}; +scan_comment(Cs, St, Line, Col, Toks, Ncs0) -> + Ncs = lists:reverse(Ncs0), + tok3(Cs, St, Line, Col, Toks, comment, Ncs, Ncs). + +scan_check("%%ssa%" ++ Cs, St, Line, Col, Toks, _Ncs) -> + scan_check1(Cs, St, Line, Toks, Col, 7); +scan_check("%%ssa"=Cs, St, Line, Col, Toks, Ncs) -> + {more,{Cs,St,Col,Toks,Line,Ncs,fun scan_check/6}}; +scan_check("%%ss"=Cs, St, Line, Col, Toks, Ncs) -> + {more,{Cs,St,Col,Toks,Line,Ncs,fun scan_check/6}}; +scan_check("%%s"=Cs, St, Line, Col, Toks, Ncs) -> + {more,{Cs,St,Col,Toks,Line,Ncs,fun scan_check/6}}; +scan_check("%%"=Cs, St, Line, Col, Toks, Ncs) -> + {more,{Cs,St,Col,Toks,Line,Ncs,fun scan_check/6}}; +scan_check("%"=Cs, St, Line, Col, Toks, Ncs) -> + {more,{Cs,St,Col,Toks,Line,Ncs,fun scan_check/6}}; +scan_check("%ssa%" ++ Cs, St, Line, Col, Toks, _Ncs) -> + scan_check1(Cs, St, Line, Toks, Col, 6); +scan_check("%ssa"=Cs, St, Line, Col, Toks, Ncs) -> + {more,{Cs,St,Col,Toks,Line,Ncs,fun scan_check/6}}; +scan_check("%ss"=Cs, St, Line, Col, Toks, Ncs) -> + {more,{Cs,St,Col,Toks,Line,Ncs,fun scan_check/6}}; +scan_check("%s"=Cs, St, Line, Col, Toks, Ncs) -> + {more,{Cs,St,Col,Toks,Line,Ncs,fun scan_check/6}}; +scan_check("ssa%" ++ Cs, St, Line, Col, Toks, _Ncs) -> + scan_check1(Cs, St, Line, Toks, Col, 5); +scan_check("ssa"=Cs, St, Line, Col, Toks, Ncs) -> + {more,{Cs,St,Col,Toks,Line,Ncs,fun scan_check/6}}; +scan_check("ss"=Cs, St, Line, Col, Toks, Ncs) -> + {more,{Cs,St,Col,Toks,Line,Ncs,fun scan_check/6}}; +scan_check("s"=Cs, St, Line, Col, Toks, Ncs) -> + {more,{Cs,St,Col,Toks,Line,Ncs,fun scan_check/6}}; +scan_check([]=Cs, St, Line, Col, Toks, Ncs) -> + {more,{Cs,St,Col,Toks,Line,Ncs,fun scan_check/6}}; +scan_check(Cs, St=#erl_scan{comment=true}, Line, Col, Toks, Ncs) -> + scan_comment(Cs, St, Line, Col, Toks, Ncs); +scan_check(Cs, St, Line, Col, Toks, _Ncs) -> + skip_comment(Cs, St, Line, Col, Toks, 1). + +scan_check1(Cs, St=#erl_scan{in_check=true}, Line, Toks, Col, NoofCols) -> + %% Skip as we are already in the check mode + scan1(Cs, St, Line, incr_column(Col, NoofCols), Toks); +scan_check1(Cs, St, Line, Toks, Col, NoofCols) -> + tok2(Cs, St#erl_scan{in_check=true}, Line, + Col, Toks, "%ssa%", '%ssa%', NoofCols). + +tok2(Cs, #erl_scan{has_fun = false}=St, Line, no_col=Col, Toks, _Wcs, P) -> + scan1(Cs, St, Line, Col, [{P,anno(Line)}|Toks]); +tok2(Cs, #erl_scan{}=St, Line, Col, Toks, Wcs, P) -> + Anno = anno(Line, Col, St, ?STR(P, St, Wcs)), + scan1(Cs, St, Line, incr_column(Col, length(Wcs)), [{P,Anno}|Toks]). + +tok2(Cs, #erl_scan{has_fun = false}=St, Line, no_col=Col, Toks, _Wcs, P, _N) -> + scan1(Cs, St, Line, Col, [{P,anno(Line)}|Toks]); +tok2(Cs, #erl_scan{}=St, Line, Col, Toks, Wcs, P, N) -> + Anno = anno(Line, Col, St, ?STR(P,St,Wcs)), + scan1(Cs, St, Line, incr_column(Col, N), [{P,Anno}|Toks]). + +tok3(Cs, #erl_scan{has_fun = false}=St, Line, no_col=Col, Toks, Item, _S, Sym) -> + scan1(Cs, St, Line, Col, [{Item,anno(Line),Sym}|Toks]); +tok3(Cs, #erl_scan{}=St, Line, Col, Toks, Item, String, Sym) -> + Token = {Item,anno(Line, Col, St, ?STR(Item, St, String)),Sym}, + scan1(Cs, St, Line, incr_column(Col, length(String)), [Token|Toks]). + +tok3(Cs, #erl_scan{has_fun = false}=St, Line, no_col=Col, Toks, Item, + _String, Sym, _Length) -> + scan1(Cs, St, Line, Col, [{Item,anno(Line),Sym}|Toks]); +tok3(Cs, #erl_scan{}=St, Line, Col, Toks, Item, String, Sym, Length) -> + Token = {Item,anno(Line, Col, St, ?STR(Item, St, String)),Sym}, + scan1(Cs, St, Line, incr_column(Col, Length), [Token|Toks]). + +scan_error(Error, Line, Col, EndLine, EndCol, Rest) -> + Loc = location(Line, Col), + EndLoc = location(EndLine, EndCol), + scan_error(Error, Loc, EndLoc, Rest). + +scan_error(Error, ErrorLoc, EndLoc, Rest) -> + {{error,{ErrorLoc,?MODULE,Error},EndLoc},Rest}. + +-compile({inline,[anno/4]}). + +anno(Line, no_col, #erl_scan{has_fun = false}, _String) -> + anno(Line); +anno(Line, no_col, #erl_scan{has_fun = true}, []) -> + anno(Line); +anno(Line, no_col, #erl_scan{has_fun = true}, String) -> + Anno = anno(Line), + erl_anno:set_text(String, Anno); +anno(Line, Col, #erl_scan{has_fun = false}, _String) -> + anno({Line, Col}); +anno(Line, Col, #erl_scan{has_fun = true}, []) -> + anno({Line, Col}); +anno(Line, Col, #erl_scan{has_fun = true}, String) -> + Anno = anno({Line, Col}), + erl_anno:set_text(String, Anno). + +location(Line, no_col) -> + Line; +location(Line, Col) when is_integer(Col) -> + {Line,Col}. + +-compile({inline,[anno/1,incr_column/2,new_column/2,int_column/1]}). + +anno(Location) -> + erl_anno:new(Location). + +incr_column(Col, N) when is_integer(N) -> + if + Col =:= no_col -> Col; + is_integer(Col) -> Col + N + end. + +new_column(no_col, no_col) -> no_col; +new_column(Col, Ncol) when is_integer(Ncol) -> + if + Col =:= no_col -> Col; + is_integer(Col) -> Ncol + end. + +%% Ensure an integer column for calculations +int_column(no_col) -> + 1; +int_column(Col) when is_integer(Col) -> + Col. + + +%% lists:duplicate/3 (not exported) +lists_duplicate(0, _, L) -> L; +lists_duplicate(N, X, L) -> lists_duplicate(N-1, X, [X|L]). + +%% lists:foldl/3 over lists:reverse/2 +lists_foldl_reverse(Lists, Acc) -> + lists:foldl(fun lists:reverse/2, Acc, Lists). + +nl_spcs(2) -> "\n "; +nl_spcs(3) -> "\n "; +nl_spcs(4) -> "\n "; +nl_spcs(5) -> "\n "; +nl_spcs(6) -> "\n "; +nl_spcs(7) -> "\n "; +nl_spcs(8) -> "\n "; +nl_spcs(9) -> "\n "; +nl_spcs(10) -> "\n "; +nl_spcs(11) -> "\n "; +nl_spcs(12) -> "\n "; +nl_spcs(13) -> "\n "; +nl_spcs(14) -> "\n "; +nl_spcs(15) -> "\n "; +nl_spcs(16) -> "\n "; +nl_spcs(17) -> "\n ". + +spcs(1) -> " "; +spcs(2) -> " "; +spcs(3) -> " "; +spcs(4) -> " "; +spcs(5) -> " "; +spcs(6) -> " "; +spcs(7) -> " "; +spcs(8) -> " "; +spcs(9) -> " "; +spcs(10) -> " "; +spcs(11) -> " "; +spcs(12) -> " "; +spcs(13) -> " "; +spcs(14) -> " "; +spcs(15) -> " "; +spcs(16) -> " ". + +nl_tabs(2) -> "\n\t"; +nl_tabs(3) -> "\n\t\t"; +nl_tabs(4) -> "\n\t\t\t"; +nl_tabs(5) -> "\n\t\t\t\t"; +nl_tabs(6) -> "\n\t\t\t\t\t"; +nl_tabs(7) -> "\n\t\t\t\t\t\t"; +nl_tabs(8) -> "\n\t\t\t\t\t\t\t"; +nl_tabs(9) -> "\n\t\t\t\t\t\t\t\t"; +nl_tabs(10) -> "\n\t\t\t\t\t\t\t\t\t"; +nl_tabs(11) -> "\n\t\t\t\t\t\t\t\t\t\t". + +tabs(1) -> "\t"; +tabs(2) -> "\t\t"; +tabs(3) -> "\t\t\t"; +tabs(4) -> "\t\t\t\t"; +tabs(5) -> "\t\t\t\t\t"; +tabs(6) -> "\t\t\t\t\t\t"; +tabs(7) -> "\t\t\t\t\t\t\t"; +tabs(8) -> "\t\t\t\t\t\t\t\t"; +tabs(9) -> "\t\t\t\t\t\t\t\t\t"; +tabs(10) -> "\t\t\t\t\t\t\t\t\t\t". + +%% Dynamic version of reserved_word that knows about the possibility +%% that enabled features might change the set of reserved words. +-doc "Returns `true` if `Atom` is an Erlang reserved word, otherwise `false`.". +-spec reserved_word(Atom :: atom()) -> boolean(). +reserved_word(Atom) -> + case f_reserved_word(Atom) of + true -> true; + false -> + lists:member(Atom, erl_features:keywords()) + end. + +%% Static version of reserved_words. These represent the fixed set of +%% reserved words. +-doc false. +f_reserved_word('after') -> true; +f_reserved_word('begin') -> true; +f_reserved_word('case') -> true; +f_reserved_word('try') -> true; +f_reserved_word('cond') -> true; +f_reserved_word('catch') -> true; +f_reserved_word('andalso') -> true; +f_reserved_word('orelse') -> true; +f_reserved_word('end') -> true; +f_reserved_word('fun') -> true; +f_reserved_word('if') -> true; +f_reserved_word('let') -> true; +f_reserved_word('of') -> true; +f_reserved_word('receive') -> true; +f_reserved_word('when') -> true; +f_reserved_word('bnot') -> true; +f_reserved_word('not') -> true; +f_reserved_word('div') -> true; +f_reserved_word('rem') -> true; +f_reserved_word('band') -> true; +f_reserved_word('and') -> true; +f_reserved_word('bor') -> true; +f_reserved_word('bxor') -> true; +f_reserved_word('bsl') -> true; +f_reserved_word('bsr') -> true; +f_reserved_word('or') -> true; +f_reserved_word('xor') -> true; +f_reserved_word(_) -> false. diff --git a/tests/erlang_tests/CMakeLists.txt b/tests/erlang_tests/CMakeLists.txt index 07062c4f3..24cb51203 100644 --- a/tests/erlang_tests/CMakeLists.txt +++ b/tests/erlang_tests/CMakeLists.txt @@ -592,6 +592,7 @@ add_custom_target(erlang_test_modules DEPENDS test_tuple_size.beam test_size.beam test_element.beam + test_erl_scan.beam test_setelement.beam test_insert_element.beam test_delete_element.beam diff --git a/tests/libs/estdlib/CMakeLists.txt b/tests/libs/estdlib/CMakeLists.txt index ebc4754f4..ba649eff9 100644 --- a/tests/libs/estdlib/CMakeLists.txt +++ b/tests/libs/estdlib/CMakeLists.txt @@ -26,6 +26,7 @@ set(ERLANG_MODULES test_apply test_binary test_calendar + test_erl_scan test_gen_event test_gen_server test_gen_statem diff --git a/tests/libs/estdlib/test_erl_scan.erl b/tests/libs/estdlib/test_erl_scan.erl new file mode 100644 index 000000000..0deb352b6 --- /dev/null +++ b/tests/libs/estdlib/test_erl_scan.erl @@ -0,0 +1,84 @@ +% +% This file is part of AtomVM. +% +% Copyright 2024 Jakub Gonet +% +% Licensed under the Apache License, Version 2.0 (the "License"); +% you may not use this file except in compliance with the License. +% You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +% See the License for the specific language governing permissions and +% limitations under the License. +% +% SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later +% +-module(test_erl_scan). + +-export([test/0]). + +test() -> + {Fn, ExpectedFn} = + {"fun(X) -> X + 1 end.", [ + {'fun', 1}, + {'(', 1}, + {var, 1, 'X'}, + {')', 1}, + {'->', 1}, + {var, 1, 'X'}, + {'+', 1}, + {integer, 1, 1}, + {'end', 1}, + {dot, 1} + ]}, + + {Comment, ExpectedComment} = {"% A comment", []}, + {Assignment, ExpectedAssignment} = + {"A = 5,\nB = 10.", [ + {var, 1, 'A'}, + {'=', 1}, + {integer, 1, 5}, + {',', 1}, + {var, 2, 'B'}, + {'=', 2}, + {integer, 2, 10}, + {dot, 2} + ]}, + {Maybe, ExpectedMaybe} = { + "maybe\n" + " {ok, A} ?= a(),\n" + " true = A >= 0,\n" + " A\n" + "end", + [ + {'maybe', 1}, + {'{', 2}, + {atom, 2, ok}, + {',', 2}, + {var, 2, 'A'}, + {'}', 2}, + {'?=', 2}, + {atom, 2, a}, + {'(', 2}, + {')', 2}, + {',', 2}, + {atom, 3, true}, + {'=', 3}, + {var, 3, 'A'}, + {'>=', 3}, + {integer, 3, 0}, + {',', 3}, + {var, 4, 'A'}, + {'end', 5} + ] + }, + + {ok, ExpectedFn, _} = erl_scan:string(Fn), + {ok, ExpectedComment, _} = erl_scan:string(Comment), + {ok, ExpectedAssignment, _} = erl_scan:string(Assignment), + {ok, ExpectedMaybe, _} = erl_scan:string(Maybe), + ok. diff --git a/tests/libs/estdlib/tests.erl b/tests/libs/estdlib/tests.erl index 21de828b8..8abe4d21a 100644 --- a/tests/libs/estdlib/tests.erl +++ b/tests/libs/estdlib/tests.erl @@ -47,6 +47,7 @@ get_tests(_OTPVersion) -> test_apply, test_lists, test_calendar, + test_erl_scan, test_gen_event, test_gen_server, test_gen_statem,