with Ada.Text_IO;

with Ada.Finalization;
with Ada.Unchecked_Deallocation;

package Lexer is

  --====================================================================
  -- Author    Christoph Grein
  -- Version   3.3
  -- Date      3 October 1998
  --====================================================================
  -- A LL(1) grammar parser for a stream of characters representing an
  -- Ada or Java program.
  -- Attention: Not much effort has been invested to analyse the stream
  --            for legality.
  -- Tokens are correctly recognized as long as the input string is
  -- a legal program. Otherwise some of the bad tokens might not be
  -- returned as such.
  --
  -- The lexer recognizes Ada or Java files upon initialisation. If it
  -- cannot identify the language, the Unidentifyable_Language exception
  -- is raised. Token names are appended with _A, _J, or _AJ depending
  -- on the language they are used for.
  --
  -- Note that reserved words and identifiers are separate lexical items
  -- for both, Ada (ARM 2.2(1)) and Java (JRM 3.8).
  -- However the Ada-reserved words Access, Delta, Digits, Range are
  -- also used as attribute designators (ARM 4.1.4(3,5)). The lexer
  -- always returns these items as reserved word tokens, because it is
  -- not considered the lexer's chore to do an analysis depending on the
  -- sequence of tokens:
  -- Attribute designators always follow the tick (ARM 4.1.4(2)).
  --
  -- An (Ada) operator occurring as an operator symbol (ARM 6.1(9), e.g.
  -- "=" (A, B)) is always returned as a string literal ("="), never as
  -- the corresponding token (Equal_A).
  -- Such a string literal can easily be identified as an operator in
  -- the following cases:
  -- - In a selected component (ARM 4.1.3(2)), it is preceded by a dot.
  -- - In a subprogram specification (ARM 6.1(4)) or a function call
  --   (ARM 6.4(3), it is followed by an opening parenthesis.
  -- - In a formal_subprogram_declaration as a subprogram default
  --   (ARM 12.6(2)), it is preceded by the reserved word is.
  -- Only the occurence in a generic instantiation as an explicit
  -- generic actual parameter (ARM 12.3(5)) necessitates a compilation-
  -- like analysis to discriminate between a simple string and an
  -- operator symbol, which is of course out of the capabilities of
  -- lexical parsing.
  --
  -- Get_Token reads the next token from the input string. End_Error is
  -- raised when the end of input has been reached. For each token, the
  -- start and end position in the input stream is stored. Furthermore,
  -- while the input is processed, lines and columns are counted.
  --
  -- Reset resets the parser to the given token's position in the input
  -- stream. This token and the ones following will be returned on the
  -- next calls to Get_Token. Reset_Error is raised if called with a
  -- token not from the input string.
  --====================================================================
  -- History
  -- Author Version   Date    Reason for change
  --  C.G.    0.0  25.05.1998 PDL
  --  C.G.    0.1  31.05.1998 Added bad token
  --  C.G.    1.0  17.06.1998 Final design (added reason of bad token)
  --  C.G.    2.0  01.07.1998 Make Token controlled to prevent storage
  --                          leak
  --  C.G.    2.1  06.07.1998 Size of Tab
  --  C.G.    3.0  28.07.1998 Also lex Java
  --  C.G.    3.1  05.08.1998 Added Documentation_Tag
  --  C.G.    3.2  22.09.1998 Replace
  --                            Ampersand_AJ    by Concatenate_A, And_J
  --                            Vertical_Bar_AJ by Alternative_A, Or_J
  --  C.G.    3.3  03.10.1998 Added functions Image and Tag_Pos for
  --                          documentation tags, Is_Operator
  --====================================================================

  type Token_Name is
    (-- Ada  reserved words ARM 2.9 (2)
     -- Java keywords JRM 3.9
     Abort_A, Abs_A, Abstract_AJ, Accept_A, Access_A, Aliased_A, All_A, And_A, Array_A, At_A,
     Begin_A, Body_A, Boolean_J, Break_J, Byte_J,
     Case_AJ, Catch_J, Char_J, Class_J, Const_J, Constant_A, Continue_J,
     Declare_A, Default_J, Delay_A, Delta_A, Digits_A, Do_AJ, Double_J,
     Else_AJ, Elsif_A, End_A, Entry_A, Exception_A, Exit_A, Extends_J,
     Final_J, Finally_J, Float_J, For_AJ, Function_A,
     Generic_A, Goto_AJ,
     If_AJ, Implements_J, Import_J, In_A, InstanceOf_J, Int_J, Interface_J, Is_A,
     Limited_A, Long_J, Loop_A,
     Mod_A,
     Native_J, New_AJ, Not_A, Null_A,
     Of_A, Or_A, Others_A, Out_A,
     Package_AJ, Pragma_A, Private_AJ, Procedure_A, Protected_AJ, Public_J,
     Raise_A, Range_A, Record_A, Rem_A, Renames_A, Requeue_A, Return_AJ, Reverse_A,
     Select_A, Separate_A, Short_J, Static_J, Subtype_A, Super_J, Switch_J, Synchronized_J,
     Tagged_A, Task_A, Terminate_A, Then_A, This_J, Throw_J, Throws_J, Transient_J, Try_J, Type_A,
     Until_A, Use_A,
     Void_J, Volatile_J,
     When_A, While_AJ, With_A,
     Xor_A,
     -- Ada delimiters ARM 2.2 (9)
     -- & ' ( ) * + , - . / : ; < = > |
     -- Ada compound delimiters ARM 2.2 (14)
     -- => .. ** := /= >= <= << >> <>
     -- Java separators JRM 3.11
     -- ( ) { } [ ] ; , .
     -- Java operators JRM 3.12
     -- =  >  <  !  ~  ?  :
     -- == <= >= != && || ++ --
     -- +  -  *  /  &  |  ^  %  <<  >>  >>>
     -- += -= *= /= &= |= ^= %= <<= >>= >>>=
     Colon_AJ, Comma_AJ, Dot_AJ, Semicolon_AJ, Tick_A,     -- : , . ; '
     LeftBrace_J, RightBrace_J,                            -- { }
     LeftBracket_J, RightBracket_J,                        -- [ ]
     Left_Parenthesis_AJ, Right_Parenthesis_AJ,            -- ( )
     Concatenate_A, And_J,                                 -- &
     Alternative_A, Or_J,                                  -- |
     Assignment_J, Conditional_J,                          -- = ?
     Equal_A, Greater_Than_AJ, Less_Than_AJ,               -- = > <
     Complement_J, Not_J, Xor_J,                           -- ~ ! ^
     Plus_AJ, Minus_AJ, Times_AJ, Divide_AJ, Remainder_J,  -- + - * / %
     Arrow_A, Assignment_A, Double_Dot_A, Exponentiate_A,  -- => := .. **
     Equal_J, Not_Equal_A, NotEqual_J,                     -- == /= !=
     Greater_Equal_AJ, Less_Equal_AJ,                      -- >= <=
     Left_Label_Bracket_A, Right_Label_Bracket_A, Box_A,   -- << >> <>
     Increment_J, Decrement_J,                             -- ++ --
     LeftShift_J, RightShift_J, UnsignedRightShift_J,      -- << >> >>>
     ShortCutAnd_J, ShortCutOr_J,                          -- && ||
     PlusAssign_J, MinusAssign_J,                          -- += -=
     TimesAssign_J, DivideAssign_J, RemainderAssign_J,     -- *= /= %=
     AndAssign_J, OrAssign_J, XorAssign_J,                 -- &= |= ^=
     LeftShiftAssign_J, RightShiftAssign_J,                -- <<= >>=
     UnsignedRightShiftAssign_J,                           -- >>>=
     -- Ada (ARM 2.4 .. 2.6) and Java (JRM 3.10) literals (all Java reals
     -- may use lazy forms, i.e. whole or decimal part may be missing)
     Null_J, False_J, True_J,
     Integer_AJ,               -- 1, Ada 1E+10
     Based_Integer_AJ,         -- 13#C#E+10, Java 07 (octal) 0xF (hexadecimal)
     LongInteger_J,            -- 1L
     BasedLongInteger_J,       -- 07L, 0xFL
     Real_AJ,                  -- 1.0E+10, Java 1E+10
     Based_Real_A,             -- Ada 13#C.B#E+5
     FloatNumber_J,            -- 1.0E+10F
     DoubleNumber_J,           -- 1.0E+10D
     Character_AJ, String_AJ,
     -- Ada and Java other tokens
     Identifier_AJ,
     Comment_A,             -- -- to end of line
     EndOfLineComment_J,    -- // to end of line
     TraditionalComment_J,  -- /* on one line */
     CommentHead_J,         -- /* to end of line
     CommentBody_J,         --    anything in between
     CommentTail_J,         --    to */
     Documentation_J,       -- /** on one line */
     DocumentationHead_J,   -- /** to end of line
     DocumentationBody_J,   --     anything in between
     DocumentationTail_J,   --     to */
     -- Ada and Java bad token (syntax error)
     Bad_Token_AJ);

  subtype Reserved_Word is Token_Name range Abort_A         .. Xor_A;
  subtype Delimiter     is Token_Name range Colon_AJ        .. UnsignedRightShiftAssign_J;
  subtype Literal       is Token_Name range Null_J          .. String_AJ;
  subtype Number        is Literal    range Integer_AJ      .. DoubleNumber_J;
  subtype Whole_Number  is Number     range Integer_AJ      .. BasedLongInteger_J;
  subtype Real_Number   is Number     range Real_AJ         .. DoubleNumber_J;
  subtype Comment       is Token_Name range Comment_A       .. DocumentationTail_J;
  subtype Documentation is Comment    range Documentation_J .. DocumentationTail_J;

  type Token is private;

  function Name  (Item: Token) return Token_Name;
  function Image (Item: Token) return String;
  pragma Inline (Name);

  -- Operators cannot be made a subtype of Token_Name. Thus a function
  -- is specified.
  -- Ada:  ARM 4.5(2)    and or xor
  --              (3)    = /= < <= > >=
  --              (4,5)  + - &
  --              (6)    * / mod rem
  --              (7)    ** abs not
  --       (not in operator symbol form)
  -- Java: JRM 3.12      (see above)

  function Is_Operator (Item: Token) return Boolean;

  -- Only for based numbers.

  subtype Number_Base is Ada.Text_IO.Number_Base;

  function Base (Item: Token) return Number_Base;
  pragma Inline (Base);

  -- Only for Java documentation comments.

  type Documentation_Tag is
    (No_Tag, See_Tag, Author_Tag, Version_Tag, Param_Tag, Return_Tag, Exception_Tag);
  subtype Proper_Tag is Documentation_Tag range See_Tag .. Documentation_Tag'Last;

  function Tag     (Item: Token) return Documentation_Tag;
  function Tag_Pos (Item: Token) return Natural;  -- position within image
  pragma Inline (Tag, Tag_Pos);

  function Image (Tag: Proper_Tag) return String;

  -- Only for bad tokens (if there are errors, it depends on the context
  -- when the bad token's end is assumed and which error is reported).
  -- Illegal_Literal is used for any error in identifiers or numbers that
  -- is not covered by more explicit reports.

  type Token_Error is (Non_Language_Character, -- e.g. { for Ada, ` for Java
                       Illegal_Literal,        -- e.g. non-graphic character
                       Illegal_Underline,      -- for identifiers and numbers
                       Illegal_Base,           -- for numbers
                       Missing_Base_Quote,
                       Illegal_Extended_Digit,
                       Missing_String_Quote);  -- string terminated by EOL

  function Error (Item: Token) return Token_Error;
  pragma Inline (Error);

  -- Token position within string and file

  function First (Item: Token) return Positive;
  function Last  (Item: Token) return Positive;
  function Line  (Item: Token) return Positive;
  function Col   (Item: Token) return Positive;
  pragma  Inline (First, Last, Line, Col);

  -- Input string ---------------------------------------------------------

  type String_Pointer is access all String;

  procedure Free is new Ada.Unchecked_Deallocation (String, String_Pointer);

  procedure Initialize (Input: in String_Pointer; Size_of_Tab: in Positive);

  Unidentifyable_Language: exception;

  -- Current position within Input

  function Line return Positive;
  function Col  return Positive;
  pragma Inline (Line, Col);

  function End_of_Input return Boolean;
  pragma Inline (End_of_Input);

  -- Token parser ---------------------------------------------------------

  function Get_Token return Token;

  procedure Reset (to_Token: in Token);

  End_Error, Reset_Error: exception;

private

  type Token_Core (Name: Token_Name := Comment_A) is record
    -- location of token in input string and file
    Line , Col,
    First, Last: Positive;
    case Name is
      when Number'First .. Bad_Token_AJ =>
        Image: String_Pointer;  -- illegal token if null
        case Name is
          when Based_Real_A       |
               Based_Integer_AJ   |
               BasedLongInteger_J => Base : Number_Base;
          when Documentation      => Tag  : Documentation_Tag;
                                     Pos  : Natural;
          when Bad_Token_AJ       => Error: Token_Error;
          when others             => null;
        end case;
      when others =>
        null;
    end case;
  end record;

  type Token is new Ada.Finalization.Controlled with record
    Core: Token_Core;
  end record;

  procedure Adjust   (Object: in out Token);
  procedure Finalize (Object: in out Token);

end Lexer;