-- (stray artifact removed from source line 1: "never executed always true always false" — coverage-report residue, not code)
    1 {-# LANGUAGE FlexibleInstances #-}
    2 {-# LANGUAGE InstanceSigs #-}
    3 {-# LANGUAGE OverloadedStrings #-}
    4 {-# LANGUAGE RecordWildCards #-}
    5 {-# LANGUAGE TypeFamilies #-}
    6 {-# LANGUAGE DeriveDataTypeable #-}
    7 {-# LANGUAGE DeriveGeneric #-}
    8 {-# LANGUAGE TypeFamilies #-}
    9 {-# OPTIONS_GHC -Wno-unrecognised-pragmas #-}
   10 {-# HLINT ignore "Use camelCase" #-}
   11 
   12 module Conjure.Language.Lexer
   13     ( Lexeme(..)
   14     , LexemePos(..)
   15     , textToLexeme,
   16     ETok(..),
   17     Offsets(..),
   18     Reformable(..),
   19     prettySplitComments,
   20     eLex,
   21     reformList,
   22     tokenSourcePos,
   23     sourcePosAfter,
   24     totalLength,
   25     trueLength,
   26     trueStart,
   27     tokenOffset,
   28     tokenStartOffset,
   29     sourcePos0,
   30     nullBefore,
   31     LexerError(..),
   32     runLexer,
   33     ETokenStream(..)
   34     , lexemeText
   35     ) where
   36 
   37 import Conjure.Language.Lexemes
   38 import Conjure.Prelude hiding (many, some,Text)
   39 import Data.Char (isAlpha, isAlphaNum)
   40 import Data.Void
   41 
   42 import qualified Data.Text as T
   43 import qualified Data.Text.Lazy as L
   44 import Text.Megaparsec hiding (State)
   45 import Text.Megaparsec.Char
   46 
   47 import Data.List (splitAt)
   48 import qualified Data.List.NonEmpty as NE
   49 import qualified Text.Megaparsec as L
   50 import Prelude (read)
   51 import qualified Prettyprinter as Pr
   52 import Conjure.Prelude hiding (some,many)
   53 
   54 
   55 import Text.Megaparsec.Stream ()
   56 
   57 
-- | A lexeme paired with its source span: where it starts and where the
-- following token may begin (i.e. including trailing whitespace).
data LexemePos = LexemePos
                    Lexeme          -- the lexeme itself
                    SourcePos       -- source position, the beginning of this lexeme
                    SourcePos       -- source position, just after this lexeme, including whitespace after the lexeme
    deriving (Show,Eq, Ord)
   63 
   64 sourcePos0 :: SourcePos
   65 sourcePos0 = SourcePos "" (mkPos  1) (mkPos 1)
   66 
-- | Things that can be rendered back to the exact source text they were
-- lexed from (trivia included), so lexing can be inverted losslessly.
class Reformable a where
    reform :: a -> L.Text
   69 
   70 instance Reformable ETok where
   71     reform e | totalLength e == 0 = ""
   72     reform (ETok{capture=cap,trivia=triv}) = L.append  (L.concat $ map showTrivia triv) (L.fromStrict cap)
   73         where
   74             showTrivia :: Trivia -> L.Text
   75             showTrivia x = case x of
   76               WhiteSpace txt    -> L.fromStrict txt
   77               LineComment txt   -> L.fromStrict txt
   78               BlockComment txt  -> L.fromStrict txt
   79 
   80 reformList :: (Traversable t ,Reformable a) => t a -> L.Text
   81 reformList =  L.concat  . map reform . toList
   82 
   83 emojis :: [Char]
   84 emojis =
   85     concat
   86         [ ['\x1f600' .. '\x1F64F']
   87         , ['\x1f300' .. '\x1f5ff']
   88         , ['\x1f680' .. '\x1f999']
   89         , ['\x1f1e0' .. '\x1f1ff']
   90         ]
   91 
   92 isIdentifierFirstLetter :: Char -> Bool
   93 isIdentifierFirstLetter ch = isAlpha ch || ch `elem` ("_" :: String) || ch `elem` emojis
   94 
   95 isIdentifierLetter :: Char -> Bool
   96 isIdentifierLetter ch = isAlphaNum ch || ch `elem` ("_'" :: String) || ch `elem` emojis
   97 
-- | Position and length bookkeeping attached to every token.
data Offsets = Offsets {
        oStart::Int, -- offset at which the token begins, including leading trivia
        oTotalLength::Int, -- total length of the token, trivia included
        oTokenLength::Int, -- length of the lexeme alone, excluding trivia
        oTrueStart :: SourcePos, -- source position where the token (with trivia) begins
        oSourcePos::SourcePos, -- source position of the lexeme itself
        oEndPos::SourcePos} -- source position just after the token
    deriving (Show, Eq, Ord , Data)
-- | Plain megaparsec parser over strict 'Text' input.
type Lexer = Parsec Void Text
  107 
  108 -- type Lexer = Parsec Void Text ETokenStream
  109 
-- | Non-semantic source content (whitespace and comments), kept verbatim
-- so the original input can be reproduced by 'reform'.
data Trivia = WhiteSpace Text | LineComment Text | BlockComment Text
    deriving (Show, Eq, Ord , Data)
  112 
-- | A single lexed token: its offset bookkeeping, the trivia preceding it,
-- the classified lexeme, and the exact text captured for the lexeme.
data ETok = ETok
    { offsets :: Offsets
    , trivia :: [Trivia]
    , lexeme :: Lexeme
    , capture :: Text
    }
    deriving (Eq, Ord,Show , Data)
  120 
  121 instance Pr.Pretty ETok where
  122     pretty = Pr.unAnnotate . uncurry (Pr.<>) .  prettySplitComments
  123 
  124 prettySplitComments :: ETok -> (Pr.Doc ann, Pr.Doc ann)
  125 prettySplitComments (ETok _ tr _ capture) = (Pr.hcat [Pr.pretty t Pr.<> Pr.hardline | LineComment t <- tr],Pr.pretty capture)
  126 
  127 
  128 totalLength :: ETok -> Int
  129 totalLength = oTotalLength . offsets
  130 
  131 trueLength :: ETok -> Int
  132 trueLength = oTokenLength . offsets
  133 
  134 -- tokenStart :: ETok -> Int
  135 -- tokenStart (ETok{offsets = (Offsets _  s _ _ _)}) = s
  136 tokenOffset :: ETok -> Int
  137 tokenOffset = oStart . offsets
  138 tokenStartOffset :: ETok -> Int
  139 tokenStartOffset t = oStart o + (oTotalLength o - oTokenLength o)
  140     where o = offsets t
  141 
  142 trueStart :: ETok -> SourcePos
  143 trueStart = oTrueStart . offsets
  144 
  145 tokenSourcePos :: ETok -> SourcePos
  146 tokenSourcePos = oSourcePos . offsets
  147 
  148 sourcePosAfter :: ETok -> SourcePos
  149 sourcePosAfter = oEndPos . offsets
  150 
  151 makeToken :: Offsets -> [Trivia] -> Lexeme -> Text -> ETok
  152 makeToken = ETok
  153 
  154 --make an empty token that precedes the given token with the given lexeme
  155 nullBefore :: Lexeme -> ETok -> ETok
  156 nullBefore lex tok = ETok offs [] lex ""
  157     where
  158         sp = tokenSourcePos tok
  159         offs =  Offsets (tokenStartOffset tok) 0 0 sp sp sp
-- | A pretty-printed megaparsec error bundle, as produced by 'runLexer'.
newtype LexerError = LexerError String
    deriving (Show)
  162 
  163 runLexer :: Text -> Maybe FilePath -> Either LexerError ETokenStream
  164 runLexer txt fp = case runParser eLex (fromMaybe "Lexer" fp) txt of
  165   Left peb -> Left $ LexerError $ errorBundlePretty peb
  166   Right ets -> Right $ ETokenStream txt ets
  167 
  168 
  169 eLex :: Lexer [ETok]
  170 eLex =
  171     do
  172         main <- many $ try aToken
  173         end <- pEOF
  174         return $ main ++ [end]
  175 
-- | Lex one token: leading trivia followed by a lexeme, recording offsets
-- and source positions both before the trivia and before the lexeme.
aToken :: Lexer ETok
aToken = do
    -- offset/position before any leading trivia
    start <- getOffset
    startPos <- getSourcePos
    whitespace <- pTrivia
    -- offset/position of the lexeme proper, after the trivia
    tokenOffset_ <- getOffset
    tokenStart <- getSourcePos
    (tok,cap) <- aLexeme
    tokenEnd <- getOffset
    endPos <- getSourcePos
    -- total length spans the trivia; token length is the lexeme alone
    return $ makeToken (Offsets start (tokenEnd - start) (tokenEnd - tokenOffset_) startPos tokenStart endPos) whitespace tok cap
  187 
-- | Lex the end-of-input marker: any trailing trivia followed by eof,
-- producing an 'L_EOF' token with an empty capture.
pEOF :: Lexer ETok
pEOF = do
    -- offset/position before the trailing trivia
    start <- getOffset
    startPos <- getSourcePos
    whitespace <- pTrivia
    -- offset/position at the point where eof must hold
    wse <- getOffset
    tokenStart <- getSourcePos
    eof
    tokenEnd <- getOffset
    endPos <- getSourcePos
    -- tokenEnd - wse is 0 here (eof consumes nothing); kept for symmetry with aToken
    return $ makeToken (Offsets start (tokenEnd - start) (tokenEnd - wse) startPos tokenStart endPos) whitespace L_EOF ""
  199 
  200 
-- | Parse one lexeme, falling back to capturing unlexable input as
-- 'LUnexpected' so the lexer itself never fails on bad characters.
aLexeme :: Lexer (Lexeme,Text)
aLexeme = aLexemeStrict <|> pFallback
  203 
  204 aLexemeStrict :: Lexer (Lexeme,Text)
  205 aLexemeStrict =
  206     try
  207         pNumber
  208         <|> try  (choice (map pLexeme lexemes) <?> "Lexeme")
  209         <|> try pIdentifier
  210         <|> try pQuotedIdentifier
  211         <|> try pMetaVar
  212 
  213 
  214 pNumber :: Lexer (Lexeme,Text)
  215 pNumber = do
  216     v <- takeWhile1P Nothing (`elem` ['1','2','3','4','5','6','7','8','9','0'])
  217     let n = read $ T.unpack v
  218     return (LIntLiteral n,v)
  219     <?> "Numeric Literal"
  220 
  221 pMetaVar :: Lexer (Lexeme,Text)
  222 pMetaVar = do
  223     amp <- chunk "&"
  224     (_,cap) <- pIdentifier
  225     return (LMetaVar cap,amp `T.append` cap)
  226 
  227 pIdentifier :: Lexer (Lexeme,Text)
  228 pIdentifier = do
  229     firstLetter <- takeWhile1P Nothing isIdentifierFirstLetter
  230     rest <- takeWhileP Nothing isIdentifierLetter
  231     let ident = T.append firstLetter rest
  232     -- traceM $ T.unpack . T.pack $ map chr $ map ord $ T.unpack ident
  233     return ( LIdentifier ident, ident)
  234     <?> "Identifier"
  235 
  236 pQuotedIdentifier :: Lexer (Lexeme,Text)
  237 pQuotedIdentifier = do
  238     l <- quoted
  239     return (LIdentifier l,l)
  240 
  241 pFallback :: Lexer (Lexeme,Text)
  242 pFallback = do
  243     q <- T.pack <$> someTill anySingle (lookAhead $ try somethingValid)
  244     return (LUnexpected  q,q)
  245   where
  246     somethingValid :: Lexer ()
  247     somethingValid = void pTrivia <|> void aLexemeStrict <|> eof
  248 
  249 pLexeme :: (Text, Lexeme) -> Lexer (Lexeme,Text)
  250 pLexeme (s, l) = do
  251     tok <- string s
  252     notFollowedBy $ if isIdentifierLetter $ T.last tok then nonIden else empty
  253     return (l,tok)
  254     <?> "Lexeme :" ++ show l
  255     where
  256         nonIden = takeWhile1P Nothing isIdentifierLetter
  257 
  258 pTrivia :: Lexer [Trivia]
  259 pTrivia = many (whiteSpace <|> lineComment <|> blockComment)
  260 
  261 whiteSpace :: Lexer Trivia
  262 whiteSpace = do
  263     s <- some spaceChar
  264     return $ WhiteSpace $ T.pack s
  265 
  266 quoted :: Lexer Text
  267 quoted = do
  268     open <- char '\"'
  269     (body,end) <- manyTill_ anySingle $ char '\"'
  270     return $ T.pack  $ open:body++[end]
  271 
  272 lineEnd :: Lexer [Char]
  273 lineEnd = T.unpack <$> eol <|> ( eof >> return [])
  274 
  275 lineComment :: Lexer Trivia
  276 lineComment = do
  277     _ <- try (chunk "$")
  278     (text,end) <- manyTill_ anySingle lineEnd
  279     return $ LineComment $ T.pack ('$' : text++end)
  280 
  281 blockComment :: Lexer Trivia
  282 blockComment = do
  283     _ <- try (chunk "/*")
  284     text <- manyTill L.anySingle (lookAhead (void(chunk "*/") <|>eof))
  285     cl <- optional $ chunk "*/"
  286     let cl' = fromMaybe "" cl
  287     return $ BlockComment $ T.concat ["/*",T.pack text ,cl' ]
  288 
  289 
-- | The custom megaparsec input stream: the source text the tokens were
-- lexed from, plus the remaining tokens.
data ETokenStream = ETokenStream
    { streamSourceText :: Text
    , streamTokens :: [ETok]
    }
-- Megaparsec 'Stream' instance: chunks are plain token lists, so most
-- methods are trivial; consumption rebuilds the stream via 'buildStream'.
instance Stream ETokenStream where
    type Token ETokenStream = ETok
    type Tokens ETokenStream = [ETok]
    tokenToChunk _ x = [x]
    tokensToChunk _ xs = xs
    chunkToTokens _ = id
    chunkLength _ = length
    chunkEmpty _ [] = True
    chunkEmpty _ _ = False
    -- Pop a single token, if any remain.
    take1_ :: ETokenStream -> Maybe (Token ETokenStream, ETokenStream)
    take1_ (ETokenStream _ (x : xs)) = Just (x, buildStream xs)
    take1_ (ETokenStream _ []) = Nothing
    -- Take up to n tokens; Nothing only when n > 0 and the stream is empty.
    takeN_ :: Int -> ETokenStream -> Maybe (Tokens ETokenStream, ETokenStream)
    takeN_ n xs | n <= 0 = Just ([], xs)
    takeN_ _ (ETokenStream _ []) = Nothing
    takeN_ n (ETokenStream _ xs) = Just (take n xs, buildStream $ drop n xs)
    -- Take the longest prefix of tokens satisfying the predicate.
    takeWhile_ :: (Token ETokenStream -> Bool) -> ETokenStream -> (Tokens ETokenStream, ETokenStream)
    takeWhile_ p (ETokenStream _ xs) =
        (a, buildStream b)
      where
        (a, b) = span p xs
  315 
  316 -- (takeWhile p xs,ETokenStream $ dropWhile p xs)
  317 
  318 buildStream :: [ETok] -> ETokenStream
  319 buildStream xs = case NE.nonEmpty xs of
  320     Nothing -> ETokenStream "" xs
  321     Just _ -> ETokenStream (T.pack "showTokens pxy s") xs
  322 
instance VisualStream ETokenStream where
    -- Render tokens for error messages exactly as they appeared in source.
    showTokens _ =  L.unpack . reformList
    -- NOTE(review): this counts only the lexemes proper ('trueLength'
    -- excludes leading trivia) while showTokens renders trivia too —
    -- confirm the asymmetry is intended.
    tokensLength _ = sum . fmap trueLength
  326 
  327 -- https://markkarpov.com/tutorial/megaparsec.html#working-with-custom-input-streams
-- Translate an absolute offset into a source position plus the source line
-- shown in error messages (offsets here count tokens, not characters).
instance TraversableStream ETokenStream where
    reachOffset o PosState{..} =
        ( Just (prefix ++ restOfLine)
        , PosState
            { pstateInput = buildStream post
            , pstateOffset = max pstateOffset o
            , pstateSourcePos = newSourcePos
            , pstateTabWidth = pstateTabWidth
            , pstateLinePrefix = prefix
            }
        )
      where
        -- extend the previous line prefix only when still on the same line
        prefix =
            if sameLine
                then pstateLinePrefix ++ preLine
                else preLine
        sameLine = sourceLine newSourcePos == sourceLine pstateSourcePos
        -- position of the first unconsumed token, or the old position at EOF
        newSourcePos =
            case post of
                [] -> pstateSourcePos
                (x : _) -> tokenSourcePos x
        -- split the token list at the requested (relative) offset
        (pre, post) :: ([ETok], [ETok]) = splitAt (o - pstateOffset) (streamTokens pstateInput)
        (preStr, postStr) = (maybe "" (showTokens pxy) (NE.nonEmpty pre), maybe "" (showTokens pxy) (NE.nonEmpty post))
        -- text after the last newline before the offset / before the next one
        preLine = reverse . takeWhile (/= '\n') . reverse $ preStr
        restOfLine = takeWhile (/= '\n') postStr
  353 
-- | Proxy pinning the 'ETokenStream' instance when calling 'showTokens'.
pxy :: Proxy ETokenStream
pxy = Proxy
  356 
  357 
  358 
  359 -- instance Show ETok where
  360 --     show (ETok _ _ _ q) = show q
  361 
  362 
  363 
  364 -- instance TraversableStream ETokenStream where
  365 --     reachOffset i s = (Nothing, s)
  366 --     reachOffsetNoLine i s = s