-- Lexer for the Conjure language toolchain, implemented with Megaparsec.
    1 {-# LANGUAGE FlexibleInstances #-}
    2 {-# LANGUAGE InstanceSigs #-}
    3 {-# LANGUAGE OverloadedStrings #-}
    4 {-# LANGUAGE RecordWildCards #-}
    5 {-# LANGUAGE TypeFamilies #-}
    6 {-# LANGUAGE DeriveDataTypeable #-}
    7 {-# LANGUAGE DeriveGeneric #-}
    8 {-# LANGUAGE TypeFamilies #-}
    9 {-# OPTIONS_GHC -Wno-unrecognised-pragmas #-}
   10 
   11 module Conjure.Language.Lexer
   12     ( Lexeme(..)
   13     , LexemePos(..)
   14     , textToLexeme,
   15     ETok(..),
   16     Offsets(..),
   17     Reformable(..),
   18     prettySplitComments,
   19     eLex,
   20     reformList,
   21     tokenSourcePos,
   22     sourcePosAfter,
   23     totalLength,
   24     trueLength,
   25     trueStart,
   26     tokenOffset,
   27     tokenStartOffset,
   28     sourcePos0,
   29     nullBefore,
   30     LexerError(..),
   31     runLexer,
   32     ETokenStream(..)
   33     , lexemeText
   34     ) where
   35 
import Conjure.Language.Lexemes
import Conjure.Prelude hiding (many, some, Text)
import Conjure.Prelude hiding (some, many)

import Data.Char (isAlpha, isAlphaNum, isDigit)
import Data.List (splitAt)
import qualified Data.List.NonEmpty as NE
import Data.Void
import Prelude (read)

import qualified Data.Text as T
import qualified Data.Text.Lazy as L
import qualified Prettyprinter as Pr
import Text.Megaparsec hiding (State)
import qualified Text.Megaparsec as L
import Text.Megaparsec.Char
import Text.Megaparsec.Stream ()
   55 
   56 
-- | A lexeme together with its source span: where it begins, and the
-- position just after it (including any whitespace consumed after it).
data LexemePos = LexemePos
                    Lexeme          -- the lexeme
                    SourcePos       -- source position, the beginning of this lexeme
                    SourcePos       -- source position, just after this lexeme, including whitespace after the lexeme
    deriving (Show,Eq, Ord)
   62 
   63 sourcePos0 :: SourcePos
   64 sourcePos0 = SourcePos "" (mkPos  1) (mkPos 1)
   65 
-- | Types that can be rendered back to the exact source text they were
-- lexed from (round-tripping, trivia included).
class Reformable a where
    reform :: a -> L.Text
   68 
   69 instance Reformable ETok where
   70     reform e | totalLength e == 0 = ""
   71     reform (ETok{capture=cap,trivia=triv}) = L.append  (L.concat $ map showTrivia triv) (L.fromStrict cap)
   72         where
   73             showTrivia :: Trivia -> L.Text
   74             showTrivia x = case x of
   75               WhiteSpace txt    -> L.fromStrict txt
   76               LineComment txt   -> L.fromStrict txt
   77               BlockComment txt  -> L.fromStrict txt
   78 
   79 reformList :: (Traversable t ,Reformable a) => t a -> L.Text
   80 reformList =  L.concat  . map reform . toList
   81 
   82 emojis :: [Char]
   83 emojis =
   84     concat
   85         [ ['\x1f600' .. '\x1F64F']
   86         , ['\x1f300' .. '\x1f5ff']
   87         , ['\x1f680' .. '\x1f999']
   88         , ['\x1f1e0' .. '\x1f1ff']
   89         ]
   90 
   91 isIdentifierFirstLetter :: Char -> Bool
   92 isIdentifierFirstLetter ch = isAlpha ch || ch `elem` ("_" :: String) || ch `elem` emojis
   93 
   94 isIdentifierLetter :: Char -> Bool
   95 isIdentifierLetter ch = isAlphaNum ch || ch `elem` ("_'" :: String) || ch `elem` emojis
   96 
-- | Position/size bookkeeping for a token.  A token's span begins before
-- its leading trivia (whitespace/comments); the lexeme itself begins
-- after the trivia (see how 'aToken' fills these in).
data Offsets = Offsets {
        oStart::Int, -- the starting offset of the token (including whitespace)
        oTotalLength::Int, -- the total length of the token, trivia included
        oTokenLength::Int, -- the length of the token excluding trivia
        oTrueStart :: SourcePos, -- position where the token's span (with trivia) begins
        oSourcePos::SourcePos, -- position where the lexeme itself begins
        oEndPos::SourcePos} -- position just after the lexeme
    deriving (Show, Eq, Ord , Data)
-- | The character-level lexer: a Megaparsec parser over strict 'Text'.
type Lexer = Parsec Void Text
  106 
  107 -- type Lexer = Parsec Void Text ETokenStream
  108 
-- | Non-semantic source content lexed alongside a token: whitespace,
-- @$@ line comments, and @/* ... */@ block comments.
data Trivia = WhiteSpace Text | LineComment Text | BlockComment Text
    deriving (Show, Eq, Ord , Data)
  111 
-- | A lexed token: its offsets, the trivia lexed just before it, the
-- lexeme it denotes, and the exact source text captured for the lexeme.
data ETok = ETok
    { offsets :: Offsets
    , trivia :: [Trivia]
    , lexeme :: Lexeme
    , capture :: Text
    }
    deriving (Eq, Ord,Show , Data)
  119 
  120 instance Pr.Pretty ETok where
  121     pretty = Pr.unAnnotate . uncurry (Pr.<>) .  prettySplitComments
  122 
  123 prettySplitComments :: ETok -> (Pr.Doc ann, Pr.Doc ann)
  124 prettySplitComments (ETok _ tr _ capture) = (Pr.hcat [Pr.pretty t Pr.<> Pr.hardline | LineComment t <- tr],Pr.pretty capture)
  125 
  126 
  127 totalLength :: ETok -> Int
  128 totalLength = oTotalLength . offsets
  129 
  130 trueLength :: ETok -> Int
  131 trueLength = oTokenLength . offsets
  132 
  133 -- tokenStart :: ETok -> Int
  134 -- tokenStart (ETok{offsets = (Offsets _  s _ _ _)}) = s
  135 tokenOffset :: ETok -> Int
  136 tokenOffset = oStart . offsets
  137 tokenStartOffset :: ETok -> Int
  138 tokenStartOffset t = oStart o + (oTotalLength o - oTokenLength o)
  139     where o = offsets t
  140 
  141 trueStart :: ETok -> SourcePos
  142 trueStart = oTrueStart . offsets
  143 
  144 tokenSourcePos :: ETok -> SourcePos
  145 tokenSourcePos = oSourcePos . offsets
  146 
  147 sourcePosAfter :: ETok -> SourcePos
  148 sourcePosAfter = oEndPos . offsets
  149 
  150 makeToken :: Offsets -> [Trivia] -> Lexeme -> Text -> ETok
  151 makeToken = ETok
  152 
  153 --make an empty token that precedes the given token with the given lexeme
  154 nullBefore :: Lexeme -> ETok -> ETok
  155 nullBefore lex tok = ETok offs [] lex ""
  156     where
  157         sp = tokenSourcePos tok
  158         offs =  Offsets (tokenStartOffset tok) 0 0 sp sp sp
-- | A lexing failure, carrying Megaparsec's pretty-printed error bundle.
newtype LexerError = LexerError String
    deriving (Show)
  161 
  162 runLexer :: Text -> Maybe FilePath -> Either LexerError ETokenStream
  163 runLexer txt fp = case runParser eLex (fromMaybe "Lexer" fp) txt of
  164   Left peb -> Left $ LexerError $ errorBundlePretty peb
  165   Right ets -> Right $ ETokenStream txt ets
  166 
  167 
  168 eLex :: Lexer [ETok]
  169 eLex =
  170     do
  171         main <- many $ try aToken
  172         end <- pEOF
  173         return $ main ++ [end]
  174 
  175 aToken :: Lexer ETok
  176 aToken = do
  177     start <- getOffset
  178     startPos <- getSourcePos
  179     whitespace <- pTrivia
  180     tokenOffset_ <- getOffset
  181     tokenStart <- getSourcePos
  182     (tok,cap) <- aLexeme
  183     tokenEnd <- getOffset
  184     endPos <- getSourcePos
  185     return $ makeToken (Offsets start (tokenEnd - start) (tokenEnd - tokenOffset_) startPos tokenStart endPos) whitespace tok cap
  186 
  187 pEOF :: Lexer ETok
  188 pEOF = do
  189     start <- getOffset
  190     startPos <- getSourcePos
  191     whitespace <- pTrivia
  192     wse <- getOffset
  193     tokenStart <- getSourcePos
  194     eof
  195     tokenEnd <- getOffset
  196     endPos <- getSourcePos
  197     return $ makeToken (Offsets start (tokenEnd - start) (tokenEnd - wse) startPos tokenStart endPos) whitespace L_EOF ""
  198 
  199 
  200 aLexeme :: Lexer (Lexeme,Text)
  201 aLexeme = aLexemeStrict <|> pFallback
  202 
  203 aLexemeStrict :: Lexer (Lexeme,Text)
  204 aLexemeStrict =
  205     try
  206         pNumber
  207         <|> try  (choice (map pLexeme lexemes) <?> "Lexeme")
  208         <|> try pIdentifier
  209         <|> try pQuotedIdentifier
  210         <|> try pMetaVar
  211 
  212 
  213 pNumber :: Lexer (Lexeme,Text)
  214 pNumber = do
  215     v <- takeWhile1P Nothing (`elem` ['1','2','3','4','5','6','7','8','9','0'])
  216     let n = read $ T.unpack v
  217     return (LIntLiteral n,v)
  218     <?> "Numeric Literal"
  219 
  220 pMetaVar :: Lexer (Lexeme,Text)
  221 pMetaVar = do
  222     amp <- chunk "&"
  223     (_,cap) <- pIdentifier
  224     return (LMetaVar cap,amp `T.append` cap)
  225 
  226 pIdentifier :: Lexer (Lexeme,Text)
  227 pIdentifier = do
  228     firstLetter <- takeWhile1P Nothing isIdentifierFirstLetter
  229     rest <- takeWhileP Nothing isIdentifierLetter
  230     let ident = T.append firstLetter rest
  231     -- traceM $ T.unpack . T.pack $ map chr $ map ord $ T.unpack ident
  232     return ( LIdentifier ident, ident)
  233     <?> "Identifier"
  234 
  235 pQuotedIdentifier :: Lexer (Lexeme,Text)
  236 pQuotedIdentifier = do
  237     l <- quoted
  238     return (LIdentifier l,l)
  239 
  240 pFallback :: Lexer (Lexeme,Text)
  241 pFallback = do
  242     q <- T.pack <$> someTill anySingle (lookAhead $ try somethingValid)
  243     return (LUnexpected  q,q)
  244   where
  245     somethingValid :: Lexer ()
  246     somethingValid = void pTrivia <|> void aLexemeStrict <|> eof
  247 
  248 pLexeme :: (Text, Lexeme) -> Lexer (Lexeme,Text)
  249 pLexeme (s, l) = do
  250     tok <- string s
  251     notFollowedBy $ if isIdentifierLetter $ T.last tok then nonIden else empty
  252     return (l,tok)
  253     <?> "Lexeme :" ++ show l
  254     where
  255         nonIden = takeWhile1P Nothing isIdentifierLetter
  256 
  257 pTrivia :: Lexer [Trivia]
  258 pTrivia = many (whiteSpace <|> lineComment <|> blockComment)
  259 
  260 whiteSpace :: Lexer Trivia
  261 whiteSpace = do
  262     s <- some spaceChar
  263     return $ WhiteSpace $ T.pack s
  264 
  265 quoted :: Lexer Text
  266 quoted = do
  267     open <- char '\"'
  268     (body,end) <- manyTill_ anySingle $ char '\"'
  269     return $ T.pack  $ open:body++[end]
  270 
  271 lineEnd :: Lexer [Char]
  272 lineEnd = T.unpack <$> eol <|> ( eof >> return [])
  273 
  274 lineComment :: Lexer Trivia
  275 lineComment = do
  276     _ <- try (chunk "$")
  277     (text,end) <- manyTill_ anySingle lineEnd
  278     return $ LineComment $ T.pack ('$' : text++end)
  279 
  280 blockComment :: Lexer Trivia
  281 blockComment = do
  282     _ <- try (chunk "/*")
  283     text <- manyTill L.anySingle (lookAhead (void(chunk "*/") <|>eof))
  284     cl <- optional $ chunk "*/"
  285     let cl' = fromMaybe "" cl
  286     return $ BlockComment $ T.concat ["/*",T.pack text ,cl' ]
  287 
  288 
-- | A stream of lexed tokens for a token-level Megaparsec parser.
data ETokenStream = ETokenStream
    { streamSourceText :: Text -- the source text the tokens came from
    , streamTokens :: [ETok] -- the tokens themselves
    }
  293 instance Stream ETokenStream where
  294     type Token ETokenStream = ETok
  295     type Tokens ETokenStream = [ETok]
  296     tokenToChunk _ x = [x]
  297     tokensToChunk _ xs = xs
  298     chunkToTokens _ = id
  299     chunkLength _ = length
  300     chunkEmpty _ [] = True
  301     chunkEmpty _ _ = False
  302     take1_ :: ETokenStream -> Maybe (Token ETokenStream, ETokenStream)
  303     take1_ (ETokenStream _ (x : xs)) = Just (x, buildStream xs)
  304     take1_ (ETokenStream _ []) = Nothing
  305     takeN_ :: Int -> ETokenStream -> Maybe (Tokens ETokenStream, ETokenStream)
  306     takeN_ n xs | n <= 0 = Just ([], xs)
  307     takeN_ _ (ETokenStream _ []) = Nothing
  308     takeN_ n (ETokenStream _ xs) = Just (take n xs, buildStream $ drop n xs)
  309     takeWhile_ :: (Token ETokenStream -> Bool) -> ETokenStream -> (Tokens ETokenStream, ETokenStream)
  310     takeWhile_ p (ETokenStream _ xs) =
  311         (a, buildStream b)
  312       where
  313         (a, b) = span p xs
  314 
  315 -- (takeWhile p xs,ETokenStream $ dropWhile p xs)
  316 
  317 buildStream :: [ETok] -> ETokenStream
  318 buildStream xs = case NE.nonEmpty xs of
  319     Nothing -> ETokenStream "" xs
  320     Just _ -> ETokenStream (T.pack "showTokens pxy s") xs
  321 
  322 instance VisualStream ETokenStream where
  323     showTokens _ =  L.unpack . reformList
  324     tokensLength _ = sum . fmap trueLength
  325 
  326 -- https://markkarpov.com/tutorial/megaparsec.html#working-with-custom-input-streams
-- Lets Megaparsec translate a token offset back to a source position and
-- a line of source text for error reporting (pattern follows the
-- tutorial linked above).
instance TraversableStream ETokenStream where
    reachOffset o PosState{..} =
        ( Just (prefix ++ restOfLine)
        , PosState
            { pstateInput = buildStream post
            , pstateOffset = max pstateOffset o
            , pstateSourcePos = newSourcePos
            , pstateTabWidth = pstateTabWidth
            , pstateLinePrefix = prefix
            }
        )
      where
        -- Error-line text up to the offset; if we are still on the same
        -- line as the previous state, extend its stored prefix.
        prefix =
            if sameLine
                then pstateLinePrefix ++ preLine
                else preLine
        sameLine = sourceLine newSourcePos == sourceLine pstateSourcePos
        -- Position of the first unconsumed token; at end of stream, stay put.
        newSourcePos =
            case post of
                [] -> pstateSourcePos
                (x : _) -> tokenSourcePos x
        -- Split the remaining tokens at the requested offset.
        (pre, post) :: ([ETok], [ETok]) = splitAt (o - pstateOffset) (streamTokens pstateInput)
        -- Render both halves back to source text for line extraction.
        (preStr, postStr) = (maybe "" (showTokens pxy) (NE.nonEmpty pre), maybe "" (showTokens pxy) (NE.nonEmpty post))
        -- The part of the current line before the offset...
        preLine = reverse . takeWhile (/= '\n') . reverse $ preStr
        -- ...and the part after it.
        restOfLine = takeWhile (/= '\n') postStr
  351         restOfLine = takeWhile (/= '\n') postStr
  352 
-- | Proxy pinning the 'ETokenStream' instance when calling 'showTokens'.
pxy :: Proxy ETokenStream
pxy = Proxy
  355 
  356 
  357 
  358 -- instance Show ETok where
  359 --     show (ETok _ _ _ q) = show q
  360 
  361 
  362 
  363 -- instance TraversableStream ETokenStream where
  364 --     reachOffset i s = (Nothing, s)
  365 --     reachOffsetNoLine i s = s