never executed always true always false
1 {-# LANGUAGE FlexibleInstances #-}
2 {-# LANGUAGE InstanceSigs #-}
3 {-# LANGUAGE OverloadedStrings #-}
4 {-# LANGUAGE RecordWildCards #-}
5 {-# LANGUAGE TypeFamilies #-}
6 {-# LANGUAGE DeriveDataTypeable #-}
7 {-# LANGUAGE DeriveGeneric #-}
8 {-# LANGUAGE TypeFamilies #-}
9 {-# OPTIONS_GHC -Wno-unrecognised-pragmas #-}
10 {-# HLINT ignore "Use camelCase" #-}
11
12 module Conjure.Language.Lexer
13 ( Lexeme(..)
14 , LexemePos(..)
15 , textToLexeme,
16 ETok(..),
17 Offsets(..),
18 Reformable(..),
19 prettySplitComments,
20 eLex,
21 reformList,
22 tokenSourcePos,
23 sourcePosAfter,
24 totalLength,
25 trueLength,
26 trueStart,
27 tokenOffset,
28 tokenStartOffset,
29 sourcePos0,
30 nullBefore,
31 LexerError(..),
32 runLexer,
33 ETokenStream(..)
34 , lexemeText
35 ) where
36
37 import Conjure.Language.Lexemes
38 import Conjure.Prelude hiding (many, some,Text)
39 import Data.Char (isAlpha, isAlphaNum)
40 import Data.Void
41
42 import qualified Data.Text as T
43 import qualified Data.Text.Lazy as L
44 import Text.Megaparsec hiding (State)
45 import Text.Megaparsec.Char
46
47 import Data.List (splitAt)
48 import qualified Data.List.NonEmpty as NE
49 import qualified Text.Megaparsec as L
50 import Prelude (read)
51 import qualified Prettyprinter as Pr
52 import Conjure.Prelude hiding (some,many)
53
54
55 import Text.Megaparsec.Stream ()
56
57
-- | A lexeme paired with its source span: where it begins and the
-- position just after it (including any whitespace that follows it).
data LexemePos = LexemePos
    Lexeme -- the lexeme itself
    SourcePos -- source position, the beginning of this lexeme
    SourcePos -- source position, just after this lexeme, including whitespace after the lexeme
    deriving (Show,Eq, Ord)
63
-- | The canonical "start of input" position: empty file name, line 1, column 1.
sourcePos0 :: SourcePos
sourcePos0 = SourcePos "" firstPos firstPos
    where
        firstPos = mkPos 1
66
-- | Types that can be rendered back to the exact source text they were
-- lexed from (round-tripping trivia such as whitespace and comments).
class Reformable a where
    reform :: a -> L.Text
69
instance Reformable ETok where
    -- A zero-width token (e.g. an inserted marker) renders as nothing.
    reform t | totalLength t == 0 = ""
    reform t = L.append (renderAll (trivia t)) (L.fromStrict (capture t))
        where
            renderAll :: [Trivia] -> L.Text
            renderAll = L.concat . map renderTrivia
            -- Every trivia constructor carries its exact source text.
            renderTrivia :: Trivia -> L.Text
            renderTrivia tr = case tr of
                WhiteSpace txt -> L.fromStrict txt
                LineComment txt -> L.fromStrict txt
                BlockComment txt -> L.fromStrict txt
79
-- | Render a whole collection of reformable items back to lazy text.
reformList :: (Traversable t ,Reformable a) => t a -> L.Text
reformList items = L.concat [reform item | item <- toList items]
82
-- | Characters from the common emoji blocks, accepted inside identifiers.
emojis :: [Char]
emojis =
    concatMap (uncurry enumFromTo)
        [ ('\x1f600', '\x1F64F') -- emoticons
        , ('\x1f300', '\x1f5ff') -- misc symbols and pictographs
        , ('\x1f680', '\x1f999') -- transport / supplemental symbols
        , ('\x1f1e0', '\x1f1ff') -- regional indicators
        ]
91
-- | May a character begin an identifier? Letters, underscore, or emoji.
isIdentifierFirstLetter :: Char -> Bool
isIdentifierFirstLetter ch
    | isAlpha ch = True
    | ch == '_' = True
    | otherwise = ch `elem` emojis
94
-- | May a character continue an identifier? Alphanumerics, underscore,
-- prime (single quote), or emoji.
isIdentifierLetter :: Char -> Bool
isIdentifierLetter ch
    | isAlphaNum ch = True
    | ch == '_' || ch == '\'' = True
    | otherwise = ch `elem` emojis
97
-- | Offsets and source positions describing one token, with its leading
-- trivia (whitespace/comments) accounted for separately.
data Offsets = Offsets {
    oStart::Int, -- offset at which the token starts, including leading trivia
    oTotalLength::Int, -- total length of the token, trivia included
    oTokenLength::Int, -- length of the token text alone, trivia excluded
    oTrueStart :: SourcePos, -- source position where the leading trivia starts
    oSourcePos::SourcePos, -- source position where the lexeme itself starts
    oEndPos::SourcePos} -- source position just after the token
    deriving (Show, Eq, Ord , Data)
-- | The character-level lexer monad: plain megaparsec over strict 'Text'.
type Lexer = Parsec Void Text
107
108 -- type Lexer = Parsec Void Text ETokenStream
109
-- | Non-semantic source content attached to the token that follows it.
data Trivia = WhiteSpace Text | LineComment Text | BlockComment Text
    deriving (Show, Eq, Ord , Data)
112
-- | One enriched token: the classified lexeme plus the raw text it was
-- lexed from, the trivia preceding it, and its offsets in the source.
data ETok = ETok
    { offsets :: Offsets -- where the token sits in the input
    , trivia :: [Trivia] -- whitespace/comments preceding the token
    , lexeme :: Lexeme -- the classified lexeme
    , capture :: Text -- exact source text of the lexeme (no trivia)
    }
    deriving (Eq, Ord,Show , Data)
120
instance Pr.Pretty ETok where
    -- Render the leading line comments, then the captured token text.
    pretty tok = Pr.unAnnotate (comments Pr.<> body)
        where
            (comments, body) = prettySplitComments tok
123
-- | Split a token's rendering into two documents: its leading line
-- comments (each followed by a hard line break) and the captured token
-- text. Whitespace and block-comment trivia are not rendered here.
prettySplitComments :: ETok -> (Pr.Doc ann, Pr.Doc ann)
prettySplitComments (ETok _ tr _ cap) = (commentDocs, Pr.pretty cap)
    where
        commentDocs = Pr.hcat [Pr.pretty t Pr.<> Pr.hardline | LineComment t <- tr]
126
127
-- | Length of the token including its leading trivia.
totalLength :: ETok -> Int
totalLength = oTotalLength . offsets

-- | Length of the token text alone, trivia excluded.
trueLength :: ETok -> Int
trueLength = oTokenLength . offsets

-- tokenStart :: ETok -> Int
-- tokenStart (ETok{offsets = (Offsets _ s _ _ _)}) = s

-- | Offset at which the token (trivia included) begins.
tokenOffset :: ETok -> Int
tokenOffset = oStart . offsets

-- | Offset at which the token text itself begins, i.e. just past the trivia.
tokenStartOffset :: ETok -> Int
tokenStartOffset t = oStart o + (oTotalLength o - oTokenLength o)
    where o = offsets t

-- | Source position where the leading trivia begins.
trueStart :: ETok -> SourcePos
trueStart = oTrueStart . offsets

-- | Source position where the lexeme itself begins (after trivia).
tokenSourcePos :: ETok -> SourcePos
tokenSourcePos = oSourcePos . offsets

-- | Source position immediately after the token.
sourcePosAfter :: ETok -> SourcePos
sourcePosAfter = oEndPos . offsets
150
-- | Construct a token; a named alias for the 'ETok' constructor.
makeToken :: Offsets -> [Trivia] -> Lexeme -> Text -> ETok
makeToken = ETok
153
-- | Make a zero-width token carrying the given lexeme, placed immediately
-- before the given token (same position, empty capture, no trivia).
nullBefore :: Lexeme -> ETok -> ETok
nullBefore lxm anchor =
    let pos = tokenSourcePos anchor
        offs = Offsets (tokenStartOffset anchor) 0 0 pos pos pos
    in ETok offs [] lxm ""
-- | A lexing failure, carrying megaparsec's pretty-printed error bundle.
newtype LexerError = LexerError String
    deriving (Show)
162
-- | Run the lexer over the given source text. The optional 'FilePath'
-- (defaulting to \"Lexer\") is only used in error reporting.
runLexer :: Text -> Maybe FilePath -> Either LexerError ETokenStream
runLexer src srcName =
    case runParser eLex (fromMaybe "Lexer" srcName) src of
        Left bundle -> Left (LexerError (errorBundlePretty bundle))
        Right toks -> Right (ETokenStream src toks)
167
168
-- | Lex a whole input: as many tokens as possible followed by one final
-- EOF token (which carries any trailing trivia).
eLex :: Lexer [ETok]
eLex = do
    body <- many (try aToken)
    final <- pEOF
    return (body ++ [final])
175
-- | Lex one token: leading trivia then a lexeme, recording offsets and
-- source positions both before and after the trivia.
aToken :: Lexer ETok
aToken = do
    triviaOffset <- getOffset
    triviaPos <- getSourcePos
    triv <- pTrivia
    lexemeOffset <- getOffset
    lexemePos <- getSourcePos
    (lxm, cap) <- aLexeme
    endOffset <- getOffset
    endPos <- getSourcePos
    let offs = Offsets triviaOffset (endOffset - triviaOffset) (endOffset - lexemeOffset) triviaPos lexemePos endPos
    return (makeToken offs triv lxm cap)
187
-- | Lex the end of input as an 'L_EOF' token, attaching any trailing trivia.
pEOF :: Lexer ETok
pEOF = do
    triviaOffset <- getOffset
    triviaPos <- getSourcePos
    triv <- pTrivia
    lexemeOffset <- getOffset
    lexemePos <- getSourcePos
    eof
    endOffset <- getOffset
    endPos <- getSourcePos
    let offs = Offsets triviaOffset (endOffset - triviaOffset) (endOffset - lexemeOffset) triviaPos lexemePos endPos
    return (makeToken offs triv L_EOF "")
199
200
-- | Parse any lexeme, falling back to capturing unexpected input as an
-- 'LUnexpected' token so the lexer itself never fails on bad characters.
aLexeme :: Lexer (Lexeme,Text)
aLexeme = aLexemeStrict <|> pFallback
203
-- | Parse a genuine lexeme (no fallback). Alternatives, in order:
-- numeric literal, fixed lexemes (keywords/operators), identifier,
-- quoted identifier, metavariable. Each alternative is wrapped in 'try'
-- so a failed attempt consumes no input.
aLexemeStrict :: Lexer (Lexeme,Text)
aLexemeStrict =
    try
        pNumber
        <|> try (choice (map pLexeme lexemes) <?> "Lexeme")
        <|> try pIdentifier
        <|> try pQuotedIdentifier
        <|> try pMetaVar
212
213
-- | Parse an unsigned integer literal: one or more decimal digits.
pNumber :: Lexer (Lexeme,Text)
pNumber =
    ( do
        digits <- takeWhile1P Nothing (`elem` ("0123456789" :: String))
        -- 'read' is safe here: the capture is guaranteed non-empty and all digits.
        return (LIntLiteral (read (T.unpack digits)), digits)
    )
        <?> "Numeric Literal"
220
-- | Parse a metavariable: an @&@ immediately followed by an identifier.
pMetaVar :: Lexer (Lexeme,Text)
pMetaVar = do
    ampersand <- chunk "&"
    (_, name) <- pIdentifier
    return (LMetaVar name, ampersand `T.append` name)
226
-- | Parse an identifier: one or more first-letter characters followed by
-- any number of identifier-continuation characters.
pIdentifier :: Lexer (Lexeme,Text)
pIdentifier =
    ( do
        leading <- takeWhile1P Nothing isIdentifierFirstLetter
        trailing <- takeWhileP Nothing isIdentifierLetter
        let name = leading `T.append` trailing
        return (LIdentifier name, name)
    )
        <?> "Identifier"
235
-- | Parse a double-quoted identifier; the quotes are kept in the name.
pQuotedIdentifier :: Lexer (Lexeme,Text)
pQuotedIdentifier = do
    name <- quoted
    return (LIdentifier name, name)
240
-- | Recovery parser: capture input that matches no lexeme as 'LUnexpected'.
--
-- NOTE(review): 'pTrivia' is built with 'many' and therefore always
-- succeeds, so 'somethingValid' never fails; 'someTill' thus stops after
-- exactly one character and each bad character becomes its own
-- 'LUnexpected' token. Confirm one-character-at-a-time recovery is intended.
pFallback :: Lexer (Lexeme,Text)
pFallback = do
    q <- T.pack <$> someTill anySingle (lookAhead $ try somethingValid)
    return (LUnexpected q,q)
    where
        somethingValid :: Lexer ()
        somethingValid = void pTrivia <|> void aLexemeStrict <|> eof
248
-- | Parse one fixed lexeme from its source text. When the lexeme ends in
-- an identifier character (e.g. a keyword), require that no identifier
-- character follows, so e.g. \"innermost\" is not lexed as \"in\".
pLexeme :: (Text, Lexeme) -> Lexer (Lexeme,Text)
pLexeme (txt, lxm) =
    ( do
        cap <- string txt
        -- 'T.last' assumes lexeme texts are never empty — holds for the
        -- fixed lexeme table this is applied to.
        notFollowedBy (if isIdentifierLetter (T.last cap) then identTail else empty)
        return (lxm, cap)
    )
        <?> ("Lexeme :" ++ show lxm)
    where
        identTail = takeWhile1P Nothing isIdentifierLetter
257
-- | Zero or more pieces of trivia (whitespace, line or block comments);
-- always succeeds, possibly with the empty list.
pTrivia :: Lexer [Trivia]
pTrivia = many (whiteSpace <|> lineComment <|> blockComment)
260
-- | One maximal run of whitespace characters.
whiteSpace :: Lexer Trivia
whiteSpace = WhiteSpace . T.pack <$> some spaceChar
265
-- | A double-quoted chunk, returned with both quote characters included.
quoted :: Lexer Text
quoted = do
    openQuote <- char '\"'
    (body, closeQuote) <- manyTill_ anySingle (char '\"')
    return (T.pack (openQuote : body ++ [closeQuote]))
271
-- | A line terminator: an end-of-line sequence, or end of input (as "").
lineEnd :: Lexer [Char]
lineEnd = (T.unpack <$> eol) <|> (eof >> return [])
274
-- | A line comment: @$@ up to and including the end of line.
lineComment :: Lexer Trivia
lineComment = do
    _ <- try (chunk "$")
    (body, terminator) <- manyTill_ anySingle lineEnd
    return (LineComment (T.pack ('$' : (body ++ terminator))))
280
-- | A block comment: @/*@ up to @*/@, or to end of input if unterminated.
blockComment :: Lexer Trivia
blockComment = do
    _ <- try (chunk "/*")
    body <- manyTill L.anySingle (lookAhead (void (chunk "*/") <|> eof))
    -- The closer is optional so an unterminated comment still lexes.
    closer <- optional (chunk "*/")
    return (BlockComment (T.concat ["/*", T.pack body, fromMaybe "" closer]))
288
289
-- | The token stream fed to the main parser: the source text together
-- with the tokens lexed from it.
data ETokenStream = ETokenStream
    { streamSourceText :: Text -- the source the tokens came from
    , streamTokens :: [ETok] -- the lexed tokens, in order
    }
-- Megaparsec 'Stream' instance so parsers can consume 'ETok's directly.
instance Stream ETokenStream where
    type Token ETokenStream = ETok
    type Tokens ETokenStream = [ETok]
    tokenToChunk _ tok = [tok]
    tokensToChunk _ toks = toks
    chunkToTokens _ toks = toks
    chunkLength _ = length
    chunkEmpty _ = null
    take1_ :: ETokenStream -> Maybe (Token ETokenStream, ETokenStream)
    take1_ (ETokenStream _ toks) = case toks of
        [] -> Nothing
        (t : rest) -> Just (t, buildStream rest)
    takeN_ :: Int -> ETokenStream -> Maybe (Tokens ETokenStream, ETokenStream)
    takeN_ n stream@(ETokenStream _ toks)
        | n <= 0 = Just ([], stream)
        | null toks = Nothing
        | otherwise = let (front, rest) = splitAt n toks in Just (front, buildStream rest)
    takeWhile_ :: (Token ETokenStream -> Bool) -> ETokenStream -> (Tokens ETokenStream, ETokenStream)
    takeWhile_ p (ETokenStream _ toks) = (front, buildStream rest)
        where
            (front, rest) = span p toks
315
316 -- (takeWhile p xs,ETokenStream $ dropWhile p xs)
317
-- | Rebuild an 'ETokenStream' from a token list. The backing source text
-- is reconstructed by reforming the remaining tokens; the previous
-- implementation stored the leftover placeholder literal
-- "showTokens pxy s" as the source text instead.
buildStream :: [ETok] -> ETokenStream
buildStream xs = case NE.nonEmpty xs of
    Nothing -> ETokenStream "" xs
    Just _ -> ETokenStream (L.toStrict (reformList xs)) xs
322
-- How tokens are displayed in error excerpts and how wide they count.
instance VisualStream ETokenStream where
    showTokens _ toks = L.unpack (reformList toks)
    tokensLength _ toks = sum (fmap trueLength toks)
326
327 -- https://markkarpov.com/tutorial/megaparsec.html#working-with-custom-input-streams
-- | Lets megaparsec translate an offset (token index) into a source
-- position and a source-line excerpt when rendering error messages.
instance TraversableStream ETokenStream where
    reachOffset o PosState{..} =
        ( Just (prefix ++ restOfLine)
        , PosState
            { pstateInput = buildStream post
            , pstateOffset = max pstateOffset o
            , pstateSourcePos = newSourcePos
            , pstateTabWidth = pstateTabWidth
            , pstateLinePrefix = prefix
            }
        )
        where
            -- Still on the same line: extend the previously accumulated
            -- line prefix; otherwise start a fresh one.
            prefix =
                if sameLine
                    then pstateLinePrefix ++ preLine
                    else preLine
            sameLine = sourceLine newSourcePos == sourceLine pstateSourcePos
            -- New position is the start of the first unconsumed token, or
            -- the old position when the input is exhausted.
            newSourcePos =
                case post of
                    [] -> pstateSourcePos
                    (x : _) -> tokenSourcePos x
            -- Split the token list at the requested offset (a token count,
            -- relative to the current state offset).
            (pre, post) :: ([ETok], [ETok]) = splitAt (o - pstateOffset) (streamTokens pstateInput)
            (preStr, postStr) = (maybe "" (showTokens pxy) (NE.nonEmpty pre), maybe "" (showTokens pxy) (NE.nonEmpty post))
            -- Text of the current line before/after the offset.
            preLine = reverse . takeWhile (/= '\n') . reverse $ preStr
            restOfLine = takeWhile (/= '\n') postStr
352 restOfLine = takeWhile (/= '\n') postStr
353
-- | Type proxy used to select the 'ETokenStream' instance of
-- 'showTokens' in the 'TraversableStream' instance above.
pxy :: Proxy ETokenStream
pxy = Proxy
356
357
358
359 -- instance Show ETok where
360 -- show (ETok _ _ _ q) = show q
361
362
363
364 -- instance TraversableStream ETokenStream where
365 -- reachOffset i s = (Nothing, s)
366 -- reachOffsetNoLine i s = s