never executed always true always false
1 {-# LANGUAGE FlexibleInstances #-}
2 {-# LANGUAGE InstanceSigs #-}
3 {-# LANGUAGE OverloadedStrings #-}
4 {-# LANGUAGE RecordWildCards #-}
5 {-# LANGUAGE TypeFamilies #-}
6 {-# LANGUAGE DeriveDataTypeable #-}
7 {-# LANGUAGE DeriveGeneric #-}
8 {-# LANGUAGE TypeFamilies #-}
9 {-# OPTIONS_GHC -Wno-unrecognised-pragmas #-}
10
11 module Conjure.Language.Lexer
12 ( Lexeme(..)
13 , LexemePos(..)
14 , textToLexeme,
15 ETok(..),
16 Offsets(..),
17 Reformable(..),
18 prettySplitComments,
19 eLex,
20 reformList,
21 tokenSourcePos,
22 sourcePosAfter,
23 totalLength,
24 trueLength,
25 trueStart,
26 tokenOffset,
27 tokenStartOffset,
28 sourcePos0,
29 nullBefore,
30 LexerError(..),
31 runLexer,
32 ETokenStream(..)
33 , lexemeText
34 ) where
35
36 import Conjure.Language.Lexemes
37 import Conjure.Prelude hiding (many, some,Text)
38 import Data.Char (isAlpha, isAlphaNum)
39 import Data.Void
40
41 import qualified Data.Text as T
42 import qualified Data.Text.Lazy as L
43 import Text.Megaparsec hiding (State)
44 import Text.Megaparsec.Char
45
46 import Data.List (splitAt)
47 import qualified Data.List.NonEmpty as NE
48 import qualified Text.Megaparsec as L
49 import Prelude (read)
50 import qualified Prettyprinter as Pr
51 import Conjure.Prelude hiding (some,many)
52
53
54 import Text.Megaparsec.Stream ()
55
56
-- | A lexeme paired with the source span it occupies.
data LexemePos = LexemePos
                    Lexeme      -- the lexeme
                    SourcePos   -- source position, the beginning of this lexeme
                    SourcePos   -- source position, just after this lexeme, including whitespace after the lexeme
    deriving (Show,Eq, Ord)
62
-- | The initial source position: empty file name, line 1, column 1.
sourcePos0 :: SourcePos
sourcePos0 =
    let firstPos = mkPos 1
    in SourcePos "" firstPos firstPos
65
-- | Things that can be rendered back to the exact source text they were lexed from.
class Reformable a where
    reform :: a -> L.Text
68
-- | Rebuild the original source text of a token: all leading trivia first,
-- then the captured lexeme text. Zero-length tokens render as nothing.
instance Reformable ETok where
    reform tok | totalLength tok == 0 = ""
    reform (ETok{capture = cap, trivia = triv}) =
        foldMap renderTrivia triv <> L.fromStrict cap
      where
        renderTrivia :: Trivia -> L.Text
        renderTrivia (WhiteSpace txt)   = L.fromStrict txt
        renderTrivia (LineComment txt)  = L.fromStrict txt
        renderTrivia (BlockComment txt) = L.fromStrict txt
78
-- | Render a whole collection of reformable things back to source text.
reformList :: (Traversable t ,Reformable a) => t a -> L.Text
reformList = foldMap reform . toList
81
-- | The emoji code points that are permitted inside identifiers.
emojis :: [Char]
emojis = concatMap expand ranges
  where
    expand (lo, hi) = [lo .. hi]
    ranges =
        [ ('\x1f600', '\x1F64F')
        , ('\x1f300', '\x1f5ff')
        , ('\x1f680', '\x1f999')
        , ('\x1f1e0', '\x1f1ff')
        ]
90
-- | May a character start an identifier? Letters, underscore, or an emoji.
isIdentifierFirstLetter :: Char -> Bool
isIdentifierFirstLetter ch =
    isAlpha ch
        || ch == '_'
        || ch `elem` emojis
93
-- | May a character continue an identifier? Alphanumerics, underscore,
-- prime, or an emoji.
isIdentifierLetter :: Char -> Bool
isIdentifierLetter ch =
    isAlphaNum ch
        || ch == '_'
        || ch == '\''
        || ch `elem` emojis
96
-- | Position and size bookkeeping for a single token, including its leading trivia.
data Offsets = Offsets {
    oStart::Int, -- the starting offset of the token (including whitespace)
    oTotalLength::Int, -- the total length of the token, trivia included
    oTokenLength::Int, -- the length of the token excluding trivia
    oTrueStart :: SourcePos, -- start pos of the token (before its trivia)
    oSourcePos::SourcePos, -- start pos of the lexeme (after its trivia)
    oEndPos::SourcePos} -- pos just after the token
    deriving (Show, Eq, Ord , Data)
-- | The character-level lexer: megaparsec over strict 'Text' with no custom error type.
type Lexer = Parsec Void Text

-- type Lexer = Parsec Void Text ETokenStream
108
-- | Non-semantic source text (whitespace and comments) attached to the token that follows it.
data Trivia = WhiteSpace Text | LineComment Text | BlockComment Text
    deriving (Show, Eq, Ord , Data)
111
-- | An enriched token: a lexeme plus its offsets, leading trivia,
-- and the exact source text it was lexed from.
data ETok = ETok
    { offsets :: Offsets  -- ^ position and length information
    , trivia :: [Trivia]  -- ^ whitespace\/comments preceding the lexeme
    , lexeme :: Lexeme    -- ^ the recognised lexeme
    , capture :: Text     -- ^ the exact source text of the lexeme
    }
    deriving (Eq, Ord,Show , Data)
119
-- | Pretty-print a token: its line comments followed by its captured text, with
-- all annotations stripped.
instance Pr.Pretty ETok where
    pretty tok =
        let (comments, body) = prettySplitComments tok
        in Pr.unAnnotate (comments Pr.<> body)
122
-- | Split a token's rendering into (line comments, captured text). Only
-- 'LineComment' trivia appear in the first component, one per line.
prettySplitComments :: ETok -> (Pr.Doc ann, Pr.Doc ann)
prettySplitComments (ETok _ triv _ cap) =
    ( Pr.hcat [Pr.pretty c Pr.<> Pr.hardline | LineComment c <- triv]
    , Pr.pretty cap
    )
125
126
-- | Length of the token including its leading trivia.
totalLength :: ETok -> Int
totalLength tok = oTotalLength (offsets tok)
129
-- | Length of the lexeme itself, trivia excluded.
trueLength :: ETok -> Int
trueLength tok = oTokenLength (offsets tok)
132
133 -- tokenStart :: ETok -> Int
134 -- tokenStart (ETok{offsets = (Offsets _ s _ _ _)}) = s
-- | Offset at which the token (including its trivia) begins.
tokenOffset :: ETok -> Int
tokenOffset tok = oStart (offsets tok)
-- | Offset at which the lexeme itself begins: the token start plus the trivia length.
tokenStartOffset :: ETok -> Int
tokenStartOffset tok =
    let o = offsets tok
        triviaLen = oTotalLength o - oTokenLength o
    in oStart o + triviaLen
140
-- | Source position where the token (including its trivia) begins.
trueStart :: ETok -> SourcePos
trueStart tok = oTrueStart (offsets tok)
143
-- | Source position of the lexeme itself (after any leading trivia).
tokenSourcePos :: ETok -> SourcePos
tokenSourcePos tok = oSourcePos (offsets tok)
146
-- | Source position just after the token.
sourcePosAfter :: ETok -> SourcePos
sourcePosAfter tok = oEndPos (offsets tok)
149
-- | Construct a token; a named synonym for the 'ETok' constructor.
makeToken :: Offsets -> [Trivia] -> Lexeme -> Text -> ETok
makeToken offs triv lexm cap = ETok offs triv lexm cap
152
-- | Make a zero-width token carrying the given lexeme, positioned
-- immediately before the given token.
nullBefore :: Lexeme -> ETok -> ETok
nullBefore lexm tok = ETok offs [] lexm ""
  where
    pos = tokenSourcePos tok
    offs = Offsets (tokenStartOffset tok) 0 0 pos pos pos
-- | A lexer failure, carrying the pretty-printed megaparsec error bundle.
newtype LexerError = LexerError String
    deriving (Show)
161
-- | Run the lexer over the given text. The optional file path (defaulting to
-- @"Lexer"@) is only used in error messages.
runLexer :: Text -> Maybe FilePath -> Either LexerError ETokenStream
runLexer txt fp =
    either
        (Left . LexerError . errorBundlePretty)
        (Right . ETokenStream txt)
        (runParser eLex (fromMaybe "Lexer" fp) txt)
166
167
-- | Lex an entire input: as many tokens as possible, then the EOF token
-- (which carries any trailing trivia).
eLex :: Lexer [ETok]
eLex = do
    toks <- many (try aToken)
    eofTok <- pEOF
    pure (toks ++ [eofTok])
174
-- | Lex one token: leading trivia, then a lexeme, recording every offset and
-- source position along the way.
aToken :: Lexer ETok
aToken = do
    triviaOffset <- getOffset
    triviaPos <- getSourcePos
    triv <- pTrivia
    lexemeOffset <- getOffset
    lexemePos <- getSourcePos
    (lexm, cap) <- aLexeme
    endOffset <- getOffset
    endPos <- getSourcePos
    let offs = Offsets triviaOffset (endOffset - triviaOffset) (endOffset - lexemeOffset) triviaPos lexemePos endPos
    pure (makeToken offs triv lexm cap)
186
-- | Lex the end of input as a zero-width 'L_EOF' token, still capturing any
-- trailing trivia before it.
pEOF :: Lexer ETok
pEOF = do
    triviaOffset <- getOffset
    triviaPos <- getSourcePos
    triv <- pTrivia
    lexemeOffset <- getOffset
    lexemePos <- getSourcePos
    eof
    endOffset <- getOffset
    endPos <- getSourcePos
    let offs = Offsets triviaOffset (endOffset - triviaOffset) (endOffset - lexemeOffset) triviaPos lexemePos endPos
    pure (makeToken offs triv L_EOF "")
198
199
-- | A lexeme, or — when nothing matches — a fallback token holding the unexpected text.
aLexeme :: Lexer (Lexeme,Text)
aLexeme = aLexemeStrict <|> pFallback
202
-- | The recognised lexemes, tried in priority order: number literals,
-- fixed keywords/operators, identifiers, quoted identifiers, metavariables.
aLexemeStrict :: Lexer (Lexeme,Text)
aLexemeStrict =
    try
        pNumber
        <|> try (choice (map pLexeme lexemes) <?> "Lexeme")
        <|> try pIdentifier
        <|> try pQuotedIdentifier
        <|> try pMetaVar
211
212
213 pNumber :: Lexer (Lexeme,Text)
214 pNumber = do
215 v <- takeWhile1P Nothing (`elem` ['1','2','3','4','5','6','7','8','9','0'])
216 let n = read $ T.unpack v
217 return (LIntLiteral n,v)
218 <?> "Numeric Literal"
219
-- | A metavariable: an ampersand immediately followed by an identifier.
-- The capture includes the ampersand; the lexeme payload does not.
pMetaVar :: Lexer (Lexeme,Text)
pMetaVar = do
    amp <- chunk "&"
    (_, name) <- pIdentifier
    pure (LMetaVar name, amp `T.append` name)
225
-- | An identifier: one or more first-letter characters (alpha\/underscore\/emoji)
-- followed by any number of identifier letters.
pIdentifier :: Lexer (Lexeme,Text)
pIdentifier =
    ( do
        firstLetters <- takeWhile1P Nothing isIdentifierFirstLetter
        restLetters <- takeWhileP Nothing isIdentifierLetter
        let ident = firstLetters `T.append` restLetters
        pure (LIdentifier ident, ident)
    )
        <?> "Identifier"
234
-- | A double-quoted identifier; the quotes are kept in the identifier text.
pQuotedIdentifier :: Lexer (Lexeme,Text)
pQuotedIdentifier = do
    txt <- quoted
    pure (LIdentifier txt, txt)
239
-- | Consume characters that match nothing else, up to the next point where
-- lexing could resume (trivia, a valid lexeme, or end of input), and wrap
-- them in an 'LUnexpected' token.
pFallback :: Lexer (Lexeme,Text)
pFallback = do
    bad <- T.pack <$> someTill anySingle (lookAhead (try somethingValid))
    pure (LUnexpected bad, bad)
  where
    somethingValid :: Lexer ()
    somethingValid = void pTrivia <|> void aLexemeStrict <|> eof
247
-- | Lex one fixed lexeme (keyword or operator). When the matched text ends in
-- an identifier letter, require that it is not followed by further identifier
-- letters, so a keyword never matches a prefix of a longer identifier.
pLexeme :: (Text, Lexeme) -> Lexer (Lexeme,Text)
pLexeme (s, l) =
    ( do
        tok <- string s
        -- BUGFIX: 'T.last' is partial; guard against an empty lexeme spelling
        -- so we never crash on it. Non-empty spellings behave as before.
        notFollowedBy $
            if not (T.null tok) && isIdentifierLetter (T.last tok)
                then nonIden
                else empty
        return (l, tok)
    )
        <?> ("Lexeme :" ++ show l)
  where
    -- one-or-more identifier letters: the "don't split an identifier" check
    nonIden = takeWhile1P Nothing isIdentifierLetter
256
-- | Zero or more pieces of trivia: whitespace, line comments, block comments.
pTrivia :: Lexer [Trivia]
pTrivia = many (whiteSpace <|> lineComment <|> blockComment)
259
-- | One or more whitespace characters, captured verbatim.
whiteSpace :: Lexer Trivia
whiteSpace = WhiteSpace . T.pack <$> some spaceChar
264
-- | A double-quoted chunk, returned with both quote characters included.
quoted :: Lexer Text
quoted = do
    openQuote <- char '\"'
    (body, closeQuote) <- manyTill_ anySingle (char '\"')
    pure (T.pack (openQuote : body ++ [closeQuote]))
270
-- | A newline sequence, or the empty string at end of input.
lineEnd :: Lexer [Char]
lineEnd = (T.unpack <$> eol) <|> ([] <$ eof)
273
-- | A line comment: a dollar sign up to (and including) the end of the line.
lineComment :: Lexer Trivia
lineComment = do
    _ <- try (chunk "$")
    (body, terminator) <- manyTill_ anySingle lineEnd
    pure (LineComment (T.pack ('$' : body ++ terminator)))
279
-- | A block comment: @/*@ up to a closing @*/@, tolerating an unterminated
-- comment at end of input (the missing close renders as nothing).
blockComment :: Lexer Trivia
blockComment = do
    _ <- try (chunk "/*")
    body <- manyTill L.anySingle (lookAhead (void (chunk "*/") <|> eof))
    closing <- optional (chunk "*/")
    pure (BlockComment (T.concat ["/*", T.pack body, fromMaybe "" closing]))
287
288
-- | The token stream handed to the parser: the original source text together
-- with the tokens lexed from it.
data ETokenStream = ETokenStream
    { streamSourceText :: Text  -- ^ the source text the tokens came from
    , streamTokens :: [ETok]    -- ^ the tokens themselves
    }
-- | Let megaparsec consume an 'ETokenStream' one 'ETok' at a time; chunks are
-- plain token lists.
instance Stream ETokenStream where
    type Token ETokenStream = ETok
    type Tokens ETokenStream = [ETok]
    tokenToChunk _ tok = [tok]
    tokensToChunk _ toks = toks
    chunkToTokens _ = id
    chunkLength _ = length
    chunkEmpty _ = null
    take1_ :: ETokenStream -> Maybe (Token ETokenStream, ETokenStream)
    take1_ (ETokenStream _ toks) = case toks of
        [] -> Nothing
        (t : rest) -> Just (t, buildStream rest)
    takeN_ :: Int -> ETokenStream -> Maybe (Tokens ETokenStream, ETokenStream)
    takeN_ n stream@(ETokenStream _ toks)
        | n <= 0 = Just ([], stream)
        | null toks = Nothing
        | otherwise = Just (taken, buildStream rest)
      where
        (taken, rest) = splitAt n toks
    takeWhile_ :: (Token ETokenStream -> Bool) -> ETokenStream -> (Tokens ETokenStream, ETokenStream)
    takeWhile_ p (ETokenStream _ toks) =
        let (matching, remaining) = span p toks
        in (matching, buildStream remaining)
314
315 -- (takeWhile p xs,ETokenStream $ dropWhile p xs)
316
-- | Rebuild an 'ETokenStream' from a token list.
-- NOTE(review): for a non-empty list the stored source text is the literal
-- placeholder string "showTokens pxy s", not the reformed token text — this
-- looks like leftover stub/debug code; confirm nothing reads
-- 'streamSourceText' off a rebuilt stream before relying on it.
buildStream :: [ETok] -> ETokenStream
buildStream xs = case NE.nonEmpty xs of
    Nothing -> ETokenStream "" xs
    Just _ -> ETokenStream (T.pack "showTokens pxy s") xs
321
-- | How token chunks appear in megaparsec error messages: their reformed
-- source text; lengths count only the lexemes, not their trivia.
instance VisualStream ETokenStream where
    showTokens _ toks = L.unpack (reformList toks)
    tokensLength _ toks = sum (fmap trueLength toks)
325
326 -- https://markkarpov.com/tutorial/megaparsec.html#working-with-custom-input-streams
-- | Map absolute offsets back to source positions and line excerpts for
-- megaparsec error reporting over the custom token stream.
instance TraversableStream ETokenStream where
    reachOffset o PosState{..} =
        ( Just (prefix ++ restOfLine)
        , PosState
            { pstateInput = buildStream post
            , pstateOffset = max pstateOffset o
            , pstateSourcePos = newSourcePos
            , pstateTabWidth = pstateTabWidth
            , pstateLinePrefix = prefix
            }
        )
      where
        -- text of the current line up to the offset; reuse the previous prefix
        -- only when we are still on the same line
        prefix =
            if sameLine
                then pstateLinePrefix ++ preLine
                else preLine
        sameLine = sourceLine newSourcePos == sourceLine pstateSourcePos
        -- position of the first token at/after the offset, or the old position
        -- when the stream is exhausted
        newSourcePos =
            case post of
                [] -> pstateSourcePos
                (x : _) -> tokenSourcePos x
        -- tokens before/after the requested offset, counted from pstateOffset
        (pre, post) :: ([ETok], [ETok]) = splitAt (o - pstateOffset) (streamTokens pstateInput)
        (preStr, postStr) = (maybe "" (showTokens pxy) (NE.nonEmpty pre), maybe "" (showTokens pxy) (NE.nonEmpty post))
        preLine = reverse . takeWhile (/= '\n') . reverse $ preStr
        restOfLine = takeWhile (/= '\n') postStr

-- proxy pinning 'showTokens' to this stream type
pxy :: Proxy ETokenStream
pxy = Proxy
355
356
357
358 -- instance Show ETok where
359 -- show (ETok _ _ _ q) = show q
360
361
362
363 -- instance TraversableStream ETokenStream where
364 -- reachOffset i s = (Nothing, s)
365 -- reachOffsetNoLine i s = s