clean up some more'
[cc1516.git] / lex.icl
1 implementation module lex
2
3 import Control.Monad, Control.Applicative
4 import Data.Either, Data.Func
5 from StdFunc import o
6 import StdBool
7 import StdList
8 import StdChar
9 import StdString
10
11 import yard
12
13 :: LexItem
14 = LexToken Int TokenValue
15 | LexSpace Int Int
16 | LexNL
17 | LexEOF
18 | LexItemError String
19
20 lexer :: [Char] -> LexerOutput
21 lexer r = case runParser (lexProgram 1 1) r of
22 (Right p, _) = Right p
23 (Left e, _) = Left e
24
25 lexProgram :: Int Int -> Parser Char [Token]
26 lexProgram line column = lexToken >>= \t->case t of
27 LexEOF = pure []
28 LexNL = lexProgram (line+1) 1
29 (LexSpace l c) = lexProgram (line+l) (column+c)
30 (LexItemError e) = fail <?>
31 PositionalError line column ("LexerError: " +++ e)
32 (LexToken c t) = lexProgram line (column+c)
33 >>= \rest->pure [{line=line, column=column, token=t}:rest]
34
35 lexToken :: Parser Char LexItem
36 lexToken =
37 //Comments
38 lexBlockComment <|> lexComment <|>
39 //Keyword tokens
40 lexKw "var" VarToken <|> lexKw "Void" VoidToken <|>
41 lexKw "return" ReturnToken <|> lexKw "if" IfToken <|>
42 lexKw "else" ElseToken <|> lexKw "while" WhileToken <|>
43 lexKw "True" TrueToken <|> lexKw "False" FalseToken <|>
44 lexKw "Int" IntTypeToken <|> lexKw "Bool" BoolTypeToken <|>
45 lexKw "Char" CharTypeToken <|>
46 //Character tokens
47 lexEscape <|> lexCharacter <|>
48 //Two char ops tokens
49 lexWord "::" DoubleColonToken <|> lexWord "!=" NotEqualToken <|>
50 lexWord "<=" LesserEqToken <|> lexWord ">=" GreaterEqToken <|>
51 lexWord "==" EqualsToken <|> lexWord "&&" AmpersandsToken <|>
52 lexWord "||" PipesToken <|> lexWord "[]" EmptyListToken <|>
53 lexWord "->" ArrowToken <|>
54 //One char ops tokens
55 lexWord "(" BraceOpenToken <|> lexWord ")" BraceCloseToken <|>
56 lexWord "{" CBraceOpenToken <|> lexWord "}" CBraceCloseToken <|>
57 lexWord "[" SquareOpenToken <|> lexWord "]" SquareCloseToken <|>
58 lexWord "," CommaToken <|> lexWord ":" ColonToken <|>
59 lexWord ";" SColonToken <|> lexWord "." DotToken <|>
60 lexWord "+" PlusToken <|> lexWord "*" StarToken <|>
61 lexWord "/" SlashToken <|> lexWord "%" PercentToken <|>
62 lexWord "=" AssignmentToken <|> lexWord "<" LesserToken <|>
63 lexWord ">" BiggerToken <|> lexWord "!" ExclamationToken <|>
64 lexWord "-" DashToken <|>
65 //Number and identifier tokens
66 lexNumber <|> lexIdentifier <|>
67 (item '\n' >>| pure LexNL) <|>
68 //Whitespace
69 (satisfy isSpace >>| (pure $ LexSpace 0 1)) <|>
70 (eof >>| pure LexEOF)
71 where
72 lexWord :: String TokenValue -> Parser Char LexItem
73 lexWord s tv = list ls >>| pure (LexToken (length ls) tv)
74 where ls = fromString s
75
76 lexKw :: String TokenValue -> Parser Char LexItem
77 lexKw kw tv = lexWord kw tv <* check (not o isIdentChar)
78
79 lexComment :: Parser Char LexItem
80 lexComment = list (fromString "//")
81 >>| top until (eof <|> (item '\n' >>| pure Void)) >>| pure LexNL
82
83 lexBlockComment :: Parser Char LexItem
84 lexBlockComment = list (fromString "/*")
85 >>| (top until (list (fromString "*/")))
86 >>= \chars->pure $ widthHeight chars 0 0
87 where
88 widthHeight :: [Char] Int Int -> LexItem
89 widthHeight [] l c = LexSpace l c
90 widthHeight ['\n':xs] l _ = widthHeight xs (l+1) 0
91 widthHeight [x:xs] l c = widthHeight xs l (c+1)
92
93 lexNumber :: Parser Char LexItem
94 lexNumber = some (satisfy isDigit) >>= \si->pure $
95 LexToken (length si) (NumberToken $ toInt $ toString si)
96
97 lexIdentifier :: Parser Char LexItem
98 lexIdentifier = some (satisfy isIdentChar)
99 >>= \si->pure $ LexToken (length si) (IdentToken $ toString si)
100
101 isIdentChar c = isAlphanum c || c == '_'
102
103 lexCharacter :: Parser Char LexItem
104 lexCharacter = item '\'' *> satisfy ((<>) '\'') <* item '\''
105 >>= \char->pure $ LexToken 3 (CharToken char)
106
107 lexEscape :: Parser Char LexItem
108 lexEscape = item '\'' *> item '\\' *> top <* item '\''
109 >>= \char->pure case char of
110 'a' = LexToken 4 (CharToken $ toChar 7)
111 'b' = LexToken 4 (CharToken '\b')
112 'b' = LexToken 4 (CharToken '\b')
113 'f' = LexToken 4 (CharToken '\f')
114 'n' = LexToken 4 (CharToken '\n')
115 'r' = LexToken 4 (CharToken '\t')
116 'v' = LexToken 4 (CharToken '\v')
117 '\'' =LexToken 4 (CharToken '\'')
118 c = (LexItemError $ "Unknown escape: " +++ toString c)