X-Git-Url: https://git.martlubbers.net/?a=blobdiff_plain;f=lex.icl;h=aa34987969b5afcdc71c83451f91fe448ee9902f;hb=f1a53c1d5200e6122756e1b52b1cd4b58282968e;hp=75fab23c24aa48d8bc1120a32d83bc52dc6dc514;hpb=8789e58ffd30bae4181069aed5c30b82959961aa;p=cc1516.git diff --git a/lex.icl b/lex.icl index 75fab23..aa34987 100644 --- a/lex.icl +++ b/lex.icl @@ -1,263 +1,118 @@ implementation module lex -import Data.Either -import Data.List +import Control.Monad, Control.Applicative +import Data.Either, Data.Func, Data.Void +from StdFunc import o +import StdBool +import StdList +import StdChar import StdString -import System.CommandLine -import StdFile -import StdMisc -from StdFunc import id, const -import Data.Maybe -import Control.Applicative -import Control.Monad -import Control.Monad.State -import Data.Functor -from Data.Func import $ -from Text import class Text, instance Text String -import qualified Text +import StdTuple -lexer :: [Char] -> LexerOutput [Token] -lexer _ = undef +import yard +import AST -// Misschien moeten we hier continuation style van maken -instance toString (LexerOutput [Token]) where - toString (Left l) = "Error: " +++ l - toString (Right x) = 'Text'.concat (print 0 x) +:: LexItem + = LexToken Int TokenValue + | LexSpace Int Int + | LexNL + | LexEOF + | LexItemError String -print :: Int [Token] -> [String] -print i [(IdentToken l):rest] = [tab i:toString l:print i rest] -print i [(NumberToken j):rest] = [tab i:toString j:print i rest] -print i [(CharToken c):rest] = [tab i:"'":toString c:"'":print i rest] -print i [VarToken:rest] = [tab i:"var":print i rest] -print i [VoidToken:rest] = [tab i:"Void":print i rest] -print i [ReturnToken:rest] = [tab i:"return":print i rest] -print i [IfToken:rest] = [tab i:"if":print i rest] -print i [ElseToken:rest] = [tab i:"else":print i rest] -print i [WhileToken:rest] = [tab i:"while":print i rest] -print i [TrueToken:rest] = [tab i:"True":print i rest] -print i [FalseToken:rest] = [tab i:"False":print i rest] -print i [BraceOpenToken:rest] = [tab i:"(":print i rest] -print i [BraceCloseToken:rest] = [tab i:")":print i rest] -print i [CBraceOpenToken:rest] = [tab i:"{\n":print (i+1) rest] -print i [CBraceCloseToken:rest] = [tab i:"}\n":print (i-1) rest] -print i [SquareOpenToken:rest] = [tab i:"[":print i rest] -print i [SquareCloseToken:rest] = [tab i:"]":print i rest] -print i [CommaToken:rest] = [tab i:", ":print i rest] -print i [ColonToken:rest] = [tab i:":":print i rest] -print i [SColonToken:rest] = [tab i:";\n":print i rest] -print i [DotToken:rest] = [tab i:".":print i rest] -print i [PlusToken:rest] = [tab i:" + ":print i rest] -print i [DashToken:rest] = [tab i:" - ":print i rest] -print i [StarToken:rest] = [tab i:" * ":print i rest] -print i [SlashToken:rest] = [tab i:" / ":print i rest] -print i [PercentToken:rest] = [tab i:" % ":print i rest] -print i [AssignmentToken:rest] = [tab i:" = ":print i rest] -print i [LesserToken:rest] = [tab i:" < ":print i rest] -print i [BiggerToken:rest] = [tab i:" > ":print i rest] -print i [ExclamationToken:rest] = [tab i:"!":print i rest] -print i [DoubleColonToken:rest] = [tab i:" :: ":print i rest] -print i [LesserEqToken:rest] = [tab i:" <= ":print i rest] -print i [GreaterEqToken:rest] = [tab i:" >= ":print i rest] -print i [EqualsToken:rest] = [tab i:" == ":print i rest] -print i [AmpersandsToken:rest] = [tab i:" && ":print i rest] -print i [PipesToken:rest] = [tab i:" || ":print i rest] -print i [ArrowToken:rest] = [tab i:" -> ":print i rest] +lexer :: [Char] -> LexerOutput +lexer r = fst $ runParser (lexProgram 1 1) r -tab :: Int -> String -tab 0 = "" -tab i = "\t" +++ tab (i-1) +lexProgram :: Int Int -> Parser Char [Token] +lexProgram line column = lexToken >>= \t->case t of + LexEOF = pure [] + LexNL = lexProgram (line+1) 1 + (LexSpace l c) = lexProgram (line+l) (column+c) + (LexItemError e) = fail + PositionalError line column ("LexerError: " +++ e) + (LexToken c t) = lexProgram line (column+c) + >>= \rest->pure [({line=line,col=column}, t):rest] -lex :: [Char] -> LexerOutput [Token] -lex ['v':'a':'r':rest] = undef -lex ['V':'o':'i':'d':rest] = undef -lex ['r':'e':'t':'u':'r':'n':rest] = undef -lex ['i':'f':rest] = undef -lex ['e':'l':'s':'e':rest] = undef -lex ['w':'h':'i':'l':'e':rest] = undef -lex ['T':'r':'u':'e':rest] = undef -lex ['F':'a':'l':'s':'e':rest] = undef -lex [':':':':rest] = undef -lex ['<':'=':rest] = undef -lex ['>':'=':rest] = undef -lex ['=':'=':rest] = undef -lex ['&':'&':rest] = undef -lex ['|':'|':rest] = undef -lex ['-':'>':rest] = undef -lex ['(':rest] = undef -lex [')':rest] = undef -lex ['{':rest] = undef -lex ['{':rest] = undef -lex ['[':rest] = undef -lex [':rest]':rest] = undef -lex [',':rest] = undef -lex [':':rest] = undef -lex [';':rest] = undef -lex ['.':rest] = undef -lex ['+':rest] = undef -lex ['-':rest] = undef -lex ['*':rest] = undef -lex ['/':rest] = undef -lex ['%':rest] = undef -lex ['=':rest] = undef -lex ['<':rest] = undef -lex ['>':rest] = undef -lex ['!':rest] = undef -lex ['\'':'\\':x'\'':rest] = case x of - 'a' = undef // (CharToken '\a') - 'b' = undef // (CharToken '\b') - 'f' = undef // (CharToken '\f') - 'n' = undef // (CharToken '\n') - 'r' = undef // (CharToken '\r') - 't' = undef // (CharToken '\t') - 'v' = undef // (CharToken '\v') - _ = Left ("Illegal escape: \\" +++ x) -lex ['\'':x:'\'':rest] = undef -lex [x:xs] -| isSpace x = lex xs -| isDigit x = undef //Parse Int -| isAlpha x = undef //Parse ident -| otherwise = Left ("Unexpected character: " +++ toString x) +lexToken :: Parser Char LexItem +lexToken = + //Comments + lexBlockComment <|> lexComment <|> + //Keyword tokens + lexKw "var" VarToken <|> lexKw "Void" VoidToken <|> + lexKw "return" ReturnToken <|> lexKw "if" IfToken <|> + lexKw "else" ElseToken <|> lexKw "while" WhileToken <|> + lexKw "True" TrueToken <|> lexKw "False" FalseToken <|> + lexKw "Int" IntTypeToken <|> lexKw "Bool" BoolTypeToken <|> + lexKw "Char" CharTypeToken <|> + //Character tokens + lexEscape <|> lexCharacter <|> + //Two char ops tokens + lexWord "::" DoubleColonToken <|> lexWord "!=" NotEqualToken <|> + lexWord "<=" LesserEqToken <|> lexWord ">=" GreaterEqToken <|> + lexWord "==" EqualsToken <|> lexWord "&&" AmpersandsToken <|> + lexWord "||" PipesToken <|> lexWord "[]" EmptyListToken <|> + lexWord "->" ArrowToken <|> + //One char ops tokens + lexWord "(" BraceOpenToken <|> lexWord ")" BraceCloseToken <|> + lexWord "{" CBraceOpenToken <|> lexWord "}" CBraceCloseToken <|> + lexWord "[" SquareOpenToken <|> lexWord "]" SquareCloseToken <|> + lexWord "," CommaToken <|> lexWord ":" ColonToken <|> + lexWord ";" SColonToken <|> lexWord "." DotToken <|> + lexWord "+" PlusToken <|> lexWord "*" StarToken <|> + lexWord "/" SlashToken <|> lexWord "%" PercentToken <|> + lexWord "=" AssignmentToken <|> lexWord "<" LesserToken <|> + lexWord ">" BiggerToken <|> lexWord "!" ExclamationToken <|> + lexWord "-" DashToken <|> + //Number and identifier tokens + lexNumber <|> lexIdentifier <|> + (item '\n' >>| pure LexNL) <|> + //Whitespace + (satisfy isSpace >>| (pure $ LexSpace 0 1)) <|> + (eof >>| pure LexEOF) + where + lexWord :: String TokenValue -> Parser Char LexItem + lexWord s tv = list ls >>| pure (LexToken (length ls) tv) + where ls = fromString s -Start = "Hi" + lexKw :: String TokenValue -> Parser Char LexItem + lexKw kw tv = lexWord kw tv <* check (not o isIdentChar) -//:: LexerOutput a :== Either String a -// -// -//runParser :: (Parser a) [Char] -> (LexerOutput a, [Char]) -//runParser (Parser p) s = p s -// -//lexer :: [Char] -> LexerOutput [Token] -// -////lexer functions -//identT = alpha >>= \a -> many (char '_' <|> alphaNum) >>= \as -> return $ IdentToken [a:as] -//numberT = optional (char '-') >>= \sign -> (some digit) >>= \n -> case sign of -// Nothing -> return $ NumberToken $ 5 //fromString n -// _ -> return $ NumberToken $ -5 //(fromString n) * -1 -//charLT = CharToken <$> (char '\'' *> item <* char '\'') -//char2T = item >>= \c1 -> case c1 of -// ':' = char ':' >>| return DoubleColonToken -// '<' = char '=' >>| return LesserEqToken -// '>' = char '=' >>| return GreaterEqToken -// '=' = char '=' >>| return EqualsToken -// '&' = char '&' >>| return AmpersandsToken -// '|' = char '|' >>| return PipesToken -// '-' = char '>' >>| return ArrowToken -// _ = zero -//char1T = item >>= \c1 -> findT c1 charTokenMap -//varT = string (fromString "var") >>| return VarToken -//voidT = string (fromString "Void") >>| return VoidToken -//returnT = string (fromString "return") >>| return ReturnToken -//ifT = string (fromString "if") >>| return IfToken -//elseT = string (fromString "else") >>| return ElseToken -//whileT = string (fromString "while") >>| return WhileToken -//trueT = string (fromString "True") >>| return TrueToken -//falseT = string (fromString "False") >>| return FalseToken -////note, for the anyToken parser the order matters! -//anyT = char2T <|> char1T <|> varT <|> voidT <|> returnT <|> ifT <|> elseT <|> whileT <|> -// trueT <|> falseT <|> numberT <|> identT <|> charLT -// -//Start :: *World -> *World -//Start w -//# (args, w) = getCommandLine w // We lezen nu nog standaard van stdin -//# (out, w) = stdio w -//# (toparse, out) = readEntireFile out -//# out = out <<< toString (lexer toparse) -//# (b, w) = fclose out w -//| not b = setReturnCode 1 w -//= w -// where -// readEntireFile :: *File -> *([Char], *File) -// readEntireFile f -// # (b, c, f) = freadc f -// | not b = ([], f) -// # (cs, f) = readEntireFile f -// = ([c:cs], f) -// -// -// -//charTokenMap = [('(', BraceOpenToken) -// ,(')', BraceCloseToken) -// ,('{', CBraceOpenToken) -// ,('}', CBraceCloseToken) -// ,('[', SquareOpenToken) -// ,(']', SquareCloseToken) -// ,(',', CommaToken) -// ,(':', ColonToken) -// ,(';', SColonToken) -// ,('.', DotToken) -// ,('+', PlusToken) -// ,('-', DashToken) -// ,('*', StarToken) -// ,('/', SlashToken) -// ,('%', PercentToken) -// ,('=', AssignmentToken) -// ,('<', LesserToken) -// ,('>', BiggerToken) -// ,('!', ExclamationToken)] -//findT c [] = fail "Unrecognized character" -//findT c [(k,v):xs] = if (c==k) (return v) (findT c xs) -// -// -//// Clean adaption of Yard, a parsec like parser combinator -//:: Parser a = Parser ([Char] -> (LexerOutput a, [Char])) -// -//runParser :: (Parser a) [Char] -> (LexerOutput a, [Char]) -//runParser (Parser p) s = p s -// -//instance Functor Parser where -// fmap f s = liftM f s -// -//instance Applicative Parser where -// pure a = Parser $ \s -> (Right a, s) -// (<*>) sf s = ap sf s -// -//instance Monad Parser where -// bind p f = Parser $ \s -> let (out, rest) = runParser p s in case out of -// Left e = (Left e, rest) -// Right t = runParser (f t) rest -// -////gives us some, many and optional -//instance Alternative Parser where -// empty = zero -// (<|>) p1 p2 = Parser $ \s -> let (out, rest) = runParser p1 s in case out of -// Left e = runParser p2 s -// Right t = (Right t, rest) -// -////parser that fails with error -//fail :: String -> Parser a -//fail e = Parser $ \s -> (Left e, s) -// -////parser that always fails -//zero :: Parser a -//zero = fail "Zero parser" -// -////matches exactly one Char -//item :: Parser Char -//item = Parser $ \s -> case s of -// [] = (Left "Unexpected empty input", s) -// [x:xs] = (Right x, xs) -// -////matches any char which satisfies f -//satisfy :: (Char -> Bool) -> Parser Char -//satisfy f = item >>= (\r -> if (f r) (return r) zero) -// -////tries a parser, if it fails returns a default value -//optionalDef :: a (Parser a) -> Parser a -//optionalDef def p = p <|> return def -// -////matched given char -//char :: Char -> Parser Char -//char c = satisfy (\i -> c==i) //I hate that we can't do: satisfy (==c) -// -//alpha :: Parser Char -//alpha = satisfy isAlpha -// -//digit :: Parser Char -//digit = satisfy isDigit -// -//alphaNum :: Parser Char -//alphaNum = alpha <|> digit -// -////matches a given String -//string :: [Char] -> Parser [Char] -//string s = mapM_ char s >>| return s + lexComment :: Parser Char LexItem + lexComment = list (fromString "//") + >>| top until (eof <|> (item '\n' >>| pure Void)) >>| pure LexNL + + lexBlockComment :: Parser Char LexItem + lexBlockComment = list (fromString "/*") + >>| (top until (list (fromString "*/"))) + >>= \chars->pure $ widthHeight chars 0 0 + where + widthHeight :: [Char] Int Int -> LexItem + widthHeight [] l c = LexSpace l c + widthHeight ['\n':xs] l _ = widthHeight xs (l+1) 0 + widthHeight [x:xs] l c = widthHeight xs l (c+1) + + lexNumber :: Parser Char LexItem + lexNumber = some (satisfy isDigit) >>= \si->pure $ + LexToken (length si) (NumberToken $ toInt $ toString si) + + lexIdentifier :: Parser Char LexItem + lexIdentifier = some (satisfy isIdentChar) + >>= \si->pure $ LexToken (length si) (IdentToken $ toString si) + + isIdentChar c = isAlphanum c || c == '_' + + lexCharacter :: Parser Char LexItem + lexCharacter = item '\'' *> satisfy ((<>) '\'') <* item '\'' + >>= \char->pure $ LexToken 3 (CharToken char) + + lexEscape :: Parser Char LexItem + lexEscape = item '\'' *> item '\\' *> top <* item '\'' + >>= \char->pure case char of + 'a' = LexToken 4 (CharToken $ toChar 7) + 'b' = LexToken 4 (CharToken '\b') + 'b' = LexToken 4 (CharToken '\b') + 'f' = LexToken 4 (CharToken '\f') + 'n' = LexToken 4 (CharToken '\n') + 'r' = LexToken 4 (CharToken '\t') + 'v' = LexToken 4 (CharToken '\v') + '\'' =LexToken 4 (CharToken '\'') + c = (LexItemError $ "Unknown escape: " +++ toString c)