{-
    BNF Converter: TreeSitter Grammar Generator
    Copyright (C) 2004  Author:  Markus Forsberg, Michael Pellauer,
                                 Bjorn Bringert

    Description   : This module converts BNFC grammar to the contents of a
                    tree-sitter grammar.js file

    Author        : Kangjing Huang (huangkangjing@gmail.com)
    Created       : 08 Nov, 2023

-}

module BNFC.Backend.TreeSitter.CFtoTreeSitter where

import BNFC.Abs (Reg)
import BNFC.Backend.TreeSitter.RegToJSReg
import BNFC.Backend.TreeSitter.MatchesEmpty(fixPointKnownEmpty, transformEmptyMatches, KnownEmpty, OptSym(..), OptSentForm, isKnownEmpty)
import BNFC.CF
import BNFC.Utils(when, applyWhen, cstring)
import BNFC.Lexing (mkLexer, LexType(..))
import BNFC.PrettyPrint

import Prelude hiding ((<>))

import qualified Data.List as List
import qualified Data.Maybe as Maybe
import qualified Data.Either as Either
import qualified Data.List.NonEmpty as List1

-- | Indent one level of 2 spaces
indent :: Doc -> Doc
indent :: Doc -> Doc
indent = Int -> Doc -> Doc
nest Int
2

-- | Create content of grammar.js file
cfToTreeSitter :: String -> Cat -> CF -> Doc
cfToTreeSitter :: [Char] -> Cat -> CF -> Doc
cfToTreeSitter [Char]
name Cat
wordCat CF
cf =
  -- Overall structure of grammar.js
  [Char] -> Doc
text [Char]
"module.exports = grammar({"
    Doc -> Doc -> Doc
$+$ Doc -> Doc
indent
      ( [Char] -> Doc
text [Char]
"name:" Doc -> Doc -> Doc
<+> [Char] -> Doc
cstring [Char]
name Doc -> Doc -> Doc
<> Doc
","
          Doc -> Doc -> Doc
$+$ Doc
extrasSection
          Doc -> Doc -> Doc
$+$ Doc
wordSection
          Doc -> Doc -> Doc
$+$ Doc
rulesSection
      )
    Doc -> Doc -> Doc
$+$ [Char] -> Doc
text [Char]
"});"
  where
    ([Reg]
commentTokens, [(Reg, [Char])]
lexTokens) =
      [Either Reg (Reg, [Char])] -> ([Reg], [(Reg, [Char])])
forall a b. [Either a b] -> ([a], [b])
Either.partitionEithers ([Either Reg (Reg, [Char])] -> ([Reg], [(Reg, [Char])]))
-> [Either Reg (Reg, [Char])] -> ([Reg], [(Reg, [Char])])
forall a b. (a -> b) -> a -> b
$ ((Reg, LexType) -> Maybe (Either Reg (Reg, [Char])))
-> [(Reg, LexType)] -> [Either Reg (Reg, [Char])]
forall a b. (a -> Maybe b) -> [a] -> [b]
Maybe.mapMaybe (Reg, LexType) -> Maybe (Either Reg (Reg, [Char]))
forall {a}. (a, LexType) -> Maybe (Either a (a, [Char]))
tokenFilter ([(Reg, LexType)] -> [Either Reg (Reg, [Char])])
-> [(Reg, LexType)] -> [Either Reg (Reg, [Char])]
forall a b. (a -> b) -> a -> b
$ CF -> [(Reg, LexType)]
mkLexer CF
cf

    tokenFilter :: (a, LexType) -> Maybe (Either a (a, [Char]))
tokenFilter (a
r, LexType
LexComment) = Either a (a, [Char]) -> Maybe (Either a (a, [Char]))
forall a. a -> Maybe a
Just (a -> Either a (a, [Char])
forall a b. a -> Either a b
Left a
r)
    tokenFilter (a
r, LexToken [Char]
name) = Either a (a, [Char]) -> Maybe (Either a (a, [Char]))
forall a. a -> Maybe a
Just ((a, [Char]) -> Either a (a, [Char])
forall a b. b -> Either a b
Right (a
r, [Char]
name))
    tokenFilter (a
_, LexType
LexSymbols) = Maybe (Either a (a, [Char]))
forall a. Maybe a
Nothing

    extrasSection :: Doc
extrasSection = [Reg] -> Doc
prExtras [Reg]
commentTokens
    wordSection :: Doc
wordSection = Cat -> CF -> Doc
prWord Cat
wordCat CF
cf
    rulesSection :: Doc
rulesSection =
      [Char] -> Doc
text [Char]
"rules: {"
        Doc -> Doc -> Doc
$+$ Doc -> Doc
indent
          ( CF -> Doc
prRules CF
cf
              Doc -> Doc -> Doc
$+$ CF -> [(Reg, [Char])] -> Doc
prTokenRules CF
cf [(Reg, [Char])]
lexTokens
          )
        Doc -> Doc -> Doc
$+$ [Char] -> Doc
text [Char]
"},"

-- | Print rules for comments
prExtras :: [Reg] -> Doc
prExtras :: [Reg] -> Doc
prExtras [Reg]
commentRegs =
  [Char] -> Doc
defineSymbol [Char]
"extras" Doc -> Doc -> Doc
<> Doc
"["
    Doc -> Doc -> Doc
$+$ Doc -> Doc
indent
      ( -- default rule for white spaces
        [Char] -> Doc
text [Char]
"/\\s/,"
          Doc -> Doc -> Doc
$+$ [Doc] -> Doc
vcat' [Doc]
commentDocs
      )
    Doc -> Doc -> Doc
$+$ [Char] -> Doc
text [Char]
"],"
  where
    commentDocs :: [Doc]
commentDocs = (Reg -> Doc) -> [Reg] -> [Doc]
forall a b. (a -> b) -> [a] -> [b]
map (Doc -> Doc
appendComma (Doc -> Doc) -> (Reg -> Doc) -> Reg -> Doc
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Char] -> Doc
text ([Char] -> Doc) -> (Reg -> [Char]) -> Reg -> Doc
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Reg -> [Char]
printRegJSReg) [Reg]
commentRegs

-- | Print word section, this section is needed for tree-sitter
--   to do keyword extraction before any parsing/lexing, see
--   https://tree-sitter.github.io/tree-sitter/creating-parsers#keyword-extraction
--   TODO: currently, we just add every user defined token as well
--   as the predefined Ident token to this list to be safe. Ideally,
--   we should enumerate all defined tokens against all occurrences of
--   keywords. Any tokens patterns that could accept a keyword will go
--   into this list. This will require integration of a regex engine.
prWord :: Cat -> CF -> Doc
prWord :: Cat -> CF -> Doc
prWord Cat
wordCat CF
cf =
  Bool -> Doc -> Doc
forall m. Monoid m => Bool -> m -> m
when (CF -> Cat -> Bool
forall f. CFG f -> Cat -> Bool
isUsedCat CF
cf Cat
wordCat) (Doc -> Doc) -> Doc -> Doc
forall a b. (a -> b) -> a -> b
$
    [Char] -> Doc
defineSymbol [Char]
"word"
      Doc -> Doc -> Doc
<+> OptSentForm -> Doc
formatSent [Sym -> OptSym
NonOptional (Cat -> Sym
forall a b. a -> Either a b
Left Cat
wordCat)] Doc -> Doc -> Doc
<> Doc
","

-- | Prints the rules in the grammar with the entry point first.
--
-- Since Treesitter requires a unique entry point, this will build a "virtual"
-- entry point which dispatches to each of the declared BNFC entry points via
-- a choice list. Additionally, the virtual entry point can be marked optional
-- (and is the only rule which can be).
prRules :: CF -> Doc
prRules :: CF -> Doc
prRules CF
cf =
  KnownEmpty -> Bool -> Cat -> [Rule] -> Doc
prOneCat KnownEmpty
knownEmpty Bool
True Cat
virtEntryCat [Rule]
virtEntryRhsRules
    Doc -> Doc -> Doc
$+$ [Doc] -> Doc
vcat' (((Cat, [Rule]) -> Doc) -> [(Cat, [Rule])] -> [Doc]
forall a b. (a -> b) -> [a] -> [b]
map ((Cat -> [Rule] -> Doc) -> (Cat, [Rule]) -> Doc
forall a b c. (a -> b -> c) -> (a, b) -> c
uncurry (KnownEmpty -> Bool -> Cat -> [Rule] -> Doc
prOneCat KnownEmpty
knownEmpty Bool
False)) [(Cat, [Rule])]
groups)
  where
    groups :: [(Cat, [Rule])]
groups = CF -> [(Cat, [Rule])]
ruleGroups CF
cf

    virtEntryCat :: Cat
virtEntryCat = [Char] -> Cat
Cat [Char]
"BNFCStart"
    virtEntryRhsCats :: [Cat]
virtEntryRhsCats =
      (if CF -> Bool
forall f. CFG f -> Bool
hasEntryPoint CF
cf then NonEmpty Cat -> [Cat]
forall a. NonEmpty a -> [a]
List1.toList else Int -> NonEmpty Cat -> [Cat]
forall a. Int -> NonEmpty a -> [a]
List1.take Int
1)
      (CF -> NonEmpty Cat
forall f. CFG f -> NonEmpty Cat
allEntryPoints CF
cf)
    virtEntryRhsRules :: [Rule]
virtEntryRhsRules = Cat -> Rule
toVirtRule (Cat -> Rule) -> [Cat] -> [Rule]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> [Cat]
virtEntryRhsCats

    toVirtRule :: Cat -> Rule
toVirtRule Cat
rhsCat =
      [Char] -> Cat -> SentForm -> InternalRule -> Rule
npRule
        (Cat -> [Char]
identCat Cat
virtEntryCat [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ [Char]
"_" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ Cat -> [Char]
identCat Cat
rhsCat)
        Cat
virtEntryCat
        [Cat -> Sym
forall a b. a -> Either a b
Left Cat
rhsCat]
        InternalRule
Parsable

    knownEmpty :: KnownEmpty
knownEmpty = [(Cat, [Rule])] -> KnownEmpty
fixPointKnownEmpty ((Cat
virtEntryCat, [Rule]
virtEntryRhsRules) (Cat, [Rule]) -> [(Cat, [Rule])] -> [(Cat, [Rule])]
forall a. a -> [a] -> [a]
: [(Cat, [Rule])]
groups)

prTokenRules :: CF -> [(Reg, TokenCat)] -> Doc
prTokenRules :: CF -> [(Reg, [Char])] -> Doc
prTokenRules CF
cf [(Reg, [Char])]
lexTokens = [Doc] -> Doc
vcat' (((Reg, [Char]) -> Doc) -> [(Reg, [Char])] -> [Doc]
forall a b. (a -> b) -> [a] -> [b]
map (Reg, [Char]) -> Doc
prOneToken [(Reg, [Char])]
usedTokens)
  where
    usedTokens :: [(Reg, [Char])]
usedTokens = ((Reg, [Char]) -> Bool) -> [(Reg, [Char])] -> [(Reg, [Char])]
forall a. (a -> Bool) -> [a] -> [a]
filter (CF -> Cat -> Bool
forall f. CFG f -> Cat -> Bool
isUsedCat CF
cf (Cat -> Bool) -> ((Reg, [Char]) -> Cat) -> (Reg, [Char]) -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Char] -> Cat
TokenCat ([Char] -> Cat)
-> ((Reg, [Char]) -> [Char]) -> (Reg, [Char]) -> Cat
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Reg, [Char]) -> [Char]
forall a b. (a, b) -> b
snd) [(Reg, [Char])]
lexTokens

-- | Generate one tree-sitter rule for one terminal token.
prOneToken :: (Reg, TokenCat) -> Doc
prOneToken :: (Reg, [Char]) -> Doc
prOneToken (Reg
reg, [Char]
name) =
  [Char] -> Doc
defineSymbol (Bool -> Cat -> [Char]
formatCatName Bool
False (Cat -> [Char]) -> Cat -> [Char]
forall a b. (a -> b) -> a -> b
$ [Char] -> Cat
TokenCat [Char]
name)
    Doc -> Doc -> Doc
$+$ Doc -> Doc
indent ([Char] -> Doc
text ([Char] -> Doc) -> [Char] -> Doc
forall a b. (a -> b) -> a -> b
$ Reg -> [Char]
printRegJSReg Reg
reg) Doc -> Doc -> Doc
<> Doc
","

-- | Generates one tree-sitter rule for one non-terminal from CF.
prOneCat :: KnownEmpty -> Bool -> NonTerminal -> [Rule] -> Doc
prOneCat :: KnownEmpty -> Bool -> Cat -> [Rule] -> Doc
prOneCat KnownEmpty
knownEmpty Bool
allowEmpty Cat
nt [Rule]
rules =
  [Char] -> Doc
defineSymbol (Bool -> Cat -> [Char]
formatCatName Bool
False Cat
nt)
    Doc -> Doc -> Doc
$+$ Doc -> Doc
indent (Doc -> Doc
appendComma (Doc -> Doc
wrapRhs Doc
parRhs))
  where
    wrapRhs :: Doc -> Doc
wrapRhs = Bool -> (Doc -> Doc) -> Doc -> Doc
forall a. Bool -> (a -> a) -> a -> a
applyWhen (Bool
allowEmpty Bool -> Bool -> Bool
&& Cat -> Sym
forall a b. a -> Either a b
Left Cat
nt Sym -> KnownEmpty -> Bool
`isKnownEmpty` KnownEmpty
knownEmpty) ((Doc -> Doc) -> Doc -> Doc) -> (Doc -> Doc) -> Doc -> Doc
forall a b. (a -> b) -> a -> b
$
      Doc -> Doc
wrapOptional'

    ([Rule]
parsableRules, [Rule]
_) = (Rule -> Bool) -> [Rule] -> ([Rule], [Rule])
forall a. (a -> Bool) -> [a] -> ([a], [a])
List.partition Rule -> Bool
forall f. Rul f -> Bool
isParsable [Rule]
rules

    parRhs :: Doc
parRhs = [Doc] -> Doc
wrapChoice ([Rule] -> [Doc]
genRules [Rule]
parsableRules)

    genRules :: [Rule] -> [Doc]
genRules = (Rule -> Doc) -> [Rule] -> [Doc]
forall a b. (a -> b) -> [a] -> [b]
map Rule -> Doc
forall {function}. Pretty function => Rul function -> Doc
genRule
    genRule :: Rul function -> Doc
genRule Rul function
rule =
      (Doc
"//" Doc -> Doc -> Doc
<+> [Char] -> Doc
text (Doc -> [Char]
renderOneLine (Rul function -> Doc
forall a. Pretty a => a -> Doc
pretty Rul function
rule)) Doc -> Doc -> Doc
<+> Doc
";")
      Doc -> Doc -> Doc
$+$ ([OptSentForm] -> Doc
formatRhs ([OptSentForm] -> Doc)
-> (SentForm -> [OptSentForm]) -> SentForm -> Doc
forall b c a. (b -> c) -> (a -> b) -> a -> c
. KnownEmpty -> SentForm -> [OptSentForm]
transformEmptyMatches KnownEmpty
knownEmpty) (Rul function -> SentForm
forall function. Rul function -> SentForm
rhsRule Rul function
rule)

    renderOneLine :: Doc -> [Char]
renderOneLine = Style -> Doc -> [Char]
renderStyle (Style
style { mode = OneLineMode })

-- | Start a defined symbol block in tree-sitter grammar
defineSymbol :: String -> Doc
defineSymbol :: [Char] -> Doc
defineSymbol [Char]
name = [Doc] -> Doc
hsep [[Char] -> Doc
text [Char]
name Doc -> Doc -> Doc
<> Doc
":", [Char] -> Doc
text [Char]
"$", [Char] -> Doc
text [Char]
"=>"]

appendComma :: Doc -> Doc
appendComma :: Doc -> Doc
appendComma = (Doc -> Doc -> Doc
<> [Char] -> Doc
text [Char]
",")

commaJoin :: Bool -> [Doc] -> Doc
commaJoin :: Bool -> [Doc] -> Doc
commaJoin Bool
newline =
  (Doc -> Doc -> Doc) -> Doc -> [Doc] -> Doc
forall b a. (b -> a -> b) -> b -> [a] -> b
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
foldl Doc -> Doc -> Doc
comma Doc
empty
  where
    commaString :: Doc
commaString = if Bool
newline then Doc
"," else Doc
", "
    comma :: Doc -> Doc -> Doc
comma Doc
a Doc
b
      | Doc -> Bool
isEmpty Doc
a = Doc
b
      | Doc -> Bool
isEmpty Doc
b = Doc
a
      | Bool
otherwise = (if Bool
newline then Doc -> Doc -> Doc
($+$) else Doc -> Doc -> Doc
(<>)) (Doc
a Doc -> Doc -> Doc
<> Doc
commaString) Doc
b

wrapSeq :: [Doc] -> Doc
wrapSeq :: [Doc] -> Doc
wrapSeq = [Char] -> Bool -> [Doc] -> Doc
wrapOptListFun [Char]
"seq" Bool
False

wrapChoice :: [Doc] -> Doc
wrapChoice :: [Doc] -> Doc
wrapChoice = [Char] -> Bool -> [Doc] -> Doc
wrapOptListFun [Char]
"choice" Bool
True

wrapOptional :: Doc -> Doc
wrapOptional :: Doc -> Doc
wrapOptional = [Char] -> Bool -> Doc -> Doc
wrapFun [Char]
"optional" Bool
False

wrapOptional' :: Doc -> Doc
wrapOptional' :: Doc -> Doc
wrapOptional' = [Char] -> Bool -> Doc -> Doc
wrapFun [Char]
"optional" Bool
True

-- | Wrap list using tree-sitter fun if the list contains multiple items
-- Returns the only item without wrapping otherwise
wrapOptListFun :: String -> Bool -> [Doc] -> Doc
wrapOptListFun :: [Char] -> Bool -> [Doc] -> Doc
wrapOptListFun [Char]
_   Bool
_ [Doc
x] = Doc
x
wrapOptListFun [Char]
fun Bool
_ [ ] = [Char] -> Bool -> Doc -> Doc
wrapFun [Char]
fun Bool
False Doc
empty
wrapOptListFun [Char]
fun Bool
newline [Doc]
list = [Char] -> Bool -> Doc -> Doc
wrapFun [Char]
fun Bool
newline (Bool -> [Doc] -> Doc
commaJoin Bool
newline [Doc]
list)

wrapFun :: String -> Bool -> Doc -> Doc
wrapFun :: [Char] -> Bool -> Doc -> Doc
wrapFun [Char]
fun Bool
newline Doc
arg = [Doc] -> Doc
joinOp [[Char] -> Doc
text [Char]
fun Doc -> Doc -> Doc
<> [Char] -> Doc
text [Char]
"(", Doc -> Doc
indentOp Doc
arg, [Char] -> Doc
text [Char]
")"]
  where
    joinOp :: [Doc] -> Doc
joinOp = if Bool
newline then [Doc] -> Doc
vcat' else [Doc] -> Doc
hcat
    indentOp :: Doc -> Doc
indentOp = if Bool
newline then Doc -> Doc
indent else Doc -> Doc
forall a. a -> a
id

-- | Helper for referring to non-terminal names in tree-sitter
refName :: String -> String
refName :: [Char] -> [Char]
refName = ([Char]
"$." [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++)

-- | Format right hand side into list of strings
formatRhs :: [OptSentForm] -> Doc
formatRhs :: [OptSentForm] -> Doc
formatRhs = [Doc] -> Doc
wrapChoice ([Doc] -> Doc) -> ([OptSentForm] -> [Doc]) -> [OptSentForm] -> Doc
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (OptSentForm -> Doc) -> [OptSentForm] -> [Doc]
forall a b. (a -> b) -> [a] -> [b]
map OptSentForm -> Doc
formatSent

formatSent :: OptSentForm -> Doc
formatSent :: OptSentForm -> Doc
formatSent = [Doc] -> Doc
wrapSeq ([Doc] -> Doc) -> (OptSentForm -> [Doc]) -> OptSentForm -> Doc
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (OptSym -> Doc) -> OptSentForm -> [Doc]
forall a b. (a -> b) -> [a] -> [b]
map OptSym -> Doc
fmtOpt
  where
    fmtOpt :: OptSym -> Doc
fmtOpt (Optional Sym
x) = Doc -> Doc
wrapOptional (Sym -> Doc
fmt Sym
x)
    fmtOpt (NonOptional Sym
x) = Sym -> Doc
fmt Sym
x

    fmt :: Sym -> Doc
fmt (Left Cat
c) = [Char] -> Doc
text ([Char] -> Doc) -> [Char] -> Doc
forall a b. (a -> b) -> a -> b
$ [Char] -> [Char]
refName ([Char] -> [Char]) -> [Char] -> [Char]
forall a b. (a -> b) -> a -> b
$ Bool -> Cat -> [Char]
formatCatName Bool
False Cat
c
    fmt (Right [Char]
term) = [Char] -> Doc
cstring [Char]
term

-- | Format string for cat name, prefix "_" if the name is for internal rules
formatCatName :: Bool -> Cat -> String
formatCatName :: Bool -> Cat -> [Char]
formatCatName Bool
internal Cat
c =
  if Bool
internal
    then [Char]
"_" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ [Char]
formatted
    else [Char]
formatted
  where
    formatted :: [Char]
formatted = Cat -> [Char]
formatName Cat
c
    formatName :: Cat -> [Char]
formatName (Cat [Char]
name) = [Char]
name
    formatName (TokenCat [Char]
name) = [Char]
"token_" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ [Char]
name
    formatName (ListCat Cat
c) = [Char]
"list_" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ Cat -> [Char]
formatName Cat
c
    formatName (CoercCat [Char]
name Integer
i) = [Char]
name [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ Integer -> [Char]
forall a. Show a => a -> [Char]
show Integer
i