Parser Combinators

Let’s imagine we need to parse propositional logic expressions. First, we might construct an abstract syntax tree (AST) like so:

indirect enum Proposition: Equatable {
  case atom(String)
  case neg(Proposition)
  case impl(Proposition, Proposition)
  case conj(Proposition, Proposition)
  case disj(Proposition, Proposition)
  case paren(Proposition)
}

Here we have a case for each possible construct in our language. We have a base case for the leaves of our tree that allows us to name a simple proposition, a way to negate propositions, combine propositions with implications, conjunctions, or disjunctions, and finally a way to wrap propositions in parenthesis.

Next, let’s create some helpers to make our job easier.

extension Substring {
  var decomposed: (Character, Substring)? {
    return first.map { ($0, dropFirst()) }
  }
  
  func decomposed(_ length: Int) -> (Substring, Substring)? {
    guard count >= length else { return nil }
    let front = prefix(length)
    let back = dropFirst(length)
    return (front, back)
  }
  
  func split(at index: Index) -> (Substring, Substring)? {
    guard case startIndex...endIndex = index else { return nil }
    let front = self[startIndex..<index]
    let back = self[index..<endIndex]
    return (front, back)
  }
}

The decomposed property breaks our Substring into a pair where the first element is the first character of our Substring and the second element is the rest of the Substring. Unless the Substring is empty then we just get nil. The decomposed function allows use to decompose more than one character at a time. The split function allows us to divide the Substring at any index.

Next, with these helpers, let’s start writing our parser.

func parse(_ input: Substring) -> (Proposition, Substring)? {
  return parseParenthesis(input) ?? parseAtom(input) ?? parseNegation(input)
}

This top level function tries to parse each kind of term that we have first parenthesis, then an atom, and finally, a negation.

Let’s dive down into how each of them are implemented.

func parseParenthesis(_ input: Substring) -> (Proposition, Substring)? {
  guard let (first, remaining) = input.decomposed else { return nil }
  if first == "(" {
    guard let (prop1, remaining1) = parse(remaining),
      let (closeParen, remaining2) = remaining1.decomposed, closeParen == ")" else {
        return nil
    }
    let (prop2, remaining3) = (Proposition.paren(prop1), remaining2)
    let (prop3, remaining4) = parseOperation(prop2, remaining3) ?? (prop2, remaining3)
    return (prop3, remaining4)
  }
  else {
    return nil
  }
}

The function parseParenthesis gets the first character and makes sure that it’s a "(" otherwise it just produces nil. If it’s nil, then it tries to parse the rest of the input string. If that succeeds, it checks that the next character is a ")". If it is, it wraps the proposition in the paren case for our AST and then chains it forward by calling parseOperation.

The function parseAtom follows a similar structure:

func parseAtom(_ input: Substring) -> (Proposition, Substring)? {
  guard input != "" else { return nil }
  if let splitIndex = input.firstIndex(where: { isAlpha($0) != true }) {
    guard let (name, remaining1) = input.split(at: splitIndex), name != "" else {
      return nil
    }
    let prop = Proposition.atom(String(name))
    let (prop1, remaining3) = parseOperation(prop, remaining1) ?? (prop, remaining1)
    return (prop1, remaining3)
  }
  else {
    return (Proposition.atom(String(input)), "")
  }
}

func isAlpha(_ char: Character) -> Bool {
  return "a"..."z" ~= char || "A"..."Z" ~= char
}

The parseNegation function needs to do some inversion to wrap the correct part in the neg case.

func parseNegation(_ input: Substring) -> (Proposition, Substring)? {
  guard let (first, remaining) = input.decomposed else { return nil }
  if first == "~" {
    guard let (prop, remaining1) = parse(remaining) else { return nil }
    switch prop {
    case let .impl(left, right):
      return (.impl(.neg(left), right), remaining1)
    case let .conj(left, right):
      return (.conj(.neg(left), right), remaining1)
    case let .disj(left, right):
      return (.disj(.neg(left), right), remaining1)
    default:
      return (.neg(prop), remaining1)
    }
  }
  else {
    return nil
  }
}

The parseOperation function is very similar to the proposition function

func parseOperation(_ left: Proposition, _ input: Substring) -> (Proposition, Substring)? {
  return parseImplication(left, input) ?? parseConjunction(left, input) ?? parseDisjunction(left, input)
}

It tries to parse each operation one by one.

The different operation parsing functions are almost exactly the same.

func parseImplication(_ left: Proposition, _ input: Substring) -> (Proposition, Substring)? {
  guard let (operation, remaining) = input.decomposed(2),
    let (right, remaining1) = parse(remaining) else {
      return nil
  }
  
  if operation == "->" {
    return (Proposition.impl(left, right), remaining1)
  }
  else {
    return nil
  }
}

func parseConjunction(_ left: Proposition, _ input: Substring) -> (Proposition, Substring)? {
  guard let (operation, remaining) = input.decomposed,
    let (right, remaining1) = parse(remaining) else {
      return nil
  }
  
  if operation == "&" {
    return (Proposition.conj(left, right), remaining1)
  }
  else {
    return nil
  }
}

func parseDisjunction(_ left: Proposition, _ input: Substring) -> (Proposition, Substring)? {
  guard let (operation, remaining) = input.decomposed,
    let (right, remaining1) = parse(remaining) else {
      return nil
  }
  
  if operation == "|" {
    return (Proposition.disj(left, right), remaining1)
  }
  else {
    return nil
  }
}

Finally, we’ll create a failable init for Proposition.

extension Proposition {
  init?(parse input: String) {
    guard let (prop, remaining) = parse(Substring(input)), remaining.isEmpty else {
      return nil
    }
    self = prop
  }
}

Proposition(parse: "~(A->~B)") // => .neg(.paren(.impl(.atom("A"), .neg(.atom("B")))))

Finding the Abstraction

If we take a look at the function signatures for our parsing functions we notice a common pattern.

func parse(_ input: Substring) -> (Proposition, Substring)?
func parseParenthesis(_ input: Substring) -> (Proposition, Substring)?
func parseAtom(_ input: Substring) -> (Proposition, Substring)?
func parseNegation(_ input: Substring) -> (Proposition, Substring)?
func parseOperation(_ left: Proposition, _ input: Substring) -> (Proposition, Substring)?
func parseImplication(_ left: Proposition, _ input: Substring) -> (Proposition, Substring)?
func parseConjunction(_ left: Proposition, _ input: Substring) -> (Proposition, Substring)? 
func parseDisjunction(_ left: Proposition, _ input: Substring) -> (Proposition, Substring)?

They all have the type: (Substring) -> (Proposition, Substring)?

Well, the operation parsers don’t, but we can just change them to their curried versions.

func parseOperation(_ left: Proposition) -> (Substring) -> (Proposition, Substring)?
func parseImplication(_ left: Proposition) -> (Substring) -> (Proposition, Substring)?
func parseConjunction(_ left: Proposition) -> (Substring) -> (Proposition, Substring)? 
func parseDisjunction(_ left: Proposition) -> (Substring) -> (Proposition, Substring)?

Now, once we give them their first argument, they also have the type: (Substring) -> (Proposition, Substring)?

So, we can create a typealias.

typealias PropositionParser = (Substring) -> (Proposition, Substring)?

And we can go even farther by parametrizing the parsed type.

typealias Parser<Parsed> = (Substring) -> (Parsed, Substring)?

Now we have a parser of potentially anything, but it would be nice if we could write extensions on this type and add some methods. Unfortunately, since functions are structural types and structural types can’t currently have extensions, we’re unable to. To get around this limitation let’s change our parser into a struct.

struct Parser<Parsed> {
  fileprivate let parse: (Substring) -> (Parsed, Substring)?
  
  init(_ parser: @escaping (Substring) -> (Parsed, Substring)?) {
    parse = parser
  }
  
  func parsing(_ input: String) -> Parsed? {
    guard let (parsed, remaining) = parse(Substring(input)), remaining.isEmpty else { return nil }
    return parsed
  }
}

Now, there’s a property called parse that’s our parsing function. We’ve also added an init and parsing method to improve our API.

We could now simply go back and rewrite our parsing functions to return this Parser type, but that wouldn’t be very interesting. Instead, let’s think about the kind of operations we could perform on Parser’s in the abstract.

extension Parser {
  func map<Mapped>(_ transform: @escaping (Parsed) -> Mapped) -> Parser<Mapped> {
    return Parser<Mapped> { input in
      guard let (parsed, remaining) = self.parse(input) else { return nil }
      let mapped = transform(parsed)
      return (mapped, remaining)
    }
  }
}

The map method allows use to switch out the parsed data with some other data. The parser remembers to do this extra transformation after it’s finished parsing.

extension Parser {
  static var fail: Parser {
    return Parser { input in nil }
  }

  static func pure(_ value: Parsed) -> Parser {
    return Parser { input in (value, input) }
  }
}

Here we’ve created some very simple parsers. One that always fails and another that just produces the value we passed in without doing any parsing at all.

extension Parser {
  func flatMap<Mapped>(_ transform: @escaping (Parsed) -> Parser<Mapped>) -> Parser<Mapped> {
    return Parser<Mapped> { input in
      guard let (parsed, remaining) = self.parse(input) else { return nil }
      let newParser = transform(parsed)
      return newParser.parse(remaining)
    }
  }
}

The flatMap method gives us a very dynamic behavior. It allows us to parse part of the input string and then use the data that was parsed to create a new parser. That new parser then takes over parsing the remaining unconsumed string.

Building off of map and flatMap we can create a bracket method.

extension Parser {
  func bracket(open: Parser<Character>, close: Parser<Character>) -> Parser {
    return open.flatMap { _ in self.flatMap { parsed in close.map { _ in parsed } } }
  }
}

Here we begin by parsing with open. If that succeeds, we end up in the closure following it. We just use a _ as the parameter name to ignore the character since we don’t need it. We then parse with self, and finally we parse with close also throwing away the resulting character. If all of those parsers worked, we just produce parsed which is the parsed content inside our brackets.

We’ll need a way to combine parsers as different alternatives to try.

func ?? <P>(left: @escaping @autoclosure () -> Parser<P>,
            right: @escaping @autoclosure () -> Parser<P>) -> Parser<P> {
  return Parser { input in
    left().parse(input) ?? right().parse(input)
  }
}

The ?? operator (called nil-coalescing) has an “or else” like meaning and is well suited to represent our alternative function. We need to make both of its parameters auto closures to prevent infinite recursion from Swift’s eager evaluation.

Next, we’ll make some parsers to parse individual characters.

extension Parser where Parsed == Character {
  static var character: Parser {
    return Parser { input in input.first.map { ($0, input.dropFirst()) } }
  }
  
  static func matching(_ char: Character) -> Parser {
    return character.flatMap { $0 == char ? .pure(char) : .fail }
  }
}

The character static property simply parses a single character, and the matching static function parses a character if it matches the one passed in.

We’ll need some operations for when parsing substrings.

extension Parser where Parsed == Substring {
  static func substring(while predicate: @escaping (Character) -> Bool) -> Parser {
    let index = Parser<Substring.Index> { input in
      let i = input.firstIndex(where: { predicate($0) != true }) ?? input.endIndex
      guard i > input.startIndex else { return nil }
      return (i, input)
    }
    return index.flatMap(split)
  }
  
  static func split(at index: Substring.Index) -> Parser {
    return Parser { input in
      guard case input.startIndex...input.endIndex = index else { return nil }
      let front = input[input.startIndex..<index]
      let back = input[index..<input.endIndex]
      return (front, back)
    }
  }
}

The substring(while:) method allows us to parse while some predicate is true of the characters being parsed, and with the split method, we can divide the input string at any arbitrary index.

Finally, we can have a parser that can match anything that implements StringProtocol.

extension Parser where Parsed: StringProtocol {
  static func matching(_ string: Parsed) -> Parser {
    return Parser { input in
      guard input.hasPrefix(string) else { return nil }
      return (string, input.dropFirst(string.count))
    }
  }
}

With these parser operations, we’re ready to start rewriting our Proposition parser at a higher semantic level.

Let’s begin by declaring our top level parser.

extension Parser where Parsed == Proposition {
  static var proposition: Parser {
    return parenthesis ?? atom ?? negation
  }
}

Here we combine our yet to be written term parsers with our alternative operator.

Next, let’s write those term parsers.

extension Parser where Parsed == Proposition {
  private static var parenthesis: Parser {
    return proposition
      .bracket(open: .matching("("), close: .matching(")"))
      .map(Proposition.paren)
      .flatMap { prop in operation(left: prop) ?? .pure(prop) }
  }
}

For our parenthesis parser, we start with our proposition parser, bracket it, wrap it in the paren case in our AST, and finally, chain it with our yet to be written operation parser.

Our atom parser:

extension Parser where Parsed == Proposition {
  private static var atom: Parser {
    return Parser<Substring>.substring(while: { "a"..."z" ~= $0 || "A"..."Z" ~= $0 })
      .map(String.init)
      .map(Proposition.atom)
      .flatMap { prop in operation(left: prop) ?? .pure(prop) }
  }
}

We parse the input while the characters are letters, turn the Substring into a String, wrap it in our atom case of our AST, and chain it with operation.

The negation parser first parses the "~" character, parses the proposition, wraps it in our neg case, and does some inversion of the AST to wrap the correct term.

extension Parser where Parsed == Proposition {
  private static var negation: Parser {
    return Parser<Character>.matching("~")
      .flatMap { _ in Parser<Proposition>.proposition }
      .map(Proposition.neg)
      .map(invertNegation)
  }
  
  private static func invertNegation(_ prop: Proposition) -> Proposition {
    switch prop {
    case let .neg(.impl(left, right)): return .impl(.neg(left), right)
    case let .neg(.conj(left, right)): return .conj(.neg(left), right)
    case let .neg(.disj(left, right)): return .disj(.neg(left), right)
    default: return prop
    }
  }
}

Next, we’ll create the operations that will chain our proposition terms together.

At the highest level, we’ll once again combine our individual operation parsers as alternatives.

extension Parser where Parsed == Proposition {
  private static func operation(left: Proposition) -> Parser {
    return implication(left: left) ?? conjunction(left: left) ?? disjunction(left: left)
  }
}

The operation parsers:

extension Parser where Parsed == Proposition {
  private static func implication(left: Proposition) -> Parser {
    return Parser<String>.matching("->")
      .flatMap { _ in Parser<Proposition>.proposition }
      .map { right in .impl(left, right) }
  }
  
  private static func conjunction(left: Proposition) -> Parser {
    return Parser<Character>.matching("&")
      .flatMap { _ in Parser<Proposition>.proposition }
      .map { right in .conj(left, right) }
  }
  
  private static func disjunction(left: Proposition) -> Parser {
    return Parser<Character>.matching("|")
      .flatMap { _ in Parser<Proposition>.proposition }
      .map { right in .disj(left, right) }
  }
}

Parser.proposition.parsing("~A->~B") // => .impl(.neg(.atom("A")), .neg(.atom("B")))

They each do basically the same thing. They parse the appropriate operator, then parse the next proposition, and finally, wrap everything up in our AST.

That’s it. Our parser is complete. The parser combinator pattern has allowed us to express our parser at a much higher semantic level, but what we’ve done here is merely scratching the surface of this technique. In a future post, we’ll explore this technique in more depth and raise the parser abstraction even higher.