Recursive Regex for Parsing SGF with JS

1. What SGF is

SGF is what’s widely used to save Go (board game) games as text. In essence, it’s basically the text encoding of something like a trie.

Here’s an example of what it looks like — I suggest using software like the Sabaki editor to create files and then look at the resulting text in it —:

(;GM[1]FF[4]CA[UTF-8]AP[Sabaki:0.52.2]KM[6.5]SZ[19]DT[2023-12-25];B[pd];W[dd](;B[pq];W[dp])(;B[dp];W[pp]))

A more readable version, only for reference — the comments are not part of the SGF specification actually —:

(
  ;GM[1]FF[4]CA[UTF-8]AP[Sabaki:0.52.2]KM[6.5]SZ[19]DT[2023-12-25] // Game Metadata
  ;B[pd] // Black's Move (`pd` = coordinates on the board)
  ;W[dd] // White's Move
    ( // Parentheses denote a branch in the tree
      ;B[pq]
      ;W[dp]
    )
    (
      ;B[dp]
      ;W[pp]
    )
)

The whole SGF grammar is described like this:

Collection     = { GameTree }
GameTree       = "(" RootNode NodeSequence { Tail } ")"
Tail           = "(" NodeSequence { Tail } ")"
NodeSequence   = { Node }
RootNode       = Node
Node           = ";" { Property }
Property       = PropIdent PropValue { PropValue }
PropIdent      = UcLetter { UcLetter }
PropValue      = "[" Value "]"
UcLetter       = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" |
                 "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" |
                 "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z"

2. What I’m Looking For

What I’m looking for is a recursive regex capable of turning an SGF string down to a recursive tree object in JS. Checking if the file is valid SGF would be a plus, but not necessary.

The tree shape would be something like this — this one comes from Sabaki’s SGF Parser —:

{
  id: <Primitive>,
  data: {
    [property]: <Array<String>>
  },
  parentId: <Primitive> | null,
  children: <Array<NodeObject>>
}

ECMA, the standard for JS, doesn’t comprise recursive regexes. So, an external package would probably necessary, like the XRegExp package, mentioned in this answer.

3. My Non-Regex Solution

Others have, in the past, parsed SGF through various means, I’ll leave my own version here for reference, as it might help in understanding the problem — you can also find it here on Github. It uses a tree class and some sort of flattened recursion:

export class SgfTree {
  constructor(
    // The pointer to the parent isn't really necessary, but
    // it makes parsing much easier.
    public parent?: SgfTree,
    public data: string = "",
    public children: SgfTree[] = []
  ) {}

  toJSON(): Object {
    return {
      data: this.data,
      children: this.children.map((c) => c.toJSON()),
    };
  }
}

// An SGF tree is basically a *trie* data structure encoded
// in text.
//
// I bet you could also do the whole parsing with only
// Regexes. (I think I'm gonna create a Stack Overflow
// question for this.)
export function parseSgf(sgf: string) {
  // 1. Cleanup
  sgf = sgf
    .replaceAll("n", "")
    .replaceAll("t", "")
    .replaceAll(" ", "")
    .trim();

  // 2. Initialization
  const trees = new SgfTree();
  let currentTree: SgfTree = trees;
  let currentString: string = "";

  // 3. Flattened Recursion
  for (const char of sgf) {
    switch (char) {
      case "(":
        // 3.1. Opening a Branch
        currentTree.data = currentString;
        const newTree = new SgfTree(currentTree);
        currentTree.children.push(newTree);
        currentTree = newTree;
        currentString = "";
        break;
      case ")":
        // 3.2. Closing the Current Branch and Going Back to the
        //      Parent.
        parseMovesAndMetadata(currentString);
        currentTree.data = currentString;
        currentTree = currentTree.parent!;
        currentString = currentTree.data;
        break;
      default:
        currentString += char;
    }
  }

  return trees.children;
}

// Not all the SGF fields, but probably the most common ones...
export type SgfData = {
  // 1. Metadata
  GM?: "1"; // Game Type (GM = "1" is Go)
  FF?: string; // File Format
  CA?: string; // Character Set
  AP?: string; // Application used to produce the file
  // 2. Game Info
  KM?: string; // Komi
  SZ?: string; // Board Size
  DT?: string; // Date
  HA?: string; // Number of Handicap Stones
  RU?: string; // Rules Set in Use
  GN?: string; // Game Name
  EV?: string; // Event
  // 3. Players
  PB?: string; // Black Player
  BR?: string; // Black's Rating
  PW?: string; // White Player
  WR?: string; // White's Rating
  // 4. Comments
  C?: string; // (Move) Comments
  GC?: string; // Game Comment
  // 5. Editing the Goban
  PL?: string; // Who plays next
  AB?: string; // Add Black stones
  AW?: string; // Add White stones
  // 6. Moves
  B?: string; // What Black plays
  W?: string; // What White Plays
};

// TODO: Complete
function parseMovesAndMetadata(sgfData: string) {
  const metadataAndMoves = sgfData
    .split(";")
    .filter((m) => m !== "");

  const regex =
    /(?<key>[A-Z](?:s*[A-Z])*)[(?<value>(?:\]|[^]])*)/g;
  const matches = [...metadataAndMoves[0].matchAll(regex)];

  console.log(matches[0].groups!["value"]);
}

// Straight Branch
const test1 = `
  (
    ;GM[1]FF[4]CA[UTF-8]AP[Sabaki:0.52.2]KM[6.5]SZ[19]DT[2023-12-25]
    ;B[pd]
    ;W[dd]
    ;B[pq]
    ;W[dp]
  )
`;
// Two Branches
const test2 = `
  (
    ;GM[1]FF[4]CA[UTF-8]AP[Sabaki:0.52.2]KM[6.5]SZ[19]DT[2023-12-25]
    ;B[pd]
    ;W[dd]
      (
        ;B[pq]
        ;W[dp]
      )
      (
        ;B[dp]
        ;W[pp]
      )
  )
`;
// Two Branches + Added (Edited) Stones
const test3 = `
  (
    ;GM[1]FF[4]CA[UTF-8]AP[Sabaki:0.52.2]KM[6.5]SZ[19]DT[2023-12-25]
    ;B[pd]
    ;W[dd]
      (
        ;B[pq]
        ;W[dp]
      )
      (
        ;B[dp]
        ;W[pp]
        ;PL[B]AE[jk]AB[jj]AW[ji]
        ;B[jq]
      )
  )
`;
// Two Branches + Added (Edited) Stones + Comments
const test4 = `
  (
    ;GM[1]FF[4]CA[UTF-8]AP[Sabaki:0.52.2]KM[6.5]SZ[19]DT[2023-12-25]
    ;B[pd]C[Comment on move.]
    ;W[dd]
      (
        ;B[pq]
        ;W[dp]
      )
      (
        ;B[dp]
        ;W[pp]
        ;PL[B]AE[jk]AB[jj]AW[ji]C[Comment on editing.]
        ;B[jq]
      )
  )
`;

const sgf = parseSgf(test4);
const sgfAsJSON = sgf.map((c) => c.toJSON());
const prettyPrintSgf = JSON.stringify(sgfAsJSON, null, 2);

// console.log(prettyPrintSgf);

4. References