Python: Add parser support for template strings

tausbn · tausbn · commit 287e18d02c54 · 2025-12-04T16:31:16.000Z
- Extends the scanner with a new token kind representing the start of a
template string. This is used to distinguish template strings from
regular strings (because only a template string will start with a
`_template_string_start` external token).

- Cleans up the logic surrounding interpolations (and the method names)
so that format strings and template strings are handled identically
when scanning interpolations.

Finally, we add two new node types in the tree-sitter grammar:

- `template_string` behaves like format strings, but is a distinct type
(mainly so that an implicit concatenation between template strings and
regular strings becomes a syntax error).
- `concatenated_template_string` is the counterpart of
`concatenated_string`.

However, internally, the string parts of a template string are just the
same `string_content` nodes that are used in regular format strings. We
will disambiguate these inside `tsg-python`.
diff --git a/python/extractor/tsg-python/tsp/grammar.js b/python/extractor/tsg-python/tsp/grammar.js
@@ -55,6 +55,7 @@ module.exports = grammar({
     $._string_start,
     $._string_content,
     $._string_end,
+    $._template_string_start,
   ],
 
   inline: $ => [
@@ -423,6 +424,8 @@ module.exports = grammar({
       ),
       $.string,
       $.concatenated_string,
+      $.template_string,
+      $.concatenated_template_string,
       $.none,
       $.true,
       $.false
@@ -765,6 +768,8 @@ module.exports = grammar({
       $.keyword_identifier,
       $.string,
       $.concatenated_string,
+      $.template_string,
+      $.concatenated_template_string,
       $.integer,
       $.float,
       $.true,
@@ -1099,6 +1104,20 @@ module.exports = grammar({
       field('suffix', alias($._string_end, '"'))
     ),
 
+    concatenated_template_string: $ => seq(
+      $.template_string,
+      repeat1($.template_string)
+    ),
+
+    template_string: $ => seq(
+      field('prefix', alias($._template_string_start, '"')),
+      repeat(choice(
+        field('interpolation', $.interpolation),
+        field('string_content', $.string_content)
+      )),
+      field('suffix', alias($._string_end, '"'))
+    ),
+
     string_content: $ => prec.right(0, repeat1(
       choice(
         $._escape_interpolation,
diff --git a/python/extractor/tsg-python/tsp/src/scanner.cc b/python/extractor/tsg-python/tsp/src/scanner.cc
@@ -17,6 +17,7 @@ enum TokenType {
   STRING_START,
   STRING_CONTENT,
   STRING_END,
+  TEMPLATE_STRING_START,
 };
 
 struct Delimiter {
@@ -28,6 +29,7 @@ struct Delimiter {
     Format = 1 << 4,
     Triple = 1 << 5,
     Bytes = 1 << 6,
+    Template = 1 << 7,
   };
 
   Delimiter() : flags(0) {}
@@ -36,6 +38,14 @@ struct Delimiter {
     return flags & Format;
   }
 
+  bool is_template() const {
+    return flags & Template;
+  }
+
+  bool can_interpolate() const {
+    return is_format() || is_template();
+  }
+
   bool is_raw() const {
     return flags & Raw;
   }
@@ -59,6 +69,10 @@ struct Delimiter {
     flags |= Format;
   }
 
+  void set_template() {
+    flags |= Template;
+  }
+
   void set_raw() {
     flags |= Raw;
   }
@@ -154,7 +168,7 @@ struct Scanner {
       int32_t end_character = delimiter.end_character();
       bool has_content = false;
       while (lexer->lookahead) {
-        if ((lexer->lookahead == '{' || lexer->lookahead == '}') && delimiter.is_format()) {
+        if ((lexer->lookahead == '{' || lexer->lookahead == '}') && delimiter.can_interpolate()) {
           lexer->mark_end(lexer);
           lexer->result_symbol = STRING_CONTENT;
           return has_content;
@@ -322,13 +336,17 @@ struct Scanner {
       }
     }
 
-    if (first_comment_indent_length == -1 && valid_symbols[STRING_START]) {
+    bool expects_string_start = valid_symbols[STRING_START] || valid_symbols[TEMPLATE_STRING_START];
+
+    if (first_comment_indent_length == -1 && expects_string_start) {
       Delimiter delimiter;
 
       bool has_flags = false;
       while (lexer->lookahead) {
         if (lexer->lookahead == 'f' || lexer->lookahead == 'F') {
           delimiter.set_format();
+        } else if (lexer->lookahead == 't' || lexer->lookahead == 'T') {
+          delimiter.set_template();
         } else if (lexer->lookahead == 'r' || lexer->lookahead == 'R') {
           delimiter.set_raw();
         } else if (lexer->lookahead == 'b' || lexer->lookahead == 'B') {
@@ -372,7 +390,7 @@ struct Scanner {
 
       if (delimiter.end_character()) {
         delimiter_stack.push_back(delimiter);
-        lexer->result_symbol = STRING_START;
+        lexer->result_symbol = delimiter.is_template() ? TEMPLATE_STRING_START : STRING_START;
         return true;
       } else if (has_flags) {
         return false;