Skip to content

Commit 287e18d

Browse files
committed
Python: Add parser support for template strings
- Extends the scanner with a new token kind representing the start of a template string. This is used to distinguish template strings from regular strings (because only a template string will start with a `_template_string_start` external token).
- Cleans up the logic surrounding interpolations (and the method names) so that format strings and template strings are handled the same way when scanning interpolations.

Finally, we add two new node types in the tree-sitter grammar:

- `template_string` behaves like a format string, but is a distinct type (mainly so that an implicit concatenation between template strings and regular strings becomes a syntax error).
- `concatenated_template_string` is the counterpart of `concatenated_string`.

However, internally, the string parts of a template string are just the same `string_content` nodes that are used in regular format strings. We will disambiguate these inside `tsg-python`.
1 parent 8b89e15 commit 287e18d

File tree

2 files changed

+40
-3
lines changed

2 files changed

+40
-3
lines changed

python/extractor/tsg-python/tsp/grammar.js

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -55,6 +55,7 @@ module.exports = grammar({
5555
$._string_start,
5656
$._string_content,
5757
$._string_end,
58+
$._template_string_start,
5859
],
5960

6061
inline: $ => [
@@ -423,6 +424,8 @@ module.exports = grammar({
423424
),
424425
$.string,
425426
$.concatenated_string,
427+
$.template_string,
428+
$.concatenated_template_string,
426429
$.none,
427430
$.true,
428431
$.false
@@ -765,6 +768,8 @@ module.exports = grammar({
765768
$.keyword_identifier,
766769
$.string,
767770
$.concatenated_string,
771+
$.template_string,
772+
$.concatenated_template_string,
768773
$.integer,
769774
$.float,
770775
$.true,
@@ -1099,6 +1104,20 @@ module.exports = grammar({
10991104
field('suffix', alias($._string_end, '"'))
11001105
),
11011106

1107+
concatenated_template_string: $ => seq(
1108+
$.template_string,
1109+
repeat1($.template_string)
1110+
),
1111+
1112+
template_string: $ => seq(
1113+
field('prefix', alias($._template_string_start, '"')),
1114+
repeat(choice(
1115+
field('interpolation', $.interpolation),
1116+
field('string_content', $.string_content)
1117+
)),
1118+
field('suffix', alias($._string_end, '"'))
1119+
),
1120+
11021121
string_content: $ => prec.right(0, repeat1(
11031122
choice(
11041123
$._escape_interpolation,

python/extractor/tsg-python/tsp/src/scanner.cc

Lines changed: 21 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@ enum TokenType {
1717
STRING_START,
1818
STRING_CONTENT,
1919
STRING_END,
20+
TEMPLATE_STRING_START,
2021
};
2122

2223
struct Delimiter {
@@ -28,6 +29,7 @@ struct Delimiter {
2829
Format = 1 << 4,
2930
Triple = 1 << 5,
3031
Bytes = 1 << 6,
32+
Template = 1 << 7,
3133
};
3234

3335
Delimiter() : flags(0) {}
@@ -36,6 +38,14 @@ struct Delimiter {
3638
return flags & Format;
3739
}
3840

41+
bool is_template() const {
42+
return flags & Template;
43+
}
44+
45+
bool can_interpolate() const {
46+
return is_format() || is_template();
47+
}
48+
3949
bool is_raw() const {
4050
return flags & Raw;
4151
}
@@ -59,6 +69,10 @@ struct Delimiter {
5969
flags |= Format;
6070
}
6171

72+
void set_template() {
73+
flags |= Template;
74+
}
75+
6276
void set_raw() {
6377
flags |= Raw;
6478
}
@@ -154,7 +168,7 @@ struct Scanner {
154168
int32_t end_character = delimiter.end_character();
155169
bool has_content = false;
156170
while (lexer->lookahead) {
157-
if ((lexer->lookahead == '{' || lexer->lookahead == '}') && delimiter.is_format()) {
171+
if ((lexer->lookahead == '{' || lexer->lookahead == '}') && delimiter.can_interpolate()) {
158172
lexer->mark_end(lexer);
159173
lexer->result_symbol = STRING_CONTENT;
160174
return has_content;
@@ -322,13 +336,17 @@ struct Scanner {
322336
}
323337
}
324338

325-
if (first_comment_indent_length == -1 && valid_symbols[STRING_START]) {
339+
bool expects_string_start = valid_symbols[STRING_START] || valid_symbols[TEMPLATE_STRING_START];
340+
341+
if (first_comment_indent_length == -1 && expects_string_start) {
326342
Delimiter delimiter;
327343

328344
bool has_flags = false;
329345
while (lexer->lookahead) {
330346
if (lexer->lookahead == 'f' || lexer->lookahead == 'F') {
331347
delimiter.set_format();
348+
} else if (lexer->lookahead == 't' || lexer->lookahead == 'T') {
349+
delimiter.set_template();
332350
} else if (lexer->lookahead == 'r' || lexer->lookahead == 'R') {
333351
delimiter.set_raw();
334352
} else if (lexer->lookahead == 'b' || lexer->lookahead == 'B') {
@@ -372,7 +390,7 @@ struct Scanner {
372390

373391
if (delimiter.end_character()) {
374392
delimiter_stack.push_back(delimiter);
375-
lexer->result_symbol = STRING_START;
393+
lexer->result_symbol = delimiter.is_template() ? TEMPLATE_STRING_START : STRING_START;
376394
return true;
377395
} else if (has_flags) {
378396
return false;

0 commit comments

Comments
 (0)