Skip to content

Commit 32f6558

Browse files
Added: grammar parsing (#3)
* grammar added * clang-tidy fix * Fixed: run_clang_format.py * Constructors, getters and setters * Added: Grammar Parser * Fixed: lint * Updated: grammar file parser * Added: grammar.g * Added: grammar.g Co-authored-by: noob77777 <[email protected]>
1 parent df0592d commit 32f6558

26 files changed

+1199
-15
lines changed

README.md

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,19 +33,17 @@ $ mkdir build
3333
$ cd build
3434
$ cmake -GNinja -DCMAKE_BUILD_TYPE=Release ..
3535
$ ninja
36-
$ ninja test
37-
36+
$ ./bin/jucc_test
3837
```
3938

40-
Before pushing or making a pull request
39+
Before pushing or making a pull request ( The tests must pass, compulsory !! )
4140

4241
```
4342
$ ninja
4443
$ ninja check-format
4544
$ ninja check-clang-tidy
4645
$ ninja check-lint
47-
$ ninja test ( The tests must pass, compulsory !! )
48-
46+
$ ninja test
4947
```
5048

5149
To add a new unit test, make a folder with the same relative path as in the src folder, and define your test. Please refer to [docs](https://github.com/TheSYNcoder/JuCC/tree/main/docs/) for more details about writing tests using the [googletest](https://github.com/google/googletest) framework.

build_support/run_clang_format.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ def check(arguments, source_dir):
9696
# Print out the diff to stderr
9797
error = True
9898
sys.stderr.writelines(diff)
99+
99100
return error
100101

101102

@@ -127,6 +128,6 @@ def check(arguments, source_dir):
127128
exclude_globs = [line.strip() for line in open(args.exclude_globs)]
128129
for source_dir in args.source_dirs.split(','):
129130
if len(source_dir) > 0:
130-
had_err = check(args, source_dir)
131+
had_err = had_err or check(args, source_dir)
131132

132133
sys.exit(1 if had_err else 0)

src/grammar/grammar.cpp

Lines changed: 253 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,253 @@
1+
#include "grammar/grammar.h"
2+
3+
#include <sstream>
4+
#include <unordered_map>
5+
#include <unordered_set>
6+
7+
namespace jucc::grammar {
8+
9+
Parser::Parser(const char *filepath) { file_ = std::ifstream(filepath); }
10+
11+
Parser::~Parser() {
12+
if (file_.is_open()) {
13+
file_.close();
14+
}
15+
}
16+
17+
std::vector<std::string> Parser::FastTokenize(const std::string &s) {
18+
std::vector<std::string> res;
19+
std::stringstream ss(s);
20+
std::string token;
21+
while (ss >> token) {
22+
res.push_back(token);
23+
}
24+
return res;
25+
}
26+
27+
/**
28+
* This is based on a basic state machine that implicitly uses a grammar to parse.
29+
* The parse states represent a block of a .g grammar file.
30+
* Eg. ParseState TERMINALS imples we have seen a %terminals token and yet to see
31+
* a % block closing token.
32+
* RuleState keeps track of additonal states required to parse a rule inside of a
33+
* %rule block.
34+
*/
35+
bool Parser::Parse() {
36+
// Parser State
37+
enum ParseState { BASIC, TERMINALS, NON_TERMINALS, START, RULES };
38+
39+
// Parser State while parsing productions
40+
// only valid when Parser State is RULES
41+
enum RuleState { LEFT, COLON, ENTITY };
42+
43+
const std::string terminals_start = "%terminals";
44+
const std::string non_terminals_start = "%non_terminals";
45+
const std::string start_start = "%start";
46+
const std::string rules_start = "%rules";
47+
const std::string state_reset_token = "%";
48+
49+
// check if file is open else return false
50+
if (file_.is_open()) {
51+
ParseState curr_parse_state = BASIC;
52+
std::string line;
53+
54+
// ugh...
55+
std::unordered_map<std::string, std::vector<std::vector<std::string>>> grammar;
56+
57+
while (getline(file_, line)) {
58+
// states for rule parsing
59+
RuleState curr_rule_state = LEFT;
60+
std::string production_parent;
61+
std::vector<std::string> rule_entities;
62+
63+
// ignore empty lines and comments
64+
if (line.length() == 0 || line[0] == '#') {
65+
continue;
66+
}
67+
68+
// tokenize line
69+
std::vector<std::string> tokens = FastTokenize(line);
70+
71+
// update parser state
72+
if (tokens[0] == terminals_start) {
73+
if (curr_parse_state != BASIC || tokens.size() > 1) {
74+
error_ = "grammar parsing error: invalid token %terminals";
75+
return false;
76+
}
77+
curr_parse_state = TERMINALS;
78+
tokens.erase(tokens.begin());
79+
} else if (tokens[0] == non_terminals_start) {
80+
if (curr_parse_state != BASIC || tokens.size() > 1) {
81+
error_ = "grammar parsing error: invalid token %non_terminals";
82+
return false;
83+
}
84+
curr_parse_state = NON_TERMINALS;
85+
tokens.erase(tokens.begin());
86+
} else if (tokens[0] == start_start) {
87+
if (curr_parse_state != BASIC || tokens.size() > 1) {
88+
error_ = "grammar parsing error: invalid token %start";
89+
return false;
90+
}
91+
curr_parse_state = START;
92+
tokens.erase(tokens.begin());
93+
} else if (tokens[0] == rules_start) {
94+
if (curr_parse_state != BASIC || tokens.size() > 1) {
95+
error_ = "grammar parsing error: invalid token %rules";
96+
return false;
97+
}
98+
curr_parse_state = RULES;
99+
tokens.erase(tokens.begin());
100+
} else if (tokens[0] == state_reset_token) {
101+
if (curr_parse_state == BASIC || tokens.size() > 1) {
102+
error_ = "grammar parsing error: invalid token %";
103+
return false;
104+
}
105+
curr_parse_state = BASIC;
106+
tokens.erase(tokens.begin());
107+
}
108+
109+
// iterate over tokens
110+
// add tokens to different lists as a function of Parser State
111+
// for Parser State RULES parse the production following implicit grammar:
112+
// "LEFT : ENTITY_LIST"
113+
// "ENTITY_LIST: ENTITY ENTITY_LIST | ENTITY"
114+
for (const auto &token : tokens) {
115+
switch (curr_parse_state) {
116+
case BASIC:
117+
error_ = "grammar parsing error: invalid token outside block: " + token;
118+
return false;
119+
break;
120+
121+
case TERMINALS:
122+
if (token == std::string(EPSILON)) {
123+
error_ = "grammar parsing error: EPSILON is reserved";
124+
return false;
125+
}
126+
terminals_.push_back(token);
127+
break;
128+
129+
case NON_TERMINALS:
130+
if (token == std::string(EPSILON)) {
131+
error_ = "grammar parsing error: EPSILON is reserved";
132+
return false;
133+
}
134+
non_terminals_.push_back(token);
135+
break;
136+
137+
case START:
138+
if (!start_symbol_.empty() || token == std::string(EPSILON)) {
139+
error_ = "grammar parsing error: ambiguous start symbol";
140+
return false;
141+
}
142+
start_symbol_ = token;
143+
break;
144+
145+
case RULES:
146+
switch (curr_rule_state) {
147+
case LEFT:
148+
if (token == std::string(EPSILON)) {
149+
error_ = "grammar parsing error: production cannot start with EPSILON";
150+
return false;
151+
}
152+
production_parent = token;
153+
curr_rule_state = COLON;
154+
break;
155+
156+
case COLON:
157+
if (token != ":") {
158+
error_ = "grammar parsing error: rules syntax error ':' expected: " + token;
159+
return false;
160+
}
161+
curr_rule_state = ENTITY;
162+
break;
163+
164+
case ENTITY:
165+
rule_entities.push_back(token);
166+
break;
167+
168+
default:
169+
break;
170+
}
171+
default:
172+
break;
173+
}
174+
}
175+
176+
if (curr_parse_state == RULES) {
177+
if (curr_rule_state == ENTITY) {
178+
grammar[production_parent].push_back(rule_entities);
179+
}
180+
if (curr_rule_state == COLON) {
181+
error_ = "grammar parsing error: rules syntax error ':' expected";
182+
return false;
183+
}
184+
}
185+
}
186+
187+
// sanity checks
188+
if (curr_parse_state != BASIC) {
189+
error_ = "grammar parsing error: block is incomplete '%' expected";
190+
return false;
191+
}
192+
193+
std::unordered_set<std::string> terminals;
194+
std::unordered_set<std::string> non_terminals;
195+
196+
for (const auto &terminal : terminals_) {
197+
terminals.insert(terminal);
198+
}
199+
200+
if (terminals.size() != terminals_.size()) {
201+
error_ = "grammar parsing error: inconsistent terminals";
202+
return false;
203+
}
204+
205+
for (const auto &nterminal : non_terminals_) {
206+
if (terminals.find(nterminal) != terminals.end()) {
207+
error_ = "grammar parsing error: terminals and non_terminals not disjoint";
208+
return false;
209+
}
210+
non_terminals.insert(nterminal);
211+
}
212+
213+
if (non_terminals.size() != non_terminals_.size()) {
214+
error_ = "grammar parsing error: inconsistent non_terminals";
215+
return false;
216+
}
217+
218+
// convert std::unordered_map to Production object with checks
219+
for (const auto &production : grammar) {
220+
Production prod;
221+
if (non_terminals.find(production.first) == non_terminals.end()) {
222+
error_ = "grammar parsing error: non_terminal not found: " + production.first;
223+
return false;
224+
}
225+
prod.SetParent(production.first);
226+
227+
std::vector<Rule> rules;
228+
for (const auto &rule : production.second) {
229+
Rule prod_rule;
230+
for (const auto &entity : rule) {
231+
if (non_terminals.find(entity) == non_terminals.end() && terminals.find(entity) == terminals.end() &&
232+
entity != std::string(EPSILON)) {
233+
error_ = "grammar parsing error: rule token is not defined: " + entity;
234+
return false;
235+
}
236+
}
237+
prod_rule.SetEntities(rule);
238+
rules.push_back(prod_rule);
239+
}
240+
241+
prod.SetRules(rules);
242+
243+
// add Production object created to list of valid productions
244+
grammar_.push_back(prod);
245+
}
246+
247+
return true;
248+
}
249+
250+
error_ = "grammar parsing error: file not found";
251+
return false;
252+
}
253+
} // namespace jucc::grammar

src/grammar/grammar.g

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
## This is the grammar file for JuCC
2+
## Edit this file to make changes to the parsing grammar
3+
## Epsilon is represented by special string EPSILON
4+
5+
## Terminals
6+
%terminals
7+
else float if int void
8+
( ) { } * + - / % ,
9+
<< >> < > <= >= = == != ;
10+
identifier integer_constant float_constant
11+
%
12+
13+
## Non Terminals
14+
%non_terminals
15+
<primary_expression> <constant> <unary_operator> <unary_expression>
16+
<cast_expression> <type_specifier> <multiplicative_expression>
17+
<additive_expression> <shift_expression> <relational_expression>
18+
<equality_expression> <assignment_expression> <expression>
19+
<declaration> <init_declarator_list> <init_declarator>
20+
<initializer> <declarator> <direct_declarator>
21+
<statement> <compound_statement> <block_item_list> <block_item>
22+
<expression_statement> <selection_statement> <program>
23+
%
24+
25+
## Start Symbol
26+
%start
27+
<program>
28+
%
29+
30+
## Grammar for the language
31+
%rules
32+
## Expressions
33+
<primary_expression> : identifier
34+
<primary_expression> : <constant>
35+
<primary_expression> : ( <expression> )
36+
<constant> : integer_constant
37+
<constant> : float_constant
38+
<unary_operator> : +
39+
<unary_operator> : -
40+
<unary_expression> : <primary_expression>
41+
<unary_expression> : <unary_operator> <primary_expression>
42+
<cast_expression> : <unary_expression>
43+
<cast_expression> : ( <type_specifier> ) <cast_expression>
44+
<multiplicative_expression> : <multiplicative_expression> * <cast_expression>
45+
<multiplicative_expression> : <multiplicative_expression> / <cast_expression>
46+
<multiplicative_expression> : <multiplicative_expression> % <cast_expression>
47+
<additive_expression> : <multiplicative_expression>
48+
<additive_expression> : <additive_expression> + <multiplicative_expression>
49+
<additive_expression> : <additive_expression> - <multiplicative_expression>
50+
<shift_expression> : <additive_expression>
51+
<shift_expression> : <shift_expression> << <additive_expression>
52+
<shift_expression> : <shift_expression> >> <additive_expression>
53+
<relational_expression> : <shift_expression>
54+
<relational_expression> : <relational_expression> < <shift_expression>
55+
<relational_expression> : <relational_expression> > <shift_expression>
56+
<relational_expression> : <relational_expression> <= <shift_expression>
57+
<relational_expression> : <relational_expression> >= <shift_expression>
58+
<equality_expression> : <relational_expression>
59+
<equality_expression> : <equality_expression> == <relational_expression>
60+
<equality_expression> : <equality_expression> != <relational_expression>
61+
<assignment_expression> : <equality_expression>
62+
<assignment_expression> : identifier = <assignment_expression>
63+
<expression> : <assignment_expression>
64+
65+
## Declarations
66+
<declaration> : <type_specifier> <init_declarator_list> ;
67+
<init_declarator_list> : <init_declarator>
68+
<init_declarator_list> : <init_declarator_list> , <init_declarator>
69+
<init_declarator_list> : EPSILON
70+
<init_declarator> : <declarator>
71+
<init_declarator> : <declarator> = <initializer>
72+
<type_specifier> : void
73+
<type_specifier> : int
74+
<type_specifier> : float
75+
<declarator> : <direct_declarator>
76+
<direct_declarator> : identifier
77+
<direct_declarator> : ( <declarator> )
78+
<initializer> : <assignment_expression>
79+
80+
## Statements
81+
<statement> : <compound_statement>
82+
<statement> : <expression_statement>
83+
<statement> : <selection_statement>
84+
<compound_statement> : { <block_item_list> }
85+
<block_item_list> : <block_item>
86+
<block_item_list> : <block_item_list> <block_item>
87+
<block_item_list> : EPSILON
88+
<block_item> : <declaration>
89+
<block_item> : <statement>
90+
<expression_statement> : <expression> ;
91+
<expression_statement> : ;
92+
<selection_statement> : if ( <expression> ) <statement>
93+
<selection_statement> : if ( <expression> ) <statement> else <statement>
94+
95+
## Main
96+
<program> : <type_specifier> identifier ( ) <compound_statement>
97+
%

0 commit comments

Comments
 (0)