1 // Copyright 2018 The Fuchsia Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef ZIRCON_SYSTEM_HOST_BANJO_INCLUDE_BANJO_FORMATTER_H_
6 #define ZIRCON_SYSTEM_HOST_BANJO_INCLUDE_BANJO_FORMATTER_H_
7 
8 #include <memory>
9 #include <regex>
10 #include <sstream>
11 
12 #include "tree_visitor.h"
13 
14 // A TreeVisitor (see tree_visitor.h) that pretty-prints BANJO code.
15 
16 namespace banjo {
17 namespace raw {
18 
19 class ScopedBool {
20 public:
21     ScopedBool(bool& source, bool value = true)
prev_value_(source)22         : prev_value_(source), source_(source) {
23         source_ = value;
24     }
~ScopedBool()25     ~ScopedBool() {
26         source_ = prev_value_;
27     }
28 
29 private:
30     bool prev_value_;
31     bool& source_;
32 };
33 
34 class ScopedIncrement {
35 public:
ScopedIncrement(int & source)36     ScopedIncrement(int& source)
37         : source_(source) {
38         source_++;
39     }
~ScopedIncrement()40     ~ScopedIncrement() {
41         source_--;
42     }
43 
44 private:
45     int& source_;
46 };
47 
48 // A Visitor that pretty prints its AST and makes its result available via
49 // formatted_output().
50 //
51 // Implementation note: The visitor mostly does the same thing on every node,
52 // which is encapsulated in OnSourceElementShared.  Where we have overridden a
53 // particular node's visitor, it usually means that slightly different behavior
54 // is needed for that language construct.  For example, using and const
55 // declarations respect leading blank lines if they are already there, and
56 // struct, enum, and interface declarations require leading blank lines.
57 class FormattingTreeVisitor : public DeclarationOrderTreeVisitor {
58 public:
FormattingTreeVisitor()59     FormattingTreeVisitor()
60         : last_source_location_(nullptr) {}
61 
OnInterfaceDeclaration(std::unique_ptr<InterfaceDeclaration> const & element)62     virtual void OnInterfaceDeclaration(std::unique_ptr<InterfaceDeclaration> const& element) override {
63         OnBlankLineRequiringNode();
64         TreeVisitor::OnInterfaceDeclaration(element);
65     }
66 
OnSourceElementStart(const SourceElement & element)67     virtual void OnSourceElementStart(const SourceElement& element) override {
68         OnSourceElementShared(element.start_);
69     }
70 
OnSourceElementEnd(const SourceElement & element)71     virtual void OnSourceElementEnd(const SourceElement& element) override {
72         OnSourceElementShared(element.end_);
73     }
74 
OnAttributeList(std::unique_ptr<AttributeList> const & element)75     virtual void OnAttributeList(std::unique_ptr<AttributeList> const& element) override {
76         // Disabling these in case we're in an interface method and it thinks
77         // the next line needs to be indented more.  We don't want an indent
78         // after a newline following an attribute list.  It will be reenabled by
79         // the next visited AST node in the method.
80         newline_means_indent_more_ = false;
81         interface_method_alignment_ = false;
82         // This prevents the above from being reenabled during our walk of the
83         // AttributeList
84         ScopedBool suppress(is_member_decl_, false);
85         TreeVisitor::OnAttributeList(element);
86     }
87 
OnUsing(std::unique_ptr<Using> const & element)88     virtual void OnUsing(std::unique_ptr<Using> const& element) override {
89         OnBlankLineRespectingNode();
90         ScopedBool mem(is_member_decl_);
91         TreeVisitor::OnUsing(element);
92     }
93 
OnConstDeclaration(std::unique_ptr<ConstDeclaration> const & element)94     virtual void OnConstDeclaration(std::unique_ptr<ConstDeclaration> const& element) override {
95         OnBlankLineRespectingNode();
96         ScopedBool mem(is_member_decl_);
97         TreeVisitor::OnConstDeclaration(element);
98     }
99 
OnEnumMember(std::unique_ptr<EnumMember> const & element)100     virtual void OnEnumMember(std::unique_ptr<EnumMember> const& element) override {
101         OnBlankLineRespectingNode();
102         ScopedBool mem(is_member_decl_);
103         TreeVisitor::OnEnumMember(element);
104     }
105 
OnEnumDeclaration(std::unique_ptr<EnumDeclaration> const & element)106     virtual void OnEnumDeclaration(std::unique_ptr<EnumDeclaration> const& element) override {
107         OnBlankLineRequiringNode();
108         TreeVisitor::OnEnumDeclaration(element);
109     }
110 
OnInterfaceMethod(std::unique_ptr<InterfaceMethod> const & element)111     virtual void OnInterfaceMethod(std::unique_ptr<InterfaceMethod> const& element) override {
112         interface_method_alignment_ = true;
113         interface_method_alignment_size_ = -1;
114         next_nonws_char_is_checkpoint_ = false;
115         OnBlankLineRespectingNode();
116         ScopedBool before(blank_space_before_colon_, false);
117         ScopedBool mem(is_member_decl_);
118         TreeVisitor::OnInterfaceMethod(element);
119     }
120 
OnStructDeclaration(std::unique_ptr<StructDeclaration> const & element)121     virtual void OnStructDeclaration(std::unique_ptr<StructDeclaration> const& element) override {
122         OnBlankLineRequiringNode();
123         TreeVisitor::OnStructDeclaration(element);
124     }
125 
OnStructMember(std::unique_ptr<StructMember> const & element)126     virtual void OnStructMember(std::unique_ptr<StructMember> const& element) override {
127         OnBlankLineRespectingNode();
128         ScopedBool mem(is_member_decl_);
129         TreeVisitor::OnStructMember(element);
130     }
131 
OnUnionDeclaration(std::unique_ptr<UnionDeclaration> const & element)132     virtual void OnUnionDeclaration(std::unique_ptr<UnionDeclaration> const& element) override {
133         OnBlankLineRequiringNode();
134         TreeVisitor::OnUnionDeclaration(element);
135     }
136 
OnUnionMember(std::unique_ptr<UnionMember> const & element)137     virtual void OnUnionMember(std::unique_ptr<UnionMember> const& element) override {
138         ScopedBool mem(is_member_decl_);
139         OnBlankLineRespectingNode();
140         TreeVisitor::OnUnionMember(element);
141     }
142 
OnType(std::unique_ptr<Type> const & element)143     virtual void OnType(std::unique_ptr<Type> const& element) override {
144         ScopedIncrement si(nested_type_depth_);
145         ScopedBool before_colon(blank_space_before_colon_, false);
146         ScopedBool after_colon(blank_space_after_colon_, false);
147         TreeVisitor::OnType(element);
148     }
149 
150     virtual void OnFile(std::unique_ptr<File> const& element) override;
151 
formatted_output()152     std::string* formatted_output() { return &formatted_output_; }
153 
154 private:
155     const char* last_source_location_;
156 
157     static constexpr int kIndentSpaces = 4;
158 
159     static constexpr char kWsCharacters[] = " \t\n\v\f\r";
160 
161     static constexpr char kWsCharactersNoNewline[] = " \t\v\f\r";
162 
163     std::string formatted_output_;
164 
165     // Indentations can be caused by nesting in the AST (e.g., if you've started
166     // a compound statement) or, in some situations, if there is a newline in
167     // the code (e.g., if you've decided to line break before the "->" operator
168     // in the middle of an interface definition).
169 
170     // What follows are adjustable properties.  We flip them from true to false
171     // at various points in the tree walk depending on what behavior we want.
172 
173     // When this is true, you get a blank line and indentation at the end of the
174     // segment.  This is true for top level decls that *require* blank lines:
175     // e.g., structs, unions, and interfaces.
176     bool blank_line_requiring_node_ = false;
177 
178     // When this is true, you get a blank line and indentation at the end of the
179     // segment if there is already a blank line there.  This is true for consts
180     // and using declarations.
181     bool blank_line_respecting_node_ = false;
182 
183     // When this is true, and a newline happens, you get an additional
184     // indentation.
185     bool newline_means_indent_more_ = false;
186 
187     // This is true in decl headers, but not after the ordinal in an interface
188     // method or in the element count for relevant types.
189     bool blank_space_before_colon_ = true;
190 
191     // This is true in decl headers and after the ordinal in an interface
192     // method, but not in the element count for relevant types.
193     bool blank_space_after_colon_ = true;
194 
195     // > characters require spaces after them unless you are in the middle of a
196     // parameterized type: arrays, vectors, handles, and requests.
197     int nested_type_depth_ = 0;
198 
199     // Interface methods have fancy alignment: if the last open parenthesis was
200     // at EOL, indentation is to the column that had beginning of method name +
201     // kIndentSpaces.  Otherwise, it is to the column with the last open
202     // parenthesis + 1.
203     bool interface_method_alignment_ = false;
204     int interface_method_alignment_size_ = -1;
205     int distance_from_last_newline_ = 0;
206     int offset_of_first_id_ = 0;
207     bool next_nonws_char_is_checkpoint_ = false;
208 
209     // When we complete a node and know the next thing needs to be whitespace
210     bool ws_required_next_ = false;
211 
212     // Some helper methods.
IsNonNewlineWS(char ch)213     static bool IsNonNewlineWS(char ch) {
214         return strchr(kWsCharactersNoNewline, ch) != nullptr;
215     }
216 
IsStartOfBlankLine(const std::string & str,int offset)217     static bool IsStartOfBlankLine(const std::string& str, int offset) {
218         for (int i = offset; i < str.size() && str[i] != '\n'; i++) {
219             if (!IsNonNewlineWS(str[i])) {
220                 return false;
221             }
222         }
223         return true;
224     }
225 
IsStartOfComment(std::string str,int i)226     static bool IsStartOfComment(std::string str, int i) {
227         return (i < str.size() - 1) && str[i] == '/' && str[i + 1] == '/';
228     }
229 
230     // If the string str at offset pos is the beginning of a comment, pos is
231     // modified to be the newline character at EOL.
MaybeWindPastComment(std::string str,int & pos)232     static void MaybeWindPastComment(std::string str, int& pos) {
233         if (IsStartOfComment(str, pos)) {
234             while (pos < str.size() && str[pos] != '\n') {
235                 pos++;
236             }
237         }
238     }
239 
240     // A "Segment" is a part of the source that we format - it extends from the
241     // end of the previously formatted AST node to the end of the first token
242     // represented in this AST node.
243     class Segment {
244     public:
Segment(std::string input,FormattingTreeVisitor * outer)245         Segment(std::string input, FormattingTreeVisitor* outer)
246             : output_(input), visitor_(outer) {}
247 
RemoveTrailingWS()248         void RemoveTrailingWS() {
249             std::string re("[");
250             re += kWsCharactersNoNewline;
251             re += "]+\n";
252             output_ = std::regex_replace(output_, std::regex(re), "\n");
253         }
254 
255         // Removes all of the whitespace at the beginning of every line.  Will
256         // add back indentation later.
RemoveLeadingWS()257         void RemoveLeadingWS() {
258             std::string re("\n[");
259             re += kWsCharactersNoNewline;
260             re += "]+";
261             output_ = std::regex_replace(output_, std::regex(re), "\n");
262         }
263 
264         void RemoveExtraBlankLines(bool respects_trailing_blankline);
265 
266         void InsertRequiredNewlines(bool is_top_level);
267 
268         // No non-' ' or '\n' whitespace
269         // One ws token before / after every ws-requiring character
270         // No non-newline ws before / after characters that don't want it.
271         void RegularizeSpaces(bool& ws_required_next);
272 
273         // Precondition: By now, everything should have had its leading ws
274         // stripped, and } characters are the first things on their own lines.
275         void Indent(int& current_nesting);
276 
GetOutput()277         std::string GetOutput() {
278             return output_;
279         }
280 
281     private:
RequiresWSBeforeChar(char ch)282         bool RequiresWSBeforeChar(char ch) {
283             return (ch == '{') ||
284                    (ch == '=') ||
285                    (visitor_->blank_space_before_colon_ && ch == ':');
286         }
287 
NoSpacesBeforeChar(char ch)288         bool NoSpacesBeforeChar(char ch) {
289             return NoWSBeforeChar(ch) ||
290                    (ch == ')') ||
291                    (ch == '?') ||
292                    (!visitor_->blank_space_before_colon_ && ch == ':') ||
293                    (visitor_->nested_type_depth_ > 0 && ch == '>');
294         }
295 
NoWSBeforeChar(char ch)296         bool NoWSBeforeChar(char ch) {
297             return (ch == ';');
298         }
299 
RequiresWSAfterChar(char ch)300         bool RequiresWSAfterChar(char ch) {
301             return (ch == '=') ||
302                    (ch == ',') ||
303                    (ch == '>' && (visitor_->nested_type_depth_ <= 1)) ||
304                    (ch == ':' && visitor_->blank_space_after_colon_);
305         }
306 
NoWSAfterChar(char ch)307         bool NoWSAfterChar(char ch) {
308             return (ch == ':' && !visitor_->blank_space_after_colon_) || (ch == '(');
309         }
310 
311         // As specified on the tin: erases multiple spaces from output_ at
312         // offset pos.  Optional |leave_this_many| parameter indicates number of
313         // spaces to leave; default is 1.  Optional |incl_newline| param
314         // indicates to erase newline characters as well as other WS.
315         // It returns the number of spaces that were deleted.
316         int EraseMultipleSpacesAt(int pos, int leave_this_many = 1, bool incl_newline = false);
317 
318         std::string output_;
319         FormattingTreeVisitor* visitor_;
320     };
321 
322     int current_nesting_ = 0;
323 
FormatAndPrintSegment(const std::string & segment)324     std::string FormatAndPrintSegment(const std::string& segment) {
325         Segment seg(segment, this);
326         seg.RemoveTrailingWS();
327         seg.RemoveLeadingWS();
328         seg.RemoveExtraBlankLines(blank_line_respecting_node_);
329         seg.RegularizeSpaces(ws_required_next_);
330         seg.InsertRequiredNewlines(blank_line_requiring_node_);
331         seg.Indent(current_nesting_);
332 
333         std::string output = seg.GetOutput();
334 
335         // We've respected prior blank lines for this decl (if it is a decl),
336         // and we should stop now.
337         blank_line_requiring_node_ = false;
338         blank_line_respecting_node_ = false;
339 
340         // If this was the start of a member decl, it was indented by
341         // kIndentSpaces.  Any other newlines inside the member decl should be
342         // indented additionally.
343         if (is_member_decl_) {
344             interface_method_alignment_ = true;
345             newline_means_indent_more_ = true;
346         }
347 
348         return output;
349     }
350 
OnBlankLineRequiringNode()351     void OnBlankLineRequiringNode() {
352         blank_line_requiring_node_ = true;
353         blank_line_respecting_node_ = true;
354     }
355 
OnBlankLineRespectingNode()356     void OnBlankLineRespectingNode() {
357         blank_line_respecting_node_ = true;
358     }
359 
360     bool is_member_decl_ = false;
361 
362     // str is a gap plus the next meaningful token.
363     void TrackInterfaceMethodAlignment(const std::string& str);
364 
OnSourceElementShared(const banjo::Token & current_token)365     void OnSourceElementShared(const banjo::Token& current_token) {
366         const char* ws_location = current_token.previous_end().data().data();
367         // Printed code must increase in monotonic order, for two reasons.
368         // First of all, we don't reorder anything.  Second of all, the start
369         // token for an identifier list (for example) is the same as the start
370         // token for the first identifier, so we need to make sure we don't
371         // print that token twice.
372         if (ws_location > last_source_location_) {
373             int size = (int)(current_token.data().data() - current_token.previous_end().data().data());
374             std::string gap(ws_location, size);
375             std::string content(current_token.data().data(), current_token.data().size());
376             std::string total_string = FormatAndPrintSegment(gap + content);
377             TrackInterfaceMethodAlignment(total_string);
378             formatted_output_ += total_string;
379             last_source_location_ = ws_location;
380         }
381     }
382 };
383 
384 } // namespace raw
385 } // namespace banjo
386 
387 #endif // ZIRCON_SYSTEM_HOST_BANJO_INCLUDE_BANJO_FORMATTER_H_
388