// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
// Exceptions. See /LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#ifndef CARBON_TOOLCHAIN_LEX_TOKENIZED_BUFFER_H_
#define CARBON_TOOLCHAIN_LEX_TOKENIZED_BUFFER_H_

#include <cstdint>
#include <utility>

#include "common/ostream.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/raw_ostream.h"
#include "toolchain/base/index_base.h"
#include "toolchain/base/mem_usage.h"
#include "toolchain/base/shared_value_stores.h"
#include "toolchain/diagnostics/diagnostic_emitter.h"
#include "toolchain/lex/token_index.h"
#include "toolchain/lex/token_info.h"
#include "toolchain/lex/token_kind.h"
#include "toolchain/source/source_buffer.h"

namespace Carbon::Lex {

class TokenizedBuffer;

struct LineInfo {
  explicit LineInfo(int32_t start) : start(start), indent(0) {}

  // Zero-based byte offset of the start of the line within the source buffer
  // provided.
  int32_t start;

  // The byte offset from the start of the line of the first non-whitespace
  // character.
  int32_t indent;
};

// A lightweight handle to a lexed `LineInfo` in a `TokenizedBuffer`.
//
// `LineIndex` objects are designed to be passed by value, not reference or
// pointer. They are also designed to be small and efficient to store in data
// structures.
//
// Each `LineIndex` object refers to a specific line in the source code that
// was lexed. They can be compared directly to establish that they refer to
// the same line or the relative position of different lines within the
// source.
//
// All other APIs to query a `LineIndex` are on the `TokenizedBuffer`.
struct LineIndex : public IndexBase<LineIndex> {
  static constexpr llvm::StringLiteral Label = "line";
  static const LineIndex None;

  using IndexBase::IndexBase;
};

constexpr LineIndex LineIndex::None(NoneIndex);

// A comment, which can be a block of lines. These are tracked separately from
// tokens because they don't affect the parse; if they were part of the token
// sequence, we'd need more general special-casing within token logic.
//
// Note that `CommentInfo` is the API type used to expose a comment.
struct CommentData {
  // Zero-based byte offset of the start of the comment within the source
  // buffer provided.
  int32_t start;

  // The comment's length.
  int32_t length;
};

// Indices for `CommentData` within the buffer.
struct CommentIndex : public IndexBase<CommentIndex> {
  static constexpr llvm::StringLiteral Label = "comment";
  static const CommentIndex None;

  using IndexBase::IndexBase;
};

constexpr CommentIndex CommentIndex::None(NoneIndex);

// Random-access iterator over comments within the buffer.
using CommentIterator = IndexIterator<CommentIndex>;

// Random-access iterator over tokens within the buffer.
using TokenIterator = IndexIterator<TokenIndex>;

// A token range that is inclusive of both the begin and end tokens.
struct InclusiveTokenRange {
  TokenIndex begin;
  TokenIndex end;
};
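
// Because token indices are ordered, membership in an `InclusiveTokenRange`
// reduces to two comparisons. A minimal sketch (`TokenInRange` is a
// hypothetical helper, not part of this API):
//
//   auto TokenInRange(InclusiveTokenRange range, TokenIndex token) -> bool {
//     return range.begin <= token && token <= range.end;
//   }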

// A buffer of tokenized Carbon source code.
//
// This is constructed by lexing the source code text into a series of tokens.
// The buffer provides lightweight handles to tokens and other lexed entities,
// as well as iterations to walk the sequence of tokens found in the buffer.
//
// Lexing errors result in a potentially incomplete sequence of tokens and
// `has_errors()` returning true.
class TokenizedBuffer : public Printable<TokenizedBuffer> {
 public:
  // A comment, which can be a block of lines.
  //
  // This is the API version of `CommentData`.
  struct CommentInfo {
    // The comment's full text, including `//` symbols. This may have several
    // lines for block comments.
    llvm::StringRef text;

    // The comment's indent.
    int32_t indent;

    // The first line of the comment.
    LineIndex start_line;
  };

  // Returns the kind of the given token.
  auto GetKind(TokenIndex token) const -> TokenKind;

  // Returns the line on which the given token begins.
  auto GetLine(TokenIndex token) const -> LineIndex;

  // Returns the 1-based line number.
  auto GetLineNumber(TokenIndex token) const -> int;

  // Returns the 1-based column number.
  auto GetColumnNumber(TokenIndex token) const -> int;

  // Returns the line and 1-based column number of the first character after
  // this token.
  auto GetEndLoc(TokenIndex token) const -> std::pair<LineIndex, int>;

  // Returns the source text lexed into this token.
  auto GetTokenText(TokenIndex token) const -> llvm::StringRef;

  // Returns the identifier associated with this token. The token kind must be
  // an `Identifier`.
  auto GetIdentifier(TokenIndex token) const -> IdentifierId;

  // Returns the value of an `IntLiteral` token.
  auto GetIntLiteral(TokenIndex token) const -> IntId;

  // Returns the value of a `RealLiteral` token.
  auto GetRealLiteral(TokenIndex token) const -> RealId;

  // Returns the value of a `StringLiteral` token.
  auto GetStringLiteralValue(TokenIndex token) const -> StringLiteralValueId;

  // Returns the value of a `CharLiteral` token.
  auto GetCharLiteralValue(TokenIndex token) const -> CharLiteralValue;

  // Returns the size specified in a `*TypeLiteral` token.
  auto GetTypeLiteralSize(TokenIndex token) const -> IntId;

  // Returns the closing token matched with the given opening token.
  //
  // The given token must be an opening token kind.
  auto GetMatchedClosingToken(TokenIndex opening_token) const -> TokenIndex;

  // Returns the opening token matched with the given closing token.
  //
  // The given token must be a closing token kind.
  auto GetMatchedOpeningToken(TokenIndex closing_token) const -> TokenIndex;
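
  // As an illustration, matched brackets let a consumer treat a bracketed
  // group as a single unit. A minimal sketch (`GroupRange` is a hypothetical
  // helper, not part of this API; it assumes `open` is an opening token):
  //
  //   auto GroupRange(const TokenizedBuffer& buffer, TokenIndex open)
  //       -> InclusiveTokenRange {
  //     return {.begin = open, .end = buffer.GetMatchedClosingToken(open)};
  //   }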

  // Returns whether the given token has leading whitespace.
  auto HasLeadingWhitespace(TokenIndex token) const -> bool;

  // Returns whether the given token has trailing whitespace.
  auto HasTrailingWhitespace(TokenIndex token) const -> bool;

  // Returns whether the token was created as part of an error recovery
  // effort. For example, a closing paren inserted to match an unmatched
  // paren.
  auto IsRecoveryToken(TokenIndex token) const -> bool;

  // Returns the 1-based indentation column number.
  auto GetIndentColumnNumber(LineIndex line) const -> int;

  // Returns the zero-based byte offset of the token within the source buffer.
  auto GetByteOffset(TokenIndex token) const -> int32_t {
    return token_infos_.Get(token).byte_offset();
  }

  // Returns true if the token comes after the comment.
  auto IsAfterComment(TokenIndex token, CommentIndex comment_index) const
      -> bool;

  // Returns the comment's full text range.
  auto GetCommentText(CommentIndex comment_index) const -> llvm::StringRef;

  // Prints the tokens as YAML. This prints the tracked token information on a
  // single line for each token. The single-line format keeps the output
  // compact and works well with tools like `grep`.
  //
  // An example token looks like:
  //
  // - { index: 1, kind: 'Semi', line: 1, column: 1, indent: 1, spelling: ';' }
  auto Print(llvm::raw_ostream& out, bool omit_file_boundary_tokens = false)
      const -> void;

  // Prints a description of a single token. See `Print` for details on the
  // format.
  auto PrintToken(llvm::raw_ostream& output_stream, TokenIndex token) const
      -> void;

  // Collects memory usage of members.
  auto CollectMemUsage(MemUsage& mem_usage, llvm::StringRef label) const
      -> void;

  // Converts a token to a diagnostic location.
  auto TokenToDiagnosticLoc(TokenIndex token) const
      -> Diagnostics::ConvertedLoc;

  // Returns true if the given range overlaps with an entry in
  // `dump_sem_ir_ranges_`. Must not be called when there are no ranges; query
  // `has_dump_sem_ir_ranges` first.
  auto OverlapsWithDumpSemIRRange(Lex::InclusiveTokenRange range) const
      -> bool;

  // Returns true if the buffer has errors that were detected at lexing time.
  auto has_errors() const -> bool { return has_errors_; }

  auto tokens() const -> llvm::iterator_range<TokenIterator> {
    return llvm::make_range(TokenIterator(TokenIndex(0)),
                            TokenIterator(TokenIndex(token_infos_.size())));
  }
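
  // A minimal iteration sketch, assuming a fully lexed `buffer` and an
  // `llvm::raw_ostream` named `out` (neither is part of this API):
  //
  //   for (TokenIndex token : buffer.tokens()) {
  //     out << buffer.GetKind(token) << ": " << buffer.GetTokenText(token)
  //         << "\n";
  //   }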

  auto size() const -> int { return token_infos_.size(); }

  auto comments() const -> llvm::iterator_range<CommentIterator> {
    return llvm::make_range(CommentIterator(CommentIndex(0)),
                            CommentIterator(CommentIndex(comments_.size())));
  }

  auto comments_size() const -> size_t { return comments_.size(); }

  auto has_include_in_dumps() const -> bool { return has_include_in_dumps_; }

  // Returns true if any dump-sem-ir ranges were provided.
  auto has_dump_sem_ir_ranges() const -> bool {
    return !dump_sem_ir_ranges_.empty();
  }

  // This is an upper bound on the number of output parse nodes in the absence
  // of errors.
  auto expected_max_parse_tree_size() const -> int {
    return expected_max_parse_tree_size_;
  }

  auto source() const -> const SourceBuffer& { return *source_; }

 private:
  friend class Lexer;

  class SourcePointerDiagnosticEmitter
      : public Diagnostics::Emitter<const char*> {
   public:
    explicit SourcePointerDiagnosticEmitter(Diagnostics::Consumer* consumer,
                                            const TokenizedBuffer* tokens)
        : Emitter(consumer), tokens_(tokens) {}

   protected:
    auto ConvertLoc(const char* loc, ContextFnT /*context_fn*/) const
        -> Diagnostics::ConvertedLoc override {
      return tokens_->SourcePointerToDiagnosticLoc(loc);
    }

   private:
    const TokenizedBuffer* tokens_;
  };

  class TokenDiagnosticEmitter : public Diagnostics::Emitter<TokenIndex> {
   public:
    explicit TokenDiagnosticEmitter(Diagnostics::Consumer* consumer,
                                    const TokenizedBuffer* tokens)
        : Emitter(consumer), tokens_(tokens) {}

   protected:
    auto ConvertLoc(TokenIndex token, ContextFnT /*context_fn*/) const
        -> Diagnostics::ConvertedLoc override {
      return tokens_->TokenToDiagnosticLoc(token);
    }

   private:
    const TokenizedBuffer* tokens_;
  };

  // Converts a pointer into the source to a diagnostic location.
  auto SourcePointerToDiagnosticLoc(const char* loc) const
      -> Diagnostics::ConvertedLoc;

  // Specifies minimum widths to use when printing a token's fields via
  // `PrintToken`.
  struct PrintWidths {
    // Widens `this` to the maximum of `this` and `widths` for each dimension.
    auto Widen(const PrintWidths& widths) -> void;

    int index;
    int kind;
    int line;
    int column;
    int indent;
  };

  // The constructor is merely responsible for trivial initialization of
  // members. A working object of this type is built with `Lex::Lex` so that
  // its return can indicate if an error was encountered while lexing.
  explicit TokenizedBuffer(SharedValueStores& value_stores
                               [[clang::lifetimebound]],
                           SourceBuffer& source [[clang::lifetimebound]])
      : value_stores_(&value_stores), source_(&source) {}

  // Returns the index of the line containing the given byte offset.
  auto FindLineIndex(int32_t byte_offset) const -> LineIndex;

  // Adds the token and adjusts the expected tree size.
  auto AddToken(TokenInfo info) -> TokenIndex;

  auto GetTokenPrintWidths(TokenIndex token) const -> PrintWidths;

  auto PrintToken(llvm::raw_ostream& output_stream, TokenIndex token,
                  PrintWidths widths) const -> void;

  // Adds a comment. This uses the indent to potentially stitch together two
  // adjacent comments.
  auto AddComment(int32_t indent, int32_t start, int32_t end) -> void;

  // Used to allocate computed string literals.
  llvm::BumpPtrAllocator allocator_;

  SharedValueStores* value_stores_;
  SourceBuffer* source_;

  ValueStore<TokenIndex, TokenInfo> token_infos_;
  ValueStore<LineIndex, LineInfo> line_infos_;

  // Comments in the file.
  ValueStore<CommentIndex, CommentData> comments_;

  // Whether SemIR dumping is explicitly enabled for this file. This is marked
  // by `//@include-in-dumps`, and overrides other file-inclusion selection
  // choices. It can be combined with ranges.
  bool has_include_in_dumps_ = false;

  // Ranges of tokens marked by `//@dump-sem-ir-[begin|end]`.
  //
  // The particular syntax was chosen because it can be lexed efficiently. It
  // only occurs in otherwise-invalid comment strings, so it shouldn't slow
  // down lexing of correct code. It's also comment-like because its presence
  // won't affect parse/check.
  llvm::SmallVector<InclusiveTokenRange> dump_sem_ir_ranges_;

  // An upper bound on the number of parse tree nodes that we expect to be
  // created for the tokens in this buffer.
  int expected_max_parse_tree_size_ = 0;

  bool has_errors_ = false;

  // A vector of flags for recovery tokens. If empty, there are none. When
  // doing token recovery, this will be extended to be indexable by token
  // indices and contain true for the tokens that were synthesized for
  // recovery.
  llvm::BitVector recovery_tokens_;
};
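
// A minimal end-to-end sketch. As the constructor comment notes, working
// buffers are built with `Lex::Lex` (declared in lex.h); the exact signature
// of `Lex::Lex` and the setup of `source` and `consumer` below are
// assumptions:
//
//   Carbon::SharedValueStores value_stores;
//   Carbon::Lex::TokenizedBuffer buffer =
//       Carbon::Lex::Lex(value_stores, source, consumer);
//   if (buffer.has_errors()) {
//     // The token sequence may be incomplete; diagnostics were sent to
//     // `consumer`.
//   }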

inline auto TokenizedBuffer::GetKind(TokenIndex token) const -> TokenKind {
  return token_infos_.Get(token).kind();
}

inline auto TokenizedBuffer::HasLeadingWhitespace(TokenIndex token) const
    -> bool {
  return token_infos_.Get(token).has_leading_space();
}

// A token has trailing whitespace exactly when the following token has
// leading whitespace, so this is derived from the next token's flag.
inline auto TokenizedBuffer::HasTrailingWhitespace(TokenIndex token) const
    -> bool {
  TokenIterator it(token);
  ++it;
  return it != tokens().end() && token_infos_.Get(*it).has_leading_space();
}

inline auto TokenizedBuffer::AddToken(TokenInfo info) -> TokenIndex {
  expected_max_parse_tree_size_ += info.kind().expected_max_parse_tree_size();
  return token_infos_.Add(info);
}

}  // namespace Carbon::Lex

#endif  // CARBON_TOOLCHAIN_LEX_TOKENIZED_BUFFER_H_