From 25c705cda6508b56ac4ab4ccd8c08a1a8f911941 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johann=20Kl=C3=A4hn?= <dev@jklaehn.de> Date: Mon, 10 Jul 2017 14:44:22 +0200 Subject: [PATCH 07/12] [libclang] Add option to keep whitespace when tokenizing --- bindings/python/clang/cindex.py | 31 ++++++++++++++++++++++------- bindings/python/tests/cindex/test_cursor.py | 9 +++++++++ include/clang-c/Index.h | 30 ++++++++++++++++++++++++++-- tools/libclang/CIndex.cpp | 13 ++++++++++-- tools/libclang/libclang.exports | 1 + 5 files changed, 73 insertions(+), 11 deletions(-) diff --git a/tools/clang/bindings/python/clang/cindex.py b/tools/clang/bindings/python/clang/cindex.py index 496e1089ad..a5bb58b44f 100644 --- a/tools/clang/bindings/python/clang/cindex.py +++ b/tools/clang/bindings/python/clang/cindex.py @@ -514,6 +514,13 @@ class TokenGroup(object): You should not instantiate this class outside of this module. """ + + # Default tokenization mode. + TOKENIZE_NONE = 0 + + # Used to indicate that tokens for whitespace should be returned. + TOKENIZE_KEEP_WHITESPACE = 1 + def __init__(self, tu, memory, count): self._tu = tu self._memory = memory @@ -523,7 +530,7 @@ class TokenGroup(object): conf.lib.clang_disposeTokens(self._tu, self._memory, self._count) @staticmethod - def get_tokens(tu, extent): + def get_tokens(tu, extent, options=0): """Helper method to return all tokens in an extent. This functionality is needed multiple places in this module. We define @@ -532,8 +539,8 @@ class TokenGroup(object): tokens_memory = POINTER(Token)() tokens_count = c_uint() - conf.lib.clang_tokenize(tu, extent, byref(tokens_memory), - byref(tokens_count)) + conf.lib.clang_tokenizeRange( + tu, extent, byref(tokens_memory), byref(tokens_count), options) count = int(tokens_count.value) @@ -1801,13 +1808,16 @@ class Cursor(Structure): for descendant in child.walk_preorder(): yield descendant - def get_tokens(self): + def get_tokens(self, options=0): """Obtain Token instances formulating that compose this Cursor. This is a generator for Token instances. It returns all tokens which occupy the extent this cursor occupies. + + options is a bitwise or of TokenGroup.TOKENIZE_XXX flags which will + control tokenization behavior. """ - return TokenGroup.get_tokens(self._tu, self.extent) + return TokenGroup.get_tokens(self._tu, self.extent, options) def get_field_offsetof(self): """Returns the offsetof the FIELD_DECL pointed by this Cursor.""" @@ -2971,18 +2981,21 @@ class TranslationUnit(ClangObject): return CodeCompletionResults(ptr) return None - def get_tokens(self, locations=None, extent=None): + def get_tokens(self, locations=None, extent=None, options=0): """Obtain tokens in this translation unit. This is a generator for Token instances. The caller specifies a range of source code to obtain tokens for. The range can be specified as a 2-tuple of SourceLocation or as a SourceRange. If both are defined, behavior is undefined. + + options is a bitwise or of TokenGroup.TOKENIZE_XXX flags which will + control tokenization behavior. """ if locations is not None: extent = SourceRange(start=locations[0], end=locations[1]) - return TokenGroup.get_tokens(self, extent) + return TokenGroup.get_tokens(self, extent, options) class File(ClangObject): """ @@ -3850,6 +3863,10 @@ functionList = [ ("clang_tokenize", [TranslationUnit, SourceRange, POINTER(POINTER(Token)), POINTER(c_uint)]), + ("clang_tokenizeRange", + [TranslationUnit, SourceRange, POINTER(POINTER(Token)), POINTER(c_uint), + c_uint]), + ("clang_visitChildren", [Cursor, callbacks['cursor_visit'], py_object], c_uint), diff --git a/tools/clang/bindings/python/tests/cindex/test_cursor.py b/tools/clang/bindings/python/tests/cindex/test_cursor.py index 3cd499ea11..2d50ec5901 100644 --- a/tools/clang/bindings/python/tests/cindex/test_cursor.py +++ b/tools/clang/bindings/python/tests/cindex/test_cursor.py @@ -3,6 +3,7 @@ import gc from clang.cindex import CursorKind from clang.cindex import TemplateArgumentKind +from clang.cindex import TokenGroup from clang.cindex import TranslationUnit from clang.cindex import TypeKind from .util import get_cursor @@ -436,6 +437,14 @@ def test_get_token_cursor(): r_cursor = t_cursor.referenced # should not raise an exception assert r_cursor.kind == CursorKind.CLASS_DECL +def test_get_tokens_with_whitespace(): + source = 'class C { void f(); }\nvoid C::f() { }' + tu = get_tu(source) + + tokens = list(tu.cursor.get_tokens(TokenGroup.TOKENIZE_KEEP_WHITESPACE)) + assert ''.join(t.spelling for t in tokens) == source + assert len(tokens) == 27 + def test_get_arguments(): tu = get_tu('void foo(int i, int j);') foo = get_cursor(tu, 'foo') diff --git a/tools/clang/include/clang-c/Index.h b/tools/clang/include/clang-c/Index.h index 402ca9a436..7fd17366ee 100644 --- a/tools/clang/include/clang-c/Index.h +++ b/tools/clang/include/clang-c/Index.h @@ -4616,6 +4616,28 @@ CINDEX_LINKAGE CXSourceLocation clang_getTokenLocation(CXTranslationUnit, */ CINDEX_LINKAGE CXSourceRange clang_getTokenExtent(CXTranslationUnit, CXToken); +typedef enum { + /** + * \brief Used to indicate that no special tokenization options are needed. + */ + CXTokenize_None = 0x0, + + /** + * \brief Used to indicate that tokens for whitespace should be returned. + */ + CXTokenize_KeepWhitespace = 0x1 +} CXTokenize_Flags; + +/** + * \brief Tokenize the source code described by the given range into raw + * lexical tokens. + * + * \see clang_tokenizeRange + * + */ +CINDEX_LINKAGE void clang_tokenize(CXTranslationUnit TU, CXSourceRange Range, + CXToken **Tokens, unsigned *NumTokens); + /** * \brief Tokenize the source code described by the given range into raw * lexical tokens. @@ -4632,9 +4654,13 @@ CINDEX_LINKAGE CXSourceRange clang_getTokenExtent(CXTranslationUnit, CXToken); * \param NumTokens will be set to the number of tokens in the \c *Tokens * array. * + * \param options A bitmask of options that affects tokenization. This should be + * a bitwise OR of the CXTokenize_XXX flags. + * */ -CINDEX_LINKAGE void clang_tokenize(CXTranslationUnit TU, CXSourceRange Range, - CXToken **Tokens, unsigned *NumTokens); +CINDEX_LINKAGE void clang_tokenizeRange(CXTranslationUnit TU, + CXSourceRange Range, CXToken **Tokens, + unsigned *NumTokens, unsigned options); /** * \brief Annotate the given set of tokens by providing cursors for each token diff --git a/tools/clang/tools/libclang/CIndex.cpp b/tools/clang/tools/libclang/CIndex.cpp index 621bc42076..04fd775fb0 100644 --- a/tools/clang/tools/libclang/CIndex.cpp +++ b/tools/clang/tools/libclang/CIndex.cpp @@ -6292,7 +6292,7 @@ CXSourceRange clang_getTokenExtent(CXTranslationUnit TU, CXToken CXTok) { } static void getTokens(ASTUnit *CXXUnit, SourceRange Range, - SmallVectorImpl<CXToken> &CXTokens) { + SmallVectorImpl<CXToken> &CXTokens, unsigned options) { SourceManager &SourceMgr = CXXUnit->getSourceManager(); std::pair<FileID, unsigned> BeginLocInfo = SourceMgr.getDecomposedSpellingLoc(Range.getBegin()); @@ -6314,6 +6314,9 @@ static void getTokens(ASTUnit *CXXUnit, SourceRange Range, CXXUnit->getASTContext().getLangOpts(), Buffer.begin(), Buffer.data() + BeginLocInfo.second, Buffer.end()); Lex.SetCommentRetentionState(true); + if (options & CXTokenize_KeepWhitespace) { + Lex.SetKeepWhitespaceMode(true); + } // Lex tokens until we hit the end of the range. const char *EffectiveBufferEnd = Buffer.data() + EndLocInfo.second; @@ -6365,6 +6368,12 @@ static void getTokens(ASTUnit *CXXUnit, SourceRange Range, void clang_tokenize(CXTranslationUnit TU, CXSourceRange Range, CXToken **Tokens, unsigned *NumTokens) { + return clang_tokenizeRange(TU, Range, Tokens, NumTokens, CXTokenize_None); +} + +void clang_tokenizeRange(CXTranslationUnit TU, CXSourceRange Range, + CXToken **Tokens, unsigned *NumTokens, + unsigned options) { LOG_FUNC_SECTION { *Log << TU << ' ' << Range; } @@ -6390,7 +6399,7 @@ void clang_tokenize(CXTranslationUnit TU, CXSourceRange Range, return; SmallVector<CXToken, 32> CXTokens; - getTokens(CXXUnit, R, CXTokens); + getTokens(CXXUnit, R, CXTokens, options); if (CXTokens.empty()) return; diff --git a/tools/clang/tools/libclang/libclang.exports b/tools/clang/tools/libclang/libclang.exports index 9c56e88052..b8e3df23ef 100644 --- a/tools/clang/tools/libclang/libclang.exports +++ b/tools/clang/tools/libclang/libclang.exports @@ -316,6 +316,7 @@ clang_suspendTranslationUnit clang_sortCodeCompletionResults clang_toggleCrashRecovery clang_tokenize +clang_tokenizeRange clang_CompilationDatabase_fromDirectory clang_CompilationDatabase_dispose clang_CompilationDatabase_getCompileCommands -- 2.13.0