-
Eric Müller authored272da1a7
llvm11_1-0003-libclang-Add-option-to-keep-whitespace-when-tokenizi.patch 10.27 KiB
From 075a7a3e667fe3d923de6d7a6929e61922c8b139 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johann=20Kl=C3=A4hn?= <johann@jklaehn.de>
Date: Fri, 3 Nov 2017 21:19:51 +0100
Subject: [PATCH 3/5] [libclang] Add option to keep whitespace when tokenizing
Introduces new `clang_tokenizeRange` function which accepts options to control
tokenization behavior. `clang_tokenize` is kept for backwards compatibility.
---
clang/bindings/python/clang/cindex.py | 31 ++++++++++++++----
.../python/tests/cindex/test_cursor.py | 9 ++++++
clang/include/clang-c/Index.h | 32 +++++++++++++++++--
clang/tools/libclang/CIndex.cpp | 15 +++++++--
clang/tools/libclang/libclang.exports | 1 +
5 files changed, 75 insertions(+), 13 deletions(-)
diff --git a/tools/clang/bindings/python/clang/cindex.py b/clang/bindings/python/clang/cindex.py
index c309f7017b2..1589acc9e7e 100644
--- a/tools/clang/bindings/python/clang/cindex.py
+++ b/tools/clang/bindings/python/clang/cindex.py
@@ -529,6 +529,13 @@ class TokenGroup(object):
You should not instantiate this class outside of this module.
"""
+
+ # Default tokenization mode.
+ TOKENIZE_NONE = 0
+
+ # Used to indicate that tokens for whitespace should be returned.
+ TOKENIZE_KEEP_WHITESPACE = 1
+
def __init__(self, tu, memory, count):
self._tu = tu
self._memory = memory
@@ -538,7 +545,7 @@ class TokenGroup(object):
conf.lib.clang_disposeTokens(self._tu, self._memory, self._count)
@staticmethod
- def get_tokens(tu, extent):
+ def get_tokens(tu, extent, options=0):
"""Helper method to return all tokens in an extent.
This functionality is needed multiple places in this module. We define
@@ -547,8 +554,8 @@ class TokenGroup(object):
tokens_memory = POINTER(Token)()
tokens_count = c_uint()
- conf.lib.clang_tokenize(tu, extent, byref(tokens_memory),
- byref(tokens_count))
+ conf.lib.clang_tokenizeRange(
+ tu, extent, byref(tokens_memory), byref(tokens_count), options)
count = int(tokens_count.value)
@@ -1852,13 +1859,16 @@ class Cursor(Structure):
for descendant in child.walk_preorder():
yield descendant
- def get_tokens(self):
+ def get_tokens(self, options=0):
"""Obtain Token instances formulating that compose this Cursor.
This is a generator for Token instances. It returns all tokens which
occupy the extent this cursor occupies.
+
+ options is a bitwise or of TokenGroup.TOKENIZE_XXX flags which will
+ control tokenization behavior.
"""
- return TokenGroup.get_tokens(self._tu, self.extent)
+ return TokenGroup.get_tokens(self._tu, self.extent, options)
def get_field_offsetof(self):
"""Returns the offsetof the FIELD_DECL pointed by this Cursor."""
@@ -3080,18 +3090,21 @@ class TranslationUnit(ClangObject):
return CodeCompletionResults(ptr)
return None
- def get_tokens(self, locations=None, extent=None):
+ def get_tokens(self, locations=None, extent=None, options=0):
"""Obtain tokens in this translation unit.
This is a generator for Token instances. The caller specifies a range
of source code to obtain tokens for. The range can be specified as a
2-tuple of SourceLocation or as a SourceRange. If both are defined,
behavior is undefined.
+
+ options is a bitwise or of TokenGroup.TOKENIZE_XXX flags which will
+ control tokenization behavior.
"""
if locations is not None:
extent = SourceRange(start=locations[0], end=locations[1])
- return TokenGroup.get_tokens(self, extent)
+ return TokenGroup.get_tokens(self, extent, options)
class File(ClangObject):
"""
@@ -3969,6 +3982,10 @@ functionList = [
("clang_tokenize",
[TranslationUnit, SourceRange, POINTER(POINTER(Token)), POINTER(c_uint)]),
+ ("clang_tokenizeRange",
+ [TranslationUnit, SourceRange, POINTER(POINTER(Token)), POINTER(c_uint),
+ c_uint]),
+
("clang_visitChildren",
[Cursor, callbacks['cursor_visit'], py_object],
c_uint),
diff --git a/tools/clang/bindings/python/tests/cindex/test_cursor.py b/clang/bindings/python/tests/cindex/test_cursor.py
index 6a53c7205df..0965c1f4ae1 100644
--- a/tools/clang/bindings/python/tests/cindex/test_cursor.py
+++ b/tools/clang/bindings/python/tests/cindex/test_cursor.py
@@ -10,6 +10,7 @@ import unittest
from clang.cindex import AvailabilityKind
from clang.cindex import CursorKind
from clang.cindex import TemplateArgumentKind
+from clang.cindex import TokenGroup
from clang.cindex import TranslationUnit
from clang.cindex import TypeKind
from .util import get_cursor
@@ -488,6 +489,14 @@ class TestCursor(unittest.TestCase):
self.assertEqual(tokens[0].spelling, 'int')
self.assertEqual(tokens[1].spelling, 'foo')
+ def test_get_tokens_with_whitespace(self):
+ source = 'class C { void f(); }\nvoid C::f() { }'
+ tu = get_tu(source)
+ # With KEEP_WHITESPACE the token spellings must reproduce the source.
+ tokens = list(tu.cursor.get_tokens(TokenGroup.TOKENIZE_KEEP_WHITESPACE))
+ self.assertEqual(''.join(t.spelling for t in tokens), source)
+ self.assertEqual(len(tokens), 27, [t.spelling for t in tokens])
+
def test_get_token_cursor(self):
"""Ensure we can map tokens to cursors."""
tu = get_tu('class A {}; int foo(A var = A());', lang='cpp')
diff --git a/tools/clang/include/clang-c/Index.h b/clang/include/clang-c/Index.h
index b0c62fe948e..84ed03b8920 100644
--- a/tools/clang/include/clang-c/Index.h
+++ b/tools/clang/include/clang-c/Index.h
@@ -32,7 +32,7 @@
* compatible, thus CINDEX_VERSION_MAJOR is expected to remain stable.
*/
#define CINDEX_VERSION_MAJOR 0
-#define CINDEX_VERSION_MINOR 62
+#define CINDEX_VERSION_MINOR 63
#define CINDEX_VERSION_ENCODE(major, minor) ( \
((major) * 10000) \
@@ -4969,6 +4969,28 @@ CINDEX_LINKAGE CXSourceLocation clang_getTokenLocation(CXTranslationUnit,
*/
CINDEX_LINKAGE CXSourceRange clang_getTokenExtent(CXTranslationUnit, CXToken);
+typedef enum {
+ /**
+ * \brief Used to indicate that no special tokenization options are needed.
+ */
+ CXTokenize_None = 0x0,
+
+ /**
+ * \brief Used to indicate that tokens for whitespace should be returned.
+ */
+ CXTokenize_KeepWhitespace = 0x1
+} CXTokenize_Flags;
+
+/**
+ * \brief Tokenize the source code described by the given range into raw
+ * lexical tokens.
+ *
+ * Equivalent to clang_tokenizeRange with \c CXTokenize_None; kept for
+ * backwards compatibility. \see clang_tokenizeRange
+ */
+CINDEX_LINKAGE void clang_tokenize(CXTranslationUnit TU, CXSourceRange Range,
+ CXToken **Tokens, unsigned *NumTokens);
+
/**
* Tokenize the source code described by the given range into raw
* lexical tokens.
@@ -4985,9 +5007,13 @@ CINDEX_LINKAGE CXSourceRange clang_getTokenExtent(CXTranslationUnit, CXToken);
* \param NumTokens will be set to the number of tokens in the \c *Tokens
* array.
*
+ * \param options A bitmask of options that affects tokenization. This should be
+ * a bitwise OR of the CXTokenize_XXX flags.
+ *
*/
-CINDEX_LINKAGE void clang_tokenize(CXTranslationUnit TU, CXSourceRange Range,
- CXToken **Tokens, unsigned *NumTokens);
+CINDEX_LINKAGE void clang_tokenizeRange(CXTranslationUnit TU,
+ CXSourceRange Range, CXToken **Tokens,
+ unsigned *NumTokens, unsigned options);
/**
* Annotate the given set of tokens by providing cursors for each token
diff --git a/tools/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp
index 1dc961f58a2..3a283e76ed8 100644
--- a/tools/clang/tools/libclang/CIndex.cpp
+++ b/tools/clang/tools/libclang/CIndex.cpp
@@ -6670,7 +6670,7 @@ CXSourceRange clang_getTokenExtent(CXTranslationUnit TU, CXToken CXTok) {
}
static void getTokens(ASTUnit *CXXUnit, SourceRange Range,
- SmallVectorImpl<CXToken> &CXTokens) {
+ SmallVectorImpl<CXToken> &CXTokens, unsigned options) {
SourceManager &SourceMgr = CXXUnit->getSourceManager();
std::pair<FileID, unsigned> BeginLocInfo
= SourceMgr.getDecomposedSpellingLoc(Range.getBegin());
@@ -6692,6 +6692,9 @@ static void getTokens(ASTUnit *CXXUnit, SourceRange Range,
CXXUnit->getASTContext().getLangOpts(),
Buffer.begin(), Buffer.data() + BeginLocInfo.second, Buffer.end());
Lex.SetCommentRetentionState(true);
+ if (options & CXTokenize_KeepWhitespace) {
+ Lex.SetKeepWhitespaceMode(true);
+ }
// Lex tokens until we hit the end of the range.
const char *EffectiveBufferEnd = Buffer.data() + EndLocInfo.second;
@@ -6765,7 +6768,7 @@ CXToken *clang_getToken(CXTranslationUnit TU, CXSourceLocation Location) {
SourceLocation End = SM.getComposedLoc(DecomposedEnd.first, DecomposedEnd.second);
SmallVector<CXToken, 32> CXTokens;
- getTokens(CXXUnit, SourceRange(Begin, End), CXTokens);
+ getTokens(CXXUnit, SourceRange(Begin, End), CXTokens, CXTokenize_None);
if (CXTokens.empty())
return NULL;
@@ -6913,6 +6913,12 @@ CXToken *clang_getToken(CXTranslationUni
void clang_tokenize(CXTranslationUnit TU, CXSourceRange Range, CXToken **Tokens,
unsigned *NumTokens) {
+ return clang_tokenizeRange(TU, Range, Tokens, NumTokens, CXTokenize_None);
+}
+
+void clang_tokenizeRange(CXTranslationUnit TU, CXSourceRange Range,
+ CXToken **Tokens, unsigned *NumTokens,
+ unsigned options) {
LOG_FUNC_SECTION { *Log << TU << ' ' << Range; }
if (Tokens)
@@ -6804,7 +6813,7 @@ void clang_tokenize(CXTranslationUnit TU, CXSourceRange Range,
return;
SmallVector<CXToken, 32> CXTokens;
- getTokens(CXXUnit, R, CXTokens);
+ getTokens(CXXUnit, R, CXTokens, options);
if (CXTokens.empty())
return;
diff --git a/tools/clang/tools/libclang/libclang.exports b/clang/tools/libclang/libclang.exports
index 6e860e7263e..6af6c0ca3e8 100644
--- a/tools/clang/tools/libclang/libclang.exports
+++ b/tools/clang/tools/libclang/libclang.exports
@@ -338,6 +338,7 @@ clang_suspendTranslationUnit
clang_sortCodeCompletionResults
clang_toggleCrashRecovery
clang_tokenize
+clang_tokenizeRange
clang_CompilationDatabase_fromDirectory
clang_CompilationDatabase_dispose
clang_CompilationDatabase_getCompileCommands
--
2.23.0