Skip to content
Snippets Groups Projects
llvm5-0007-libclang-Add-option-to-keep-whitespace-when-tokenizi.patch 9.33 KiB
Newer Older
From 25c705cda6508b56ac4ab4ccd8c08a1a8f911941 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johann=20Kl=C3=A4hn?= <dev@jklaehn.de>
Date: Mon, 10 Jul 2017 14:44:22 +0200
Subject: [PATCH 07/12] [libclang] Add option to keep whitespace when
 tokenizing

---
 bindings/python/clang/cindex.py             | 31 ++++++++++++++++++++++-------
 bindings/python/tests/cindex/test_cursor.py |  9 +++++++++
 include/clang-c/Index.h                     | 30 ++++++++++++++++++++++++++--
 tools/libclang/CIndex.cpp                   | 13 ++++++++++--
 tools/libclang/libclang.exports             |  1 +
 5 files changed, 73 insertions(+), 11 deletions(-)

diff --git a/tools/clang/bindings/python/clang/cindex.py b/tools/clang/bindings/python/clang/cindex.py
index 496e1089ad..a5bb58b44f 100644
--- a/tools/clang/bindings/python/clang/cindex.py
+++ b/tools/clang/bindings/python/clang/cindex.py
@@ -514,6 +514,13 @@ class TokenGroup(object):
 
     You should not instantiate this class outside of this module.
     """
+
+    # Default tokenization mode.
+    TOKENIZE_NONE = 0
+
+    # Used to indicate that tokens for whitespace should be returned.
+    TOKENIZE_KEEP_WHITESPACE = 1
+
     def __init__(self, tu, memory, count):
         self._tu = tu
         self._memory = memory
@@ -523,7 +530,7 @@ class TokenGroup(object):
         conf.lib.clang_disposeTokens(self._tu, self._memory, self._count)
 
     @staticmethod
-    def get_tokens(tu, extent):
+    def get_tokens(tu, extent, options=0):
         """Helper method to return all tokens in an extent.
 
         This functionality is needed multiple places in this module. We define
@@ -532,8 +539,8 @@ class TokenGroup(object):
         tokens_memory = POINTER(Token)()
         tokens_count = c_uint()
 
-        conf.lib.clang_tokenize(tu, extent, byref(tokens_memory),
-                byref(tokens_count))
+        conf.lib.clang_tokenizeRange(
+            tu, extent, byref(tokens_memory), byref(tokens_count), options)
 
         count = int(tokens_count.value)
 
@@ -1801,13 +1808,16 @@ class Cursor(Structure):
             for descendant in child.walk_preorder():
                 yield descendant
 
-    def get_tokens(self):
+    def get_tokens(self, options=0):
         """Obtain Token instances formulating that compose this Cursor.
 
         This is a generator for Token instances. It returns all tokens which
         occupy the extent this cursor occupies.
+
+        options is a bitwise or of TokenGroup.TOKENIZE_XXX flags which will
+        control tokenization behavior.
         """
-        return TokenGroup.get_tokens(self._tu, self.extent)
+        return TokenGroup.get_tokens(self._tu, self.extent, options)
 
     def get_field_offsetof(self):
         """Returns the offsetof the FIELD_DECL pointed by this Cursor."""
@@ -2971,18 +2981,21 @@ class TranslationUnit(ClangObject):
             return CodeCompletionResults(ptr)
         return None
 
-    def get_tokens(self, locations=None, extent=None):
+    def get_tokens(self, locations=None, extent=None, options=0):
         """Obtain tokens in this translation unit.
 
         This is a generator for Token instances. The caller specifies a range
         of source code to obtain tokens for. The range can be specified as a
         2-tuple of SourceLocation or as a SourceRange. If both are defined,
         behavior is undefined.
+
+        options is a bitwise or of TokenGroup.TOKENIZE_XXX flags which will
+        control tokenization behavior.
         """
         if locations is not None:
             extent = SourceRange(start=locations[0], end=locations[1])
 
-        return TokenGroup.get_tokens(self, extent)
+        return TokenGroup.get_tokens(self, extent, options)
 
 class File(ClangObject):
     """
@@ -3850,6 +3863,10 @@ functionList = [
   ("clang_tokenize",
    [TranslationUnit, SourceRange, POINTER(POINTER(Token)), POINTER(c_uint)]),
 
+  ("clang_tokenizeRange",
+   [TranslationUnit, SourceRange, POINTER(POINTER(Token)), POINTER(c_uint),
+    c_uint]),
+
   ("clang_visitChildren",
    [Cursor, callbacks['cursor_visit'], py_object],
    c_uint),
diff --git a/tools/clang/bindings/python/tests/cindex/test_cursor.py b/tools/clang/bindings/python/tests/cindex/test_cursor.py
index 3cd499ea11..2d50ec5901 100644
--- a/tools/clang/bindings/python/tests/cindex/test_cursor.py
+++ b/tools/clang/bindings/python/tests/cindex/test_cursor.py
@@ -3,6 +3,7 @@ import gc
 
 from clang.cindex import CursorKind
 from clang.cindex import TemplateArgumentKind
+from clang.cindex import TokenGroup
 from clang.cindex import TranslationUnit
 from clang.cindex import TypeKind
 from .util import get_cursor
@@ -436,6 +437,14 @@ def test_get_token_cursor():
     r_cursor = t_cursor.referenced # should not raise an exception
     assert r_cursor.kind == CursorKind.CLASS_DECL
 
+def test_get_tokens_with_whitespace():
+    source = 'class C { void f(); }\nvoid C::f() { }'
+    tu = get_tu(source)
+
+    tokens = list(tu.cursor.get_tokens(TokenGroup.TOKENIZE_KEEP_WHITESPACE))
+    assert ''.join(t.spelling for t in tokens) == source
+    assert len(tokens) == 27
+
 def test_get_arguments():
     tu = get_tu('void foo(int i, int j);')
     foo = get_cursor(tu, 'foo')
diff --git a/tools/clang/include/clang-c/Index.h b/tools/clang/include/clang-c/Index.h
index 402ca9a436..7fd17366ee 100644
--- a/tools/clang/include/clang-c/Index.h
+++ b/tools/clang/include/clang-c/Index.h
@@ -4616,6 +4616,28 @@ CINDEX_LINKAGE CXSourceLocation clang_getTokenLocation(CXTranslationUnit,
  */
 CINDEX_LINKAGE CXSourceRange clang_getTokenExtent(CXTranslationUnit, CXToken);
 
+typedef enum {
+  /**
+   * \brief Used to indicate that no special tokenization options are needed.
+   */
+  CXTokenize_None = 0x0,
+
+  /**
+   * \brief Used to indicate that tokens for whitespace should be returned.
+   */
+  CXTokenize_KeepWhitespace = 0x1
+} CXTokenize_Flags;
+
+/**
+ * \brief Tokenize the source code described by the given range into raw
+ * lexical tokens.
+ *
+ * \see clang_tokenizeRange
+ *
+ */
+CINDEX_LINKAGE void clang_tokenize(CXTranslationUnit TU, CXSourceRange Range,
+                                   CXToken **Tokens, unsigned *NumTokens);
+
 /**
  * \brief Tokenize the source code described by the given range into raw
  * lexical tokens.
@@ -4632,9 +4654,13 @@ CINDEX_LINKAGE CXSourceRange clang_getTokenExtent(CXTranslationUnit, CXToken);
  * \param NumTokens will be set to the number of tokens in the \c *Tokens
  * array.
  *
+ * \param options A bitmask of options that affects tokenization. This should be
+ * a bitwise OR of the CXTokenize_XXX flags.
+ *
  */
-CINDEX_LINKAGE void clang_tokenize(CXTranslationUnit TU, CXSourceRange Range,
-                                   CXToken **Tokens, unsigned *NumTokens);
+CINDEX_LINKAGE void clang_tokenizeRange(CXTranslationUnit TU,
+                                        CXSourceRange Range, CXToken **Tokens,
+                                        unsigned *NumTokens, unsigned options);
 
 /**
  * \brief Annotate the given set of tokens by providing cursors for each token
diff --git a/tools/clang/tools/libclang/CIndex.cpp b/tools/clang/tools/libclang/CIndex.cpp
index 621bc42076..04fd775fb0 100644
--- a/tools/clang/tools/libclang/CIndex.cpp
+++ b/tools/clang/tools/libclang/CIndex.cpp
@@ -6292,7 +6292,7 @@ CXSourceRange clang_getTokenExtent(CXTranslationUnit TU, CXToken CXTok) {
 }
 
 static void getTokens(ASTUnit *CXXUnit, SourceRange Range,
-                      SmallVectorImpl<CXToken> &CXTokens) {
+                      SmallVectorImpl<CXToken> &CXTokens, unsigned options) {
   SourceManager &SourceMgr = CXXUnit->getSourceManager();
   std::pair<FileID, unsigned> BeginLocInfo
     = SourceMgr.getDecomposedSpellingLoc(Range.getBegin());
@@ -6314,6 +6314,9 @@ static void getTokens(ASTUnit *CXXUnit, SourceRange Range,
             CXXUnit->getASTContext().getLangOpts(),
             Buffer.begin(), Buffer.data() + BeginLocInfo.second, Buffer.end());
   Lex.SetCommentRetentionState(true);
+  if (options & CXTokenize_KeepWhitespace) {
+    Lex.SetKeepWhitespaceMode(true);
+  }
 
   // Lex tokens until we hit the end of the range.
   const char *EffectiveBufferEnd = Buffer.data() + EndLocInfo.second;
@@ -6365,6 +6368,12 @@ static void getTokens(ASTUnit *CXXUnit, SourceRange Range,
 
 void clang_tokenize(CXTranslationUnit TU, CXSourceRange Range,
                     CXToken **Tokens, unsigned *NumTokens) {
+  return clang_tokenizeRange(TU, Range, Tokens, NumTokens, CXTokenize_None);
+}
+
+void clang_tokenizeRange(CXTranslationUnit TU, CXSourceRange Range,
+                         CXToken **Tokens, unsigned *NumTokens,
+                         unsigned options) {
   LOG_FUNC_SECTION {
     *Log << TU << ' ' << Range;
   }
@@ -6390,7 +6399,7 @@ void clang_tokenize(CXTranslationUnit TU, CXSourceRange Range,
     return;
 
   SmallVector<CXToken, 32> CXTokens;
-  getTokens(CXXUnit, R, CXTokens);
+  getTokens(CXXUnit, R, CXTokens, options);
 
   if (CXTokens.empty())
     return;
diff --git a/tools/clang/tools/libclang/libclang.exports b/tools/clang/tools/libclang/libclang.exports
index 9c56e88052..b8e3df23ef 100644
--- a/tools/clang/tools/libclang/libclang.exports
+++ b/tools/clang/tools/libclang/libclang.exports
@@ -316,6 +316,7 @@ clang_suspendTranslationUnit
 clang_sortCodeCompletionResults
 clang_toggleCrashRecovery
 clang_tokenize
+clang_tokenizeRange
 clang_CompilationDatabase_fromDirectory
 clang_CompilationDatabase_dispose
 clang_CompilationDatabase_getCompileCommands
-- 
2.13.0