From 075a7a3e667fe3d923de6d7a6929e61922c8b139 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johann=20Kl=C3=A4hn?= <johann@jklaehn.de>
Date: Fri, 3 Nov 2017 21:19:51 +0100
Subject: [PATCH 3/5] [libclang] Add option to keep whitespace when tokenizing

Introduce a new `clang_tokenizeRange` function that accepts options to control
tokenization behavior.  `clang_tokenize` is kept for backwards compatibility
and forwards to the new function with `CXTokenize_None`.
---
 clang/bindings/python/clang/cindex.py         | 31 ++++++++++++++----
 .../python/tests/cindex/test_cursor.py        |  9 ++++++
 clang/include/clang-c/Index.h                 | 32 +++++++++++++++++--
 clang/tools/libclang/CIndex.cpp               | 15 +++++++--
 clang/tools/libclang/libclang.exports         |  1 +
 5 files changed, 75 insertions(+), 13 deletions(-)

diff --git a/tools/clang/bindings/python/clang/cindex.py b/clang/bindings/python/clang/cindex.py
index c309f7017b2..1589acc9e7e 100644
--- a/tools/clang/bindings/python/clang/cindex.py
+++ b/tools/clang/bindings/python/clang/cindex.py
@@ -529,6 +529,13 @@ class TokenGroup(object):
 
     You should not instantiate this class outside of this module.
     """
+
+    # Default tokenization mode.
+    TOKENIZE_NONE = 0
+
+    # Used to indicate that tokens for whitespace should be returned.
+    TOKENIZE_KEEP_WHITESPACE = 1
+
     def __init__(self, tu, memory, count):
         self._tu = tu
         self._memory = memory
@@ -538,7 +545,7 @@ class TokenGroup(object):
         conf.lib.clang_disposeTokens(self._tu, self._memory, self._count)
 
     @staticmethod
-    def get_tokens(tu, extent):
+    def get_tokens(tu, extent, options=0):
         """Helper method to return all tokens in an extent.
 
         This functionality is needed multiple places in this module. We define
@@ -547,8 +554,8 @@ class TokenGroup(object):
         tokens_memory = POINTER(Token)()
         tokens_count = c_uint()
 
-        conf.lib.clang_tokenize(tu, extent, byref(tokens_memory),
-                byref(tokens_count))
+        conf.lib.clang_tokenizeRange(
+            tu, extent, byref(tokens_memory), byref(tokens_count), options)
 
         count = int(tokens_count.value)
 
@@ -1852,13 +1859,16 @@ class Cursor(Structure):
             for descendant in child.walk_preorder():
                 yield descendant
 
-    def get_tokens(self):
+    def get_tokens(self, options=0):
         """Obtain Token instances formulating that compose this Cursor.
 
         This is a generator for Token instances. It returns all tokens which
         occupy the extent this cursor occupies.
+
+        options is a bitwise OR of TokenGroup.TOKENIZE_XXX flags that control
+        tokenization behavior; the default 0 is TokenGroup.TOKENIZE_NONE.
         """
-        return TokenGroup.get_tokens(self._tu, self.extent)
+        return TokenGroup.get_tokens(self._tu, self.extent, options)
 
     def get_field_offsetof(self):
         """Returns the offsetof the FIELD_DECL pointed by this Cursor."""
@@ -3080,18 +3090,21 @@ class TranslationUnit(ClangObject):
             return CodeCompletionResults(ptr)
         return None
 
-    def get_tokens(self, locations=None, extent=None):
+    def get_tokens(self, locations=None, extent=None, options=0):
         """Obtain tokens in this translation unit.
 
         This is a generator for Token instances. The caller specifies a range
         of source code to obtain tokens for. The range can be specified as a
         2-tuple of SourceLocation or as a SourceRange. If both are defined,
         behavior is undefined.
+
+        options is a bitwise OR of TokenGroup.TOKENIZE_XXX flags that control
+        tokenization behavior; the default 0 is TokenGroup.TOKENIZE_NONE.
         """
         if locations is not None:
             extent = SourceRange(start=locations[0], end=locations[1])
 
-        return TokenGroup.get_tokens(self, extent)
+        return TokenGroup.get_tokens(self, extent, options)
 
 class File(ClangObject):
     """
@@ -3969,6 +3982,10 @@ functionList = [
   ("clang_tokenize",
    [TranslationUnit, SourceRange, POINTER(POINTER(Token)), POINTER(c_uint)]),
 
+  ("clang_tokenizeRange",
+   [TranslationUnit, SourceRange, POINTER(POINTER(Token)), POINTER(c_uint),
+    c_uint]),
+
   ("clang_visitChildren",
    [Cursor, callbacks['cursor_visit'], py_object],
    c_uint),
diff --git a/tools/clang/bindings/python/tests/cindex/test_cursor.py b/clang/bindings/python/tests/cindex/test_cursor.py
index 6a53c7205df..0965c1f4ae1 100644
--- a/tools/clang/bindings/python/tests/cindex/test_cursor.py
+++ b/tools/clang/bindings/python/tests/cindex/test_cursor.py
@@ -10,6 +10,7 @@ import unittest
 from clang.cindex import AvailabilityKind
 from clang.cindex import CursorKind
 from clang.cindex import TemplateArgumentKind
+from clang.cindex import TokenGroup
 from clang.cindex import TranslationUnit
 from clang.cindex import TypeKind
 from .util import get_cursor
@@ -488,6 +489,14 @@ class TestCursor(unittest.TestCase):
         self.assertEqual(tokens[0].spelling, 'int')
         self.assertEqual(tokens[1].spelling, 'foo')
 
+    def test_get_tokens_with_whitespace(self):
+        """Ensure whitespace tokens are returned when requested."""
+        source = 'class C { void f(); }\nvoid C::f() { }'
+        tu = get_tu(source)
+        tokens = list(tu.cursor.get_tokens(TokenGroup.TOKENIZE_KEEP_WHITESPACE))
+        self.assertEqual(''.join(t.spelling for t in tokens), source)
+        self.assertEqual(len(tokens), 27, [t.spelling for t in tokens])
+
     def test_get_token_cursor(self):
         """Ensure we can map tokens to cursors."""
         tu = get_tu('class A {}; int foo(A var = A());', lang='cpp')
diff --git a/tools/clang/include/clang-c/Index.h b/clang/include/clang-c/Index.h
index b0c62fe948e..84ed03b8920 100644
--- a/tools/clang/include/clang-c/Index.h
+++ b/tools/clang/include/clang-c/Index.h
@@ -32,7 +32,7 @@
  * compatible, thus CINDEX_VERSION_MAJOR is expected to remain stable.
  */
 #define CINDEX_VERSION_MAJOR 0
-#define CINDEX_VERSION_MINOR 62
+#define CINDEX_VERSION_MINOR 63
 
 #define CINDEX_VERSION_ENCODE(major, minor) ( \
       ((major) * 10000)                       \
@@ -4969,6 +4969,28 @@ CINDEX_LINKAGE CXSourceLocation clang_getTokenLocation(CXTranslationUnit,
  */
 CINDEX_LINKAGE CXSourceRange clang_getTokenExtent(CXTranslationUnit, CXToken);
 
+typedef enum {
+  /**
+   * \brief Used to indicate that no special tokenization options are needed.
+   */
+  CXTokenize_None = 0x0,
+
+  /**
+   * \brief Used to indicate that tokens for whitespace should be returned.
+   */
+  CXTokenize_KeepWhitespace = 0x1
+} CXTokenize_Flags;
+
+/**
+ * \brief Tokenize the source code described by the given range into raw
+ * lexical tokens.
+ *
+ * Equivalent to \c clang_tokenizeRange with \c CXTokenize_None as options.
+ * \see clang_tokenizeRange
+ */
+CINDEX_LINKAGE void clang_tokenize(CXTranslationUnit TU, CXSourceRange Range,
+                                   CXToken **Tokens, unsigned *NumTokens);
+
 /**
  * Tokenize the source code described by the given range into raw
  * lexical tokens.
@@ -4985,9 +5007,13 @@ CINDEX_LINKAGE CXSourceRange clang_getTokenExtent(CXTranslationUnit, CXToken);
  * \param NumTokens will be set to the number of tokens in the \c *Tokens
  * array.
  *
+ * \param options A bitmask of options that affects tokenization. This should be
+ * a bitwise OR of the CXTokenize_XXX flags.
+ *
  */
-CINDEX_LINKAGE void clang_tokenize(CXTranslationUnit TU, CXSourceRange Range,
-                                   CXToken **Tokens, unsigned *NumTokens);
+CINDEX_LINKAGE void clang_tokenizeRange(CXTranslationUnit TU,
+                                        CXSourceRange Range, CXToken **Tokens,
+                                        unsigned *NumTokens, unsigned options);
 
 /**
  * Annotate the given set of tokens by providing cursors for each token
diff --git a/tools/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp
index 1dc961f58a2..3a283e76ed8 100644
--- a/tools/clang/tools/libclang/CIndex.cpp
+++ b/tools/clang/tools/libclang/CIndex.cpp
@@ -6670,7 +6670,7 @@ CXSourceRange clang_getTokenExtent(CXTranslationUnit TU, CXToken CXTok) {
 }
 
 static void getTokens(ASTUnit *CXXUnit, SourceRange Range,
-                      SmallVectorImpl<CXToken> &CXTokens) {
+                      SmallVectorImpl<CXToken> &CXTokens, unsigned options) {
   SourceManager &SourceMgr = CXXUnit->getSourceManager();
   std::pair<FileID, unsigned> BeginLocInfo
     = SourceMgr.getDecomposedSpellingLoc(Range.getBegin());
@@ -6692,6 +6692,9 @@ static void getTokens(ASTUnit *CXXUnit, SourceRange Range,
             CXXUnit->getASTContext().getLangOpts(),
             Buffer.begin(), Buffer.data() + BeginLocInfo.second, Buffer.end());
   Lex.SetCommentRetentionState(true);
+  if (options & CXTokenize_KeepWhitespace) {
+    Lex.SetKeepWhitespaceMode(true);
+  }
 
   // Lex tokens until we hit the end of the range.
   const char *EffectiveBufferEnd = Buffer.data() + EndLocInfo.second;
@@ -6765,7 +6768,7 @@ CXToken *clang_getToken(CXTranslationUnit TU, CXSourceLocation Location) {
   SourceLocation End = SM.getComposedLoc(DecomposedEnd.first, DecomposedEnd.second);
 
   SmallVector<CXToken, 32> CXTokens;
-  getTokens(CXXUnit, SourceRange(Begin, End), CXTokens);
+  getTokens(CXXUnit, SourceRange(Begin, End), CXTokens, CXTokenize_None);
 
   if (CXTokens.empty())
     return NULL;
@@ -6913,6 +6913,12 @@ CXToken *clang_getToken(CXTranslationUni
 
 void clang_tokenize(CXTranslationUnit TU, CXSourceRange Range, CXToken **Tokens,
                     unsigned *NumTokens) {
+  return clang_tokenizeRange(TU, Range, Tokens, NumTokens, CXTokenize_None);
+}
+
+void clang_tokenizeRange(CXTranslationUnit TU, CXSourceRange Range,
+                         CXToken **Tokens, unsigned *NumTokens,
+                         unsigned options) {
   LOG_FUNC_SECTION { *Log << TU << ' ' << Range; }
 
   if (Tokens)
@@ -6804,7 +6813,7 @@ void clang_tokenize(CXTranslationUnit TU, CXSourceRange Range,
     return;
 
   SmallVector<CXToken, 32> CXTokens;
-  getTokens(CXXUnit, R, CXTokens);
+  getTokens(CXXUnit, R, CXTokens, options);
 
   if (CXTokens.empty())
     return;
diff --git a/tools/clang/tools/libclang/libclang.exports b/clang/tools/libclang/libclang.exports
index 6e860e7263e..6af6c0ca3e8 100644
--- a/tools/clang/tools/libclang/libclang.exports
+++ b/tools/clang/tools/libclang/libclang.exports
@@ -338,6 +338,7 @@ clang_suspendTranslationUnit
 clang_sortCodeCompletionResults
 clang_toggleCrashRecovery
 clang_tokenize
+clang_tokenizeRange
 clang_CompilationDatabase_fromDirectory
 clang_CompilationDatabase_dispose
 clang_CompilationDatabase_getCompileCommands
-- 
2.23.0