From ff1c06d835bb1957e84b9edf8a688ab1ff7fab65 Mon Sep 17 00:00:00 2001
From: Nora Abi Akar <nora.abiakar@gmail.com>
Date: Fri, 6 Nov 2020 13:38:22 +0100
Subject: [PATCH] Check trailing data in SWC  (#1218)

* Remove blank line from `ball_and_stick.swc`
* Mention that blank lines in SWC files indicate end of data.
* Check for trailing data after parsing in the python interface.
* Leave stream in good state if `parse_swc` encounters an empty line indicating end of data.
---
 arborio/swcio.cpp                |  2 +-
 doc/concepts/morphology.rst      |  9 +++--
 python/morphology.cpp            | 19 +++++++--
 test/unit/swc/ball_and_stick.swc |  1 -
 test/unit/test_swcio.cpp         | 69 ++++++++++++++++++++++++++++++++
 5 files changed, 91 insertions(+), 9 deletions(-)

diff --git a/arborio/swcio.cpp b/arborio/swcio.cpp
index 9bf47ea0..89c5ea31 100644
--- a/arborio/swcio.cpp
+++ b/arborio/swcio.cpp
@@ -183,7 +183,7 @@ swc_data parse_swc(std::istream& in) {
     }
 
     swc_record r;
-    while (in && in >> r) {
+    while (in && (in.peek() != '\n') && in >> r) {
         records.push_back(r);
     }
 
diff --git a/doc/concepts/morphology.rst b/doc/concepts/morphology.rst
index accf2568..ed4b1393 100644
--- a/doc/concepts/morphology.rst
+++ b/doc/concepts/morphology.rst
@@ -494,10 +494,11 @@ SWC
 
 Arbor supports reading morphologies described using the
 `SWC <http://www.neuronland.org/NLMorphologyConverter/MorphologyFormats/SWC/Spec.html>`_ file format.
-SWC files describe the morphology as a list of samples with an id, an `x,y,z` location is space, a radius, a tag
-and a parent id. Arbor parses these samples, performs some checks, then generates a
-:ref:`segment tree <morph-segment_tree>` describing the morphology according to one of three possible
-interpretations.
+SWC files may contain comments, which are stored as metadata. A blank line anywhere in the file is
+interpreted as end of data. The description of the morphology is encoded as a list of samples with an id,
+an `x,y,z` location in space, a radius, a tag and a parent id. Arbor parses these samples, performs some checks,
+then generates a :ref:`segment tree <morph-segment_tree>` describing the morphology according to one of three
+possible interpretations.
 
 The SWC file format specifications are not very detailed, which has lead different simulators to interpret
 SWC files in different ways, especially when it comes to the soma. Arbor has its own an interpretation that
diff --git a/python/morphology.cpp b/python/morphology.cpp
index dba6f594..c7de23ac 100644
--- a/python/morphology.cpp
+++ b/python/morphology.cpp
@@ -14,6 +14,12 @@
 
 namespace pyarb {
 
+void check_trailing(std::istream& in, std::string fname) {
+    if (!(in >> std::ws).eof()) {
+        throw pyarb_error(util::pprintf("Trailing data found at end of file '{}'", fname));
+    }
+}
+
 void register_morphology(pybind11::module& m) {
     using namespace pybind11::literals;
 
@@ -142,7 +148,9 @@ void register_morphology(pybind11::module& m) {
                 throw pyarb_error(util::pprintf("can't open file '{}'", fname));
             }
             try {
-                return arborio::load_swc_arbor(arborio::parse_swc(fid));
+                auto data = arborio::parse_swc(fid);
+                check_trailing(fid, fname);
+                return arborio::load_swc_arbor(data);
             }
             catch (arborio::swc_error& e) {
                 // Try to produce helpful error messages for SWC parsing errors.
@@ -166,7 +174,10 @@ void register_morphology(pybind11::module& m) {
                 throw pyarb_error(util::pprintf("can't open file '{}'", fname));
             }
             try {
-                return arborio::load_swc_allen(arborio::parse_swc(fid), no_gaps);
+                auto data = arborio::parse_swc(fid);
+                check_trailing(fid, fname);
+                return arborio::load_swc_allen(data, no_gaps);
+
             }
             catch (arborio::swc_error& e) {
                 // Try to produce helpful error messages for SWC parsing errors.
@@ -199,7 +210,9 @@ void register_morphology(pybind11::module& m) {
                 throw pyarb_error(util::pprintf("can't open file '{}'", fname));
             }
             try {
-                return arborio::load_swc_neuron(arborio::parse_swc(fid));
+                auto data = arborio::parse_swc(fid);
+                check_trailing(fid, fname);
+                return arborio::load_swc_neuron(data);
             }
             catch (arborio::swc_error& e) {
                 // Try to produce helpful error messages for SWC parsing errors.
diff --git a/test/unit/swc/ball_and_stick.swc b/test/unit/swc/ball_and_stick.swc
index 29839867..2faf5138 100644
--- a/test/unit/swc/ball_and_stick.swc
+++ b/test/unit/swc/ball_and_stick.swc
@@ -1,7 +1,6 @@
 # ball and stick model with
 #   - soma with radius 12.6157/2
 #   - dendrite with length 200 and radius 0.5
-
 1 1     0.0     0.0     0.0     6.30785 -1
 2 3     6.30785 0.0     0.0     0.5      1
 3 3   206.30785 0.0     0.0     0.5      2
diff --git a/test/unit/test_swcio.cpp b/test/unit/test_swcio.cpp
index d836bc0d..06c6397b 100644
--- a/test/unit/test_swcio.cpp
+++ b/test/unit/test_swcio.cpp
@@ -182,6 +182,75 @@ TEST(swc_parser, valid_parse) {
         swc_data data2 = parse_swc(valid4);
         EXPECT_EQ(data.records(), data2.records());
     }
+}
+
+TEST(swc_parser, stream_validity) {
+    {
+        std::string valid =
+                "# metadata\n"
+                "1 1 0.1 0.2 0.3 0.4 -1\n"
+                "2 1 0.1 0.2 0.3 0.4 1\n";
+
+        std::istringstream is(valid);
+
+        auto data = parse_swc(is);
+        ASSERT_EQ(2u, data.records().size());
+        EXPECT_TRUE(data.metadata() == "metadata\n");
+        EXPECT_TRUE(is.eof());
+    }
+    {
+        std::string valid =
+                "# metadata\n"
+                "\n"
+                "1 1 0.1 0.2 0.3 0.4 -1\n"
+                "2 1 0.1 0.2 0.3 0.4 1\n";
+
+        std::istringstream is(valid);
+
+        auto data = parse_swc(is);
+        ASSERT_EQ(0u, data.records().size());
+        EXPECT_TRUE(data.metadata() == "metadata\n");
+        EXPECT_TRUE(is.good());
+
+        is >> std::ws;
+        data = parse_swc(is);
+        ASSERT_EQ(2u, data.records().size());
+        EXPECT_TRUE(data.metadata().empty());
+        EXPECT_TRUE(is.eof());
+    }
+    {
+        std::string invalid =
+                "# metadata\n"
+                "1 1 0.1 0.2 0.3 \n"
+                "2 1 0.1 0.2 0.3 0.4 1\n";
+
+        std::istringstream is(invalid);
+
+        auto data = parse_swc(is);
+        ASSERT_EQ(0u, data.records().size());
+        EXPECT_TRUE(data.metadata() == "metadata\n");
+        EXPECT_FALSE(is.good());
+    }
+    {
+        std::string invalid =
+                "# metadata\n"
+                "1 1 0.1 0.2 0.3 \n"
+                "2 1 0.1 0.2 0.3 0.4 1\n";
+
+        std::istringstream is(invalid);
+
+        auto data = parse_swc(is);
+        EXPECT_TRUE(data.metadata() == "metadata\n");
+        ASSERT_EQ(0u, data.records().size());
+        EXPECT_TRUE(data.metadata() == "metadata\n");
+        EXPECT_FALSE(is.good());
+
+        is >> std::ws;
+        data = parse_swc(is);
+        ASSERT_EQ(0u, data.records().size());
+        EXPECT_TRUE(data.metadata().empty());
+        EXPECT_FALSE(is.good());
+    }
 
 }
 
-- 
GitLab