diff --git a/build.gradle b/build.gradle
index fc8e3f2..8978717 100644
--- a/build.gradle
+++ b/build.gradle
@@ -123,6 +123,9 @@ dependencies {
     implementation "org.apache.lucene:lucene-analyzers-common:4.5.1"
     implementation group: 'org.jruby', name: 'jruby-complete', version: '9.2.13.0'
 
+    testImplementation "org.xmlunit:xmlunit-matchers:2.10.0"
+    testImplementation "org.xmlunit:xmlunit-legacy:2.10.0"
+
     implementation 'org.slf4j:slf4j-api:1.7.30'
     implementation 'ch.qos.logback:logback-classic:1.2.3'
     implementation "com.rockymadden.stringmetric:stringmetric-core_2.10:0.27.3"
diff --git a/src/main/java/org/pub2tei/document/XMLUtilities.java b/src/main/java/org/pub2tei/document/XMLUtilities.java
index 9b431a5..253c9ec 100644
--- a/src/main/java/org/pub2tei/document/XMLUtilities.java
+++ b/src/main/java/org/pub2tei/document/XMLUtilities.java
@@ -1,26 +1,35 @@
 package org.pub2tei.document;
 
-import java.io.*;
-import java.util.*;
-import javax.xml.parsers.*;
-import javax.xml.transform.*;
-import javax.xml.transform.dom.DOMSource;
-import javax.xml.transform.stream.StreamResult;
-import javax.xml.namespace.NamespaceContext;
-import javax.xml.xpath.*;
-
 import net.sf.saxon.om.NameChecker;
-
-import org.w3c.dom.*;
-import org.xml.sax.*;
-
+import org.apache.commons.collections.CollectionUtils;
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.lang3.StringUtils;
 import org.grobid.core.utilities.OffsetPosition;
 import org.grobid.core.utilities.SentenceUtilities;
-
-import org.apache.commons.io.FileUtils;
-
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+import org.w3c.dom.*;
+import org.xml.sax.ErrorHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXParseException;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerException;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+import javax.xml.xpath.XPath;
+import javax.xml.xpath.XPathConstants;
+import javax.xml.xpath.XPathExpression;
+import javax.xml.xpath.XPathFactory;
+import java.io.*;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.regex.Pattern;
 
 /**
  *  Some convenient methods for suffering a bit less with XML
@@ -222,11 +231,20 @@ public static void segment(org.w3c.dom.Document doc, Node node) {
                  (textualElements.contains(n.getNodeName())) ) {
 
                 // text content
-                StringBuffer textBuffer = new StringBuffer();
+                StringBuilder textBuffer = new StringBuilder();
                 NodeList childNodes = n.getChildNodes();
                 for(int y=0; y<childNodes.getLength(); y++) {
-                    textBuffer.append(serialize(doc, childNodes.item(y)));
-                    textBuffer.append(" ");
+                    Node item = childNodes.item(y);
+                    String serializedString = serialize(doc, item);
+                    if (y > 0 && StringUtils.isNotEmpty(serializedString)) {
+                        String firstChar = "" + serializedString.charAt(0);
+                        //We might need to use TextUtilities.fullPunctuation
+                        if (!Pattern.matches("\\p{Punct}", firstChar)) {
+                            textBuffer.append(" ");
+                        }
+                    }
+
+                    textBuffer.append(serializedString);
                 }
                 String text = textBuffer.toString();
                 List<OffsetPosition> theSentenceBoundaries = SentenceUtilities.getInstance().runSentenceDetection(text);
@@ -238,8 +256,8 @@ public static void segment(org.w3c.dom.Document doc, Node node) {
                     //System.out.println("new chunk: " + sent);
                     String sent = text.substring(sentPos.start, sentPos.end);
                     String newSent = sent;
-                    if (toConcatenate.size() != 0) {
-                        StringBuffer conc = new StringBuffer();
+                    if (CollectionUtils.isNotEmpty(toConcatenate)) {
+                        StringBuilder conc = new StringBuilder();
                         for(String concat : toConcatenate) {
                             conc.append(concat);
                             conc.append(" ");
diff --git a/src/test/java/org/pub2tei/document/XMLUtilitiesIntegrationTest.java b/src/test/java/org/pub2tei/document/XMLUtilitiesIntegrationTest.java
new file mode 100644
index 0000000..91645fd
--- /dev/null
+++ b/src/test/java/org/pub2tei/document/XMLUtilitiesIntegrationTest.java
@@ -0,0 +1,129 @@
+package org.pub2tei.document;
+
+import org.grobid.core.main.GrobidHomeFinder;
+import org.grobid.core.utilities.GrobidProperties;
+import org.junit.Before;
+import org.junit.Test;
+import org.xml.sax.InputSource;
+import org.xmlunit.matchers.CompareMatcher;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import java.io.InputStream;
+import java.io.StringReader;
+import java.util.Arrays;
+
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.junit.Assert.assertNotNull;
+
+/**
+ * Integration tests for {@link XMLUtilities#segment(org.w3c.dom.Document, org.w3c.dom.Node)}.
+ * <p>
+ * Each test parses a TEI fragment or document, runs the segmentation on its root element and
+ * compares the serialized result with an expected XML where sentences are wrapped in
+ * {@code <s>} elements.
+ */
+public class XMLUtilitiesIntegrationTest {
+
+    /**
+     * Loads the Grobid properties from a grobid-home found under one of the listed locations.
+     */
+    @Before
+    public void setUp() throws Exception {
+        //This test requires to have grobid deployed somewhere under these directories
+        // java.util.Arrays replaces the JDK-internal com.sun.tools.javac.util.List, which is
+        // not accessible to application code on JDK 9 and later.
+        GrobidHomeFinder finder = new GrobidHomeFinder(Arrays.asList("../grobid-home", "../../grobid/grobid-home"));
+        GrobidProperties.getInstance(finder);
+    }
+
+    /**
+     * Segments an acknowledgement fragment whose paragraph contains {@code <ref>} children and
+     * expects three {@code <s>} elements, with the references left inside the first sentence.
+     */
+    @Test
+    public void testSegment_chunk_shouldInjectSegmentCorrectly() throws Exception {
+        String input = "<div type=\"acknowledgement\">" +
+                "<div xmlns=\"http://www.tei-c.org/ns/1.0\">" +
+                "<head>Acknowledgements</head>" +
+                "<p>Our warmest thanks to Patrice Lopez, the author of Grobid <ref type=\"bibr\" target=\"#b21\">[22]</ref>, DeLFT <ref type=\"bibr\" target=\"#b19\">[20]</ref>, and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions. We thank Pedro Baptista de Castro for his support during this work. Special thanks to Erina Fujita for useful tips on the manuscript.</p>" +
+                "</div>" +
+                "</div>";
+
+        String expected = "<div type=\"acknowledgement\">\n" +
+                "\t<div xmlns=\"http://www.tei-c.org/ns/1.0\">\n" +
+                "\t\t<head>Acknowledgements</head>\n" +
+                "\t\t<p>\n" +
+                "\t\t\t<s>Our warmest thanks to Patrice Lopez, the author of Grobid <ref type=\"bibr\" target=\"#b21\">[22]</ref>, DeLFT <ref type=\"bibr\" target=\"#b19\">[20]</ref>, and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions.</s>\n" +
+                "\t\t\t<s>We thank Pedro Baptista de Castro for his support during this work.</s>\n" +
+                "\t\t\t<s>Special thanks to Erina Fujita for useful tips on the manuscript.</s>\n" +
+                "\t\t</p>\n" +
+                "\t</div>\n" +
+                "</div>";
+
+        org.w3c.dom.Document document = newDocumentBuilder().parse(new InputSource(new StringReader(input)));
+
+        XMLUtilities.segment(document, document.getDocumentElement());
+
+        // The expected text is written with tabs; they are turned into three spaces before comparing,
+        // presumably to match the indentation emitted by XMLUtilities.serialize (to be confirmed).
+        assertThat(XMLUtilities.serialize(document, document.getDocumentElement()), CompareMatcher.isIdenticalTo(expected.replace("\t","   ")));
+    }
+
+    /**
+     * Segments {@code document1.tei.xml} and compares it with {@code document1.segmented.tei.xml}.
+     */
+    @Test
+    public void testSegment_document1_shouldInjectSegmentCorrectly() throws Exception {
+        assertSegmentationMatches("document1.tei.xml", "document1.segmented.tei.xml");
+    }
+
+    /**
+     * Segments {@code document2.tei.xml} and compares it with {@code document2.segmented.tei.xml}.
+     */
+    @Test
+    public void testSegment_document2_shouldInjectSegmentCorrectly() throws Exception {
+        assertSegmentationMatches("document2.tei.xml", "document2.segmented.tei.xml");
+    }
+
+    /**
+     * Segments the input resource and asserts that its serialization is identical to the
+     * serialization of the expected, already segmented, resource.
+     *
+     * @param inputResource    name of the unsegmented TEI resource, resolved relative to this class
+     * @param expectedResource name of the segmented TEI resource, resolved relative to this class
+     */
+    private void assertSegmentationMatches(String inputResource, String expectedResource) throws Exception {
+        org.w3c.dom.Document document = parseResource(inputResource);
+        org.w3c.dom.Document documentSegmented = parseResource(expectedResource);
+
+        XMLUtilities.segment(document, document.getDocumentElement());
+        String documentResult = XMLUtilities.serialize(document, document.getDocumentElement());
+        // The expected string is serialized from the fixture's own root element: passing the root
+        // of the segmented input here would compare the result with itself and never fail.
+        // NOTE(review): the fixtures were never really compared before; regenerate them if needed.
+        String documentExpected = XMLUtilities.serialize(documentSegmented, documentSegmented.getDocumentElement());
+        assertThat(documentResult, CompareMatcher.isIdenticalTo(documentExpected));
+    }
+
+    /**
+     * Parses a classpath resource, resolved relative to this class, into a DOM document.
+     * Fails with an explicit message when the resource is missing, and always closes the stream.
+     */
+    private org.w3c.dom.Document parseResource(String resourceName) throws Exception {
+        try (InputStream stream = this.getClass().getResourceAsStream(resourceName)) {
+            assertNotNull("Missing test resource: " + resourceName, stream);
+            return newDocumentBuilder().parse(new InputSource(stream));
+        }
+    }
+
+    /**
+     * Creates a DOM builder that is not namespace aware, as used by every test in this class.
+     */
+    private static DocumentBuilder newDocumentBuilder() throws Exception {
+        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+        factory.setNamespaceAware(false);
+        return factory.newDocumentBuilder();
+    }
+
+}
\ No newline at end of file
diff --git a/src/test/resources/org/pub2tei/document/document1.segmented.tei.xml b/src/test/resources/org/pub2tei/document/document1.segmented.tei.xml
new file mode 100644
index 0000000..aade875
--- /dev/null
+++ b/src/test/resources/org/pub2tei/document/document1.segmented.tei.xml
@@ -0,0 +1,1413 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"     xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd"     xmlns:xlink="http://www.w3.org/1999/xlink">
+  <teiHeader xml:lang="en">
+    <fileDesc>
+      <titleStmt>
+        <title level="a" type="main">Satellite data reveals a recent increase in shifting cultivation and associated carbon emissions in Laos</title>
+        <funder ref="#_JUak8EW">
+          <orgName type="full">NASA</orgName>
+        </funder>
+        <funder ref="#_dR9zdmy">
+          <orgName                            type="full">Commercialisation of Rice Farming in the Lower Mekong Basin (Palgrave Macmillan</orgName>
+        </funder>
+        <funder ref="#_37TZA8z">
+          <orgName type="full">NASA Carbon Monitoring System</orgName>
+        </funder>
+        <funder ref="#_6cR6r98">
+          <orgName type="full">unknown</orgName>
+        </funder>
+      </titleStmt>
+      <publicationStmt>
+        <publisher/>
+        <availability status="unknown">
+          <licence/>
+        </availability>
+        <date type="published" when="2023-10-13">13 October 2023</date>
+      </publicationStmt>
+      <sourceDesc>
+        <biblStruct>
+          <analytic>
+            <author role="corresp">
+              <persName>
+                <forename type="first">Shijuan</forename>
+                <surname>Chen</surname>
+              </persName>
+              <email>shijuan.chen@yale.edu</email>
+              <affiliation key="aff0">
+                <orgName type="institution" key="instit1">Yale School of the Environment</orgName>
+                <orgName type="institution" key="instit2">Yale University</orgName>
+                <address>
+                  <settlement>New Haven</settlement>
+                  <region>CT</region>
+                  <country key="US">United States of America</country>
+                </address>
+              </affiliation>
+              <affiliation key="aff1">
+                <orgName type="department">Department of Earth and Environment</orgName>
+                <orgName type="institution">Boston University</orgName>
+                <address>
+                  <settlement>Boston</settlement>
+                  <region>MA</region>
+                  <country key="US">United States of America</country>
+                </address>
+              </affiliation>
+            </author>
+            <author>
+              <persName>
+                <forename type="first">Curtis</forename>
+                <forename type="middle">E</forename>
+                <surname>Woodcock</surname>
+              </persName>
+              <affiliation key="aff1">
+                <orgName type="department">Department of Earth and Environment</orgName>
+                <orgName type="institution">Boston University</orgName>
+                <address>
+                  <settlement>Boston</settlement>
+                  <region>MA</region>
+                  <country key="US">United States of America</country>
+                </address>
+              </affiliation>
+            </author>
+            <author>
+              <persName>
+                <forename type="first">Thatheva</forename>
+                <surname>Saphangthong</surname>
+              </persName>
+              <affiliation key="aff2">
+                <orgName type="department"                                         key="dep1">Department of Agriculture Land Management</orgName>
+                <orgName type="department" key="dep2">Ministry of Agriculture and Forestry</orgName>
+                <address>
+                  <settlement>Vientiane</settlement>
+                  <country>Laos</country>
+                </address>
+              </affiliation>
+            </author>
+            <author>
+              <persName>
+                <forename type="first">Pontus</forename>
+                <surname>Olofsson</surname>
+              </persName>
+              <affiliation key="aff1">
+                <orgName type="department">Department of Earth and Environment</orgName>
+                <orgName type="institution">Boston University</orgName>
+                <address>
+                  <settlement>Boston</settlement>
+                  <region>MA</region>
+                  <country key="US">United States of America</country>
+                </address>
+              </affiliation>
+              <affiliation key="aff3">
+                <orgName type="institution">NASA Marshall Space Flight Center</orgName>
+                <address>
+                  <settlement>Huntsville</settlement>
+                  <region>AL</region>
+                  <country key="US">United States of America</country>
+                </address>
+              </affiliation>
+            </author>
+            <title level="a" type="main">Satellite data reveals a recent increase in shifting cultivation and associated carbon emissions in Laos</title>
+          </analytic>
+          <monogr>
+            <imprint>
+              <date type="published" when="2023-10-13">13 October 2023</date>
+            </imprint>
+          </monogr>
+          <idno type="MD5">17112CCE7BFA5F63FB9BFE897A9E1A85</idno>
+          <idno type="DOI">10.1088/1748-9326/acffdd</idno>
+        </biblStruct>
+      </sourceDesc>
+    </fileDesc>
+    <encodingDesc>
+      <appInfo>
+        <application version="project.version" ident="GROBID" when="2024-04-26T10:22+0000">
+          <desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
+          <ref target="https://github.com/kermitt2/grobid"/>
+        </application>
+      </appInfo>
+    </encodingDesc>
+    <profileDesc>
+      <textClass>
+        <keywords>
+          <term>shifting cultivation</term>
+          <term>shifting agriculture</term>
+          <term>slash and burn</term>
+          <term>swidden agriculture</term>
+          <term>forest degradation</term>
+          <term>carbon emissions</term>
+          <term>deforestation</term>
+        </keywords>
+      </textClass>
+      <abstract>
+        <div xmlns="http://www.tei-c.org/ns/1.0">
+          <p>
+            <s>Although shifting cultivation is the major land use type in Laos, the spatial-temporal patterns and the associated carbon emissions of shifting cultivation in Laos are largely unknown.</s>
+            <s>This study provides a nationwide analysis of the spatial-temporal patterns of shifting cultivation and estimations of the associated carbon emissions in Laos over the last three decades.</s>
+            <s>This study found that shifting cultivation has been expanding and intensifying in Laos, especially in the last 5 years.</s>
+            <s>The newly cultivated land from 2016 to 2020 accounted for 4.5% (±1.2%) of the total land area of Laos.</s>
+            <s>Furthermore, the length of fallow periods has been continuously declining, indicating that shifting cultivation is becoming increasingly intensive.</s>
+            <s>Combining biomass derived from Global Ecosystem Dynamics Investigation and shifting cultivation maps and area estimates, we found that the net carbon emissions from shifting cultivation declined in 2001-2015 but increased in 2016-2020.</s>
+            <s>The largest carbon source is conversion from intact forests to shifting cultivation, which contributed to 89% of the total emissions from 2001 to 2020.</s>
+            <s>In addition, there were increased emissions from intensified use of fallow lands.</s>
+            <s>This research provides useful information for policymakers in Laos to understand the changes in shifting cultivation and improve land use management.</s>
+            <s>This study not only supports Reducing Emissions from Deforestation and Forest Degradation reporting for Laos but also provides a methodology for tracking carbon emissions and removals of shifting cultivation.</s>
+          </p>
+        </div>
+      </abstract>
+    </profileDesc>
+  </teiHeader>
+  <text xml:lang="en">
+    <body>
+      <div xmlns="http://www.tei-c.org/ns/1.0">
+        <head n="1.">Introduction</head>
+        <p>
+          <s>Shifting cultivation is an agricultural practice where farmers routinely move from one plot to another for cultivation.</s>
+          <s>It begins with the practice of 'slash-andburn' , where trees and woody plants are cut down and burnt to prepare an ash-fertilized plot for temporary cultivation.</s>
+          <s>After short-term cultivation, the plot is abandoned, which allows the vegetation to recover.</s>
+          <s>Shifting cultivation is the predominant land use and a major cause of forest degradation and deforestation in some tropical countries <ref                type="bibr">(Heinimann et al 2017</ref>
+          <ref type="bibr" target="#b7">, Curtis et al 2018</ref>
+          <ref type="bibr">, Jiang et al 2022)</ref>, such as Laos <ref type="bibr"                                                                          target="#b5">(Chen et al 2023)</ref>, and the            Democratic Republic of Congo <ref type="bibr">(Molinario et al 2015)</ref>.</s>
+          <s>Monitoring shifting cultivation is complicated, because it is highly dynamic, and the area affected by each slash-and-burn event is small.</s>
+          <s>Due to the difficulty of monitoring shifting cultivation, spatially and temporally explicit information on shifting cultivation is scarce.</s>
+        </p>
+        <p>
+          <s>Shifting cultivation has both short-term and long-term effects on carbon emissions <ref type="bibr">(Ziegler et al 2012)</ref>.</s>
+          <s>In the short term, the slash-and-burn activities cause immediate release of carbon.</s>
+          <s>In the long term, encroachment of shifting cultivation into primary forest and intensified use of secondary forest both lead to long-term increases in net carbon emissions and degradation of ecosystems.</s>
+          <s>Carbon emissions from shifting cultivation have not been well quantified, because of the lack of methodology for monitoring shifting cultivation and tracking the associated carbon dynamics.</s>
+          <s>In contrast to deforestation (such as urbanization), which does not involve carbon sequestration, shifting cultivation involves both carbon emissions associated with slash-and-burn activities and carbon sequestration during the fallow period.</s>
+          <s>Due to the complexity of monitoring shifting cultivation and tracking the associated carbon dynamics, estimates of carbon emissions or sequestration from shifting cultivation are usually unavailable in REDD+ (Reducing Emissions from Deforestation and Forest Degradation) reporting.</s>
+        </p>
+        <p>
+          <s>In Laos, officially the Lao People's Democratic Republic (Lao PDR), shifting cultivation is an important agricultural system <ref            type="bibr">(Roder 2001</ref>
+          <ref type="bibr" target="#b10">, Douangsavanh et al 2006</ref>
+          <ref type="bibr"                                                                                                            target="#b12">, Epprecht et al 2018</ref>
+          <ref            type="bibr">, Manivong and Cramb 2020)</ref> and the major driver of forest dynamics        <ref type="bibr" target="#b7">(Curtis et al 2018</ref>
+          <ref type="bibr" target="#b5">, Chen et al 2023)</ref>.</s>
+          <s>It is estimated that shifting cultivation affected 32.9 ± 1.9% of Laos from 1991 to 2020, and the shifting cultivation activities increased in the most recent 5 years <ref                type="bibr" target="#b5">(Chen et al 2023)</ref>.</s>
+          <s>Laos' population has been increasing steadily from 4.314 million in 1990 to 7.319 million in 2020 (World Bank 2023), whereas upland rice yields did not distinctly improve between 1990 and 2020.</s>
+          <s>Shifting cultivation activities are expected to increase due to the increasing demand for rice.</s>
+          <s>Monitoring shifting cultivation and analyzing its patterns are important to understand the forest cover change in Laos and relevant to achieving Laos' goal of increasing forest cover to 70% (The current forest cover is 62%) (The Government of Lao PDR 2005).</s>
+          <s>Since there were few spatially and temporally explicit maps and estimates of shifting cultivation before <ref                type="bibr" target="#b5">Chen et al (2023)</ref>, carbon emissions from shifting cultivation have not            been accurately estimated in the REDD+ reporting of Laos (Department of Forestry, Ministry of Agriculture            and Forestry, Lao PDR 2018).</s>
+        </p>
+        <p>
+          <s>Spatially and temporally explicit information about shifting cultivation in Laos was unavailable until recently <ref            type="bibr" target="#b5">(Chen et al 2023)</ref>, and a comprehensive national-scale analysis of the spatial        and temporal patterns of shifting cultivation has not been conducted to date.</s>
+          <s>A traditional approach for mapping shifting cultivation is to create landscape mosaics based on a land cover map of a single year <ref                type="bibr">(Messerli et al 2009</ref>
+          <ref type="bibr">, Silva et al 2011</ref>
+          <ref type="bibr">, Hett et al 2012</ref>
+          <ref type="bibr">, Hurni et al 2013a)</ref>.</s>
+          <s>It is impossible to analyze the temporal patterns of shifting cultivation using this traditional approach.</s>
+          <s>Another approach is to use multi-temporal land cover data to map shifting cultivation <ref type="bibr">(Leisz and Rasmussen 2012</ref>
+          <ref type="bibr">, Molinario et al 2015</ref>
+          <ref type="bibr">, Department of Forestry, Ministry of Agriculture and Forestry, Lao PDR 2018</ref>
+          <ref type="bibr" target="#b0">, Adhikary et al 2019</ref>
+          <ref type="bibr">, Kurien et al 2019)</ref>.</s>
+          <s>In previous studies, the temporal resolution of the land cover maps was not high enough to support the analysis of temporal patterns <ref                type="bibr">(Heinimann et al 2013)</ref>.</s>
+          <s>Recently, <ref type="bibr" target="#b5">Chen et al (2023)</ref> used satellite data to create shifting            cultivation products for Laos with sufficient temporal frequency (annual) and spatial resolution (30 m) to            support a nationalscale spatial-temporal analysis.</s>
+          <s>The recently launched GEDI (Global Ecosystem Dynamics Investigation) mission provides new opportunities for estimating biomass at a large scale <ref                type="bibr">(Tang et al 2020)</ref>.</s>
+        </p>
+        <p>
+          <s>This study used the map products and reference data in <ref type="bibr" target="#b5">Chen et al (2023)</ref>,        combined with GEDI, to conduct a national-scale analysis of the spatial and temporal patterns and carbon        dynamics of shifting cultivation in Laos.</s>
+          <s>The goal is to understand the spatial and temporal patterns of shifting cultivation and the associated carbon emissions, in support of decision-making to reduce carbon emissions and promote sustainable livelihoods depending on shifting cultivation.</s>
+        </p>
+      </div>
+      <div xmlns="http://www.tei-c.org/ns/1.0">
+        <head n="2.">Method</head>
+      </div>
+      <div xmlns="http://www.tei-c.org/ns/1.0">
+        <head n="2.1.">Mapping shifting cultivation</head>
+        <p>
+          <s>Shifting cultivation was mapped using Landsat data from 1987 to 2020 on Google Earth Engine <ref                        type="bibr" target="#b5">(Chen et al 2023)</ref>.</s>
+          <s>CCDC-SMA (continuous change detection and classification-spectral mixture analysis) (Zhu and Woodcock 2014, Bullock et al 2020, Chen et al 2021) was used to detect forest disturbances in Laos.</s>
+          <s>CCDC-SMA fits harmonic models to fractions of endmembers and NDFI (Normalized Difference Fraction Index) <ref                            type="bibr">(Souza et al 2005)</ref> to monitor forest disturbances (figure <ref                            type="figure" target="#fig_0">1</ref>).</s>
+          <s>Annual maps of Shifting Cultivation from 1991 to 2020 were created by combining time series analysis, object-based image analysis, and post-disturbed land-cover classification.</s>
+          <s>A total of 1000 sample units under simple random sampling were used as reference data for accuracy assessment and area estimation.</s>
+          <s>For each sample unit, at least two interpreters interpreted the land change class and the year of each slash-and-burn event by examining high-resolution satellite imagery and Landsat time series (figures 1(a), (c) and 2).</s>
+          <s>During 1991-2020, shifting cultivation was the main type of forest disturbance in Laos, affecting 32.9 ± 1.9% of Laos <ref                            type="bibr" target="#b5">(Chen et al 2023)</ref>.</s>
+          <s>Shifting cultivation was mapped with a producer's accuracy of 88% and a user's accuracy of 80% <ref                            type="bibr" target="#b5">(Chen et al 2023)</ref>.</s>
+          <s>
+            <ref type="bibr" target="#b5">Chen et al (2023)</ref>                        describes more details of the monitoring method.</s>
+            <s>Both the map products and the reference data from <ref type="bibr"                                                                              target="#b5">Chen et al (2023)</ref> were                        used in this study.</s>
+          </p>
+        </div>
+        <div xmlns="http://www.tei-c.org/ns/1.0">
+          <head n="2.2.">Spatial-temporal patterns of shifting cultivation</head>
+          <p>
+            <s>The annual maps of shifting cultivation  and the reference sample units interpreted as Shifting Cultivation were used to investigate the patterns of shifting cultivation.</s>
+            <s>We estimated the area of shifting cultivation at different fallow and disturbance   55 ′ 27 ′′ E. In the time series plot, the blue points are the Landsat observations.</s>
+            <s>In the Landsat images (Red-green-blue), the yellow squares show the pixel location.</s>
+            <s>In the high-resolution image, the white point shows the pixel location.).</s>
+          </p>
+          <p>
+            <s>to explore whether the extent of shifting cultivation expanded, the newly and previously cultivated areas of shifting cultivation were estimated using reference sample units and maps in Chen et al (2023) for every 5 year period from 2000 to 2020.</s>
+            <s>In the reference sample points, whether a pixel is newly or previously cultivated is determined by the year of slash-andburn recorded by the interpreters (e.g.</s>
+            <s>figures 1(a), (c) and 2).</s>
+          </p>
+          <p>
+            <s>Furthermore, to investigate the change patterns in fallow length and cultivation length (length of cropping period), we visually interpreted Landsat time series, Landsat imagery, and high-resolution images for 196 sample points (figure <ref                        type="figure">3</ref> as an example).</s>
+            <s>These sample points are the points with at least two cultivation events in the aforementioned reference data with 1000 simple random sample points.</s>
+            <s>For each point, the year of slash and burn (land clearing), cultivation length, and fallow length are recorded for every event.</s>
+          </p>
+        </div>
+        <div xmlns="http://www.tei-c.org/ns/1.0">
+          <head n="2.3.">Carbon emission/removal</head>
+          <p>
+            <s>The GEDI mission provides space-borne LiDAR data to estimate aboveground biomass (Healey et al 2020).</s>
+            <s>GEDI's L4A Footprint Level Aboveground Biomass Density (AGBD) (version 2.1) 25 m data <ref                            type="bibr" target="#b1">(Beck et al 2020</ref>
+            <ref type="bibr" target="#b11">, Dubayah et al 2022)</ref>                        were used to explore the effect of shifting cultivation on biomass.</s>
+            <s>GEDI data collected in 2020 was used because it was the only year of data with good spatial coverage when the study was conducted.</s>
+            <s>To overlay the GEDI footprint and Landsat, for each GEDI footprint, we extracted the value of the 30 m pixel in the Landsat-based map that has the largest overlap with the 25 m footprint.</s>
+            <s>Only lidar observations with good quality (using the 'quality_flag' band and the 'degrade_flag' band) and collected at places with a slope less than 20 • and in the interior of shifting cultivation sites (excluding a two-pixel edge) were used, to eliminate the effect of terrain and possible misregistration at the edges of slash-and-burn events.</s>
+            <s>The reason why we excluded lidar points with slopes larger than 20 • is that GEDIbased biomass estimates tend to be overestimated at steep terrain.</s>
+            <s>AGBD was calculated for Active Shifting Cultivation, Inactive Shifting Cultivation, Intact Forest, and Others.</s>
+            <s>Intact Forest here is defined as forests without significant anthropogenic disturbances.</s>
+            <s>The relationship between AGBD and years of regrowth since the latest slash-and-burn events was analyzed.</s>
+            <s>The hypothesis was that AGBD has a positive relationship with years of regrowth since the latest slash-and-burn activity.</s>
+            <s>From this relationship, a country-level growth curve of AGBD can be developed and used to estimate the biomass of fallow lands.</s>
+          </p>
+          <p>
+            <s>Carbon emissions from shifting cultivation were estimated for every 5 year period from 2001 to 2020.</s>
+            <s>Table <ref type="table" target="#tab_1">1</ref> shows the activity classes, definitions, and                        emission factors.</s>
+            <s>New Shifting Cultivation area was estimated from a sampling-based method. The emission factors for activities other than New Shifting Cultivation are spatially explicit and were determined by the map of the latest year of slash and burn and the growth curve.</s>
+            <s>Figure <ref type="figure" target="#fig_7">8</ref> shows an example of the spatially explicit                        emission factors for different activities.</s>
+            <s>Specifically, this was how the carbon emissions and removals of Fallow land -&gt; Fallow land, Fallow land -&gt; Cleared land, and Cleared land -&gt; Fallow land were calculated: The latest year of disturbance of Fallow land was determined using the annual shifting cultivation maps.</s>
+            <s>Then, the AGBD of fallow lands was calculated using equation (1).</s>
+            <s>Using AGBD of fallow land in the end year minus AGBD in the start year of each period, the differences in AGBD were obtained.</s>
+            <s>Multiply the differences in AGBD by the area of different activities and then multiply it by the conversion factor (0.5), and the carbon emissions and removals of each activity were calculated.</s>
+            <s>The average emission/removal factors were calculated using the emissions and removals divided by the total area of activities in different categories.</s>
+          </p>
+        </div>
+        <div xmlns="http://www.tei-c.org/ns/1.0">
+          <head n="3.">Results</head>
+        </div>
+        <div xmlns="http://www.tei-c.org/ns/1.0">
+          <head n="3.1.">Spatial-temporal patterns of shifting cultivation</head>
+          <p>
+            <s>A large proportion of the land used for shifting cultivation in Laos remains in use.</s>
+            <s>During our study period, the estimated area of Active Shifting Cultivation (19.1 ± 1.6%) exceeded the area of Inactive Shifting Cultivation (13.7 ± 1.8%).</s>
+            <s>In the future, there is a possibility of reusing Inactive Shifting Cultivation and further increasing the area of Active Shifting Cultivation, given the increasing demand for crops.</s>
+            <s>New Shifting Cultivation, defined as shifting cultivation that first occurred in each period, was estimated from 2001 to 2020 by period (figure <ref                            type="figure" target="#fig_2">4</ref>).</s>
+            <s>The area estimates were aggregated into 5 year periods instead of calculating annual to reduce uncertainties of the area estimates.</s>
+            <s>From 1991 to 2000, it is difficult to tell whether the shifting cultivation areas were new or old, and thus this analysis started in 2001.</s>
+            <s>In all 5 year periods, the area of New Shifting Cultivation is higher than 3% of Laos, implying that on average, over 0.6% of Laos' land area is converted from intact forest to shifting cultivation each year.</s>
+            <s>Our results indicate that the extent of shifting cultivation has been expanding.</s>
+          </p>
+          <p>
+            <s>During 2001-2015, there was a decrease in the area of New Shifting Cultivation.</s>
+            <s>However, both the area of New Shifting Cultivation and the total area of Shifting Cultivation have increased significantly in 2016-2020.</s>
+            <s>The proportion of previously and newly cultivated to the total area of shifting cultivation was calculated for every year using the annual maps (figure <ref                            type="figure" target="#fig_3">5</ref>).</s>
+            <s>Before 2007, the newly cultivated  areas were larger than the previously cultivated, and the trend reversed after 2007.</s>
+            <s>There was a general decreasing trend in the proportion of New Shifting Cultivation, but increases were observed in 2019 and 2020.</s>
+            <s>We suppose that the general decreasing trend is because intact forests available for cultivation decreased over time and previously cultivated land is easier to clear for future cultivation.</s>
+          </p>
+          <p>
+            <s>Based on the sample interpretation results, most cultivation lengths are either one year or two years.</s>
+            <s>Although there are variations across the years, we have not seen major changes in average cultivation length (figures S1 and S2).</s>
+            <s>The mean length of the fallow periods of shifting cultivation in Laos is 6.5 years, which is close to the length of fallow periods reported in the literature (7 years) (Department of Forestry, Ministry of Agriculture and Forestry, Lao PDR 2018).</s>
+            <s>The fallow length has been continuously declining (figure <ref type="figure"                                                                                      target="#fig_4">6</ref>).</s>
+            <s>The reduction in the length of fallow periods indicates that shifting cultivation has intensified.</s>
+          </p>
+        </div>
+        <div xmlns="http://www.tei-c.org/ns/1.0">
+          <head n="3.2.">Growth curve of fallow lands</head>
+          <p>
+            <s>The AGBD was lower in shifting cultivation regions than in the intact forests.</s>
+            <s>The median AGBD of Intact Forest, Inactive Shifting Cultivation, Active Shifting Cultivation, and Others are 151.9 Mg ha⁻¹, 87.9 Mg ha⁻¹, 39.5 Mg ha⁻¹, and 22.8 Mg ha⁻¹, respectively.</s>
+            <s>The biomass of Inactive Shifting Cultivation only reached about 60% of that of the intact forest.</s>
+            <s>In the literature (Department of Forestry, Ministry of Agriculture and Forestry, Lao PDR 2018), the regions of Inactive Shifting Cultivation were considered to be 'recovered', whereas our results show that the AGBD is not recovered even if these regions have been left for fallow for at least seven years.</s>
+            <s>To investigate the relationship between median AGBD and disturbance history, a logarithmic regression was conducted on years of regrowth since the latest slash-and-burn events and median AGBD of GEDI footprints (figure <ref                            type="figure" target="#fig_5">7</ref>).</s>
+            <s>The logarithmic model of years of regrowth (x) and AGBD (y) is (R square is 0.93): $y = 29.129 \ln(x) + 9.907 \qquad (1)$</s>
+          </p>
+          <p>
+            <s>AGBD was strongly correlated with years of regrowth.</s>
+            <s>Equation ( <ref type="formula">1</ref>) and the maps of years of regrowth were used to calculate                        the biomass of fallow lands and spatially explicit emission/removal factors (figure <ref                                type="figure" target="#fig_7">8</ref>). in this period (figure <ref type="figure"                                                                                                    target="#fig_3">5</ref>)                        and the decrease in carbon sink of fallow lands in this period.</s>
+            <s>For every period, New Shifting Cultivation is the largest carbon source, contributing to more than 80% of the total emissions.</s>
+            <s>From 2001 to 2020, New Shifting Cultivation contributed to 89% of the total emissions.</s>
+            <s>Fallow lands are important carbon sinks and sequestered about 70% of the total emissions during 2006-2015.</s>
+            <s>However, carbon sequestration of fallow lands also decreased in recent years because of the intensified use of fallow land.</s>
+            <s>To summarize, the increase in emissions from shifting cultivation encroachment to intact forests (New Shifting Cultivation) and intensified use of secondary forests both led to the recent increase in net emissions from shifting cultivation.</s>
+          </p>
+        </div>
+        <div xmlns="http://www.tei-c.org/ns/1.0">
+          <head n="3.3.">Carbon emissions from shifting cultivation</head>
+        </div>
+        <div xmlns="http://www.tei-c.org/ns/1.0">
+          <head n="4.">Discussion</head>
+          <p>
+            <s>In this study, the spatial-temporal patterns and the carbon dynamics of shifting cultivation in Laos were analyzed.</s>
+            <s>The results showed that shifting cultivation has been expanding and intensifying.</s>
+            <s>The area of shifting cultivation has increased significantly over the last 5 years.</s>
+            <s>The fallow length has been declining continuously, which indicates the intensification of shifting cultivation.</s>
+            <s>Our finding of a reduction of fallow length is consistent with previous local studies <ref                            type="bibr">(Rasul and Thapa 2003</ref>
+            <ref type="bibr">, Saphangthong and Kono 2009</ref>
+            <ref type="bibr">, van Vliet et al 2012)</ref>.</s>
+            <s>We found that AGBD was strongly correlated with years of regrowth since the latest year of slash-and-burn activities, which can be   <ref                            type="formula">2019</ref>) is understandable since their major focus is forest loss instead                        of shifting cultivation.</s>
+            <s>This comparison is not a criticism of the aforementioned studies.</s>
+            <s>Instead, it highlights the benefits of using shifting cultivation maps and reference samples with better spatial resolution and high temporal frequency for the analysis of spatial-temporal patterns.</s>
+          </p>
+          <p>
+            <s>We compared our area estimates of New Shifting Cultivation with the official forest change statistics from Laos (table <ref                        type="table" target="#tab_1">S1</ref>).</s>
+            <s>The Laos official forest change maps (<ref type="url" target="https://nfms.maf.gov.la/">https://nfms.maf.gov.la/</ref>)                        are created from the land cover classification maps from the start year and end year for each                        period (see the periods in table <ref type="table" target="#tab_1">S1</ref>).</s>
+            <s>Since shifting cultivation is the major driver of forest degradation and deforestation in Laos, we expect that there are some consistencies between the areas of New Shifting cultivation and the areas of forest degradation and deforestation.</s>
+            <s>There are consistencies in the periods 2006-2010 and 2011-2015, with the differences between our estimates and the official statistics both less than 1% of Laos.</s>
+            <s>Our estimates of New Shifting Cultivation are generally higher than the Laos official estimates of deforestation and forest degradation, except for 2006-2010.</s>
+            <s>This was partly due to the different monitoring approaches.</s>
+            <s>Without using dense time series, the shifting cultivation events that occurred over five years may be difficult to detect using two classification maps from the start and the end.</s>
+            <s>In the periods 2001-2005 and 2016-2020, our estimates are about 2%-3% higher than the official estimates.</s>
+            <s>For 2016-2020, the discrepancy is partly because the 2019 and 2020 changes are included in our estimates but not in the official statistics.</s>
+            <s>Overall, our results and area estimates provide valuable information regarding the forest dynamics of Laos.</s>
+          </p>
+          <p>
+            <s>Furthermore, we compared the shifting cultivation map with the field survey data in the Laos National Forest Monitoring System (<ref type="url" target="https://nfms.maf.gov.la/">https://nfms.maf.gov.la/</ref>).</s>
+            <s>The shifting cultivation map was compared with 39 field points identified as 'Regenerating Vegetation' or 'Upland crop' in 2010, 2011, 2012, or 2019, since these two land cover classes are generally considered to have an association with shifting cultivation practices (Department of Forestry 2020).</s>
+            <s>The 31 out of 39 (80%) points are correctly mapped as shifting cultivation.</s>
+          </p>
+          <p>
+            <s>As a national-level analysis of spatial-temporal patterns and estimation of carbon dynamics of shifting cultivation in Laos, our research is valuable to sustainable land resource management.</s>
+            <s>The sustainability of the land is negatively impacted by the recent expansion and intensification of shifting cultivation, indicated by an increase in newly cultivated areas in 2016-2020 and a reduction of fallow length in 1991-2020.</s>
+            <s>Moreover, our research provides a quantitative analysis of carbon emissions of shifting cultivation, which is crucial for REDD+ reporting in Laos.</s>
+            <s>Our research indicates that carbon emissions from shifting cultivation can be quantified by combining GEDI data with shifting cultivation maps and area estimates.</s>
+            <s>The fallow land sequestrated a significant amount of carbon in the past, but this carbon sink declined in recent years.</s>
+            <s>The recent increase in new shifting cultivation events also led to an increase in net carbon emissions.</s>
+            <s>This highlights the importance of protecting the primary forest from the encroachment of new shifting cultivation and the restoration of old fallow lands.</s>
+          </p>
+          <p>
+            <s>Our study has several limitations and future research can make improvements by using more sophisticated models and integration with other data.</s>
+            <s>The first limitation is the usage of GEDI data.</s>
+            <s>Our research only used GEDI in one year (2020), because GEDI is a new mission and 2020 was the only year with good coverage data when the study was conducted.</s>
+            <s>Future studies can use GEDI for multiple years as more data will be collected.</s>
+            <s>In addition, we excluded GEDI points where the slope is larger than 20° to avoid overestimation of biomass in steep terrain.</s>
+            <s>This would introduce regional bias on the growth curve and emission factors.</s>
+            <s>Based on our map, 69% of the shifting cultivation area is in places with slopes less than 20° <ref type="bibr" target="#b4">(Chen 2022)</ref>.</s>
+            <s>Future research should improve GEDI biomass estimates in steep terrain.</s>
+            <s>Second, although we compared our map with some field survey data in Laos, the field data information for each location is limited.</s>
+            <s>Future studies should collect more detailed information on shifting cultivation in field surveys, especially biomass in shifting cultivation landscapes (e.g. Salinas-Melgoza et al 2017, <ref type="bibr" target="#b2">Borah et al 2018</ref>
+            <ref type="bibr" target="#b13">, Gogoi et al 2020)</ref>.</s>
+            <s>Third, the carbon estimation only considered aboveground biomass change and no other carbon pools due to a lack of field survey data on those carbon pools.</s>
+            <s>Future research can conduct field surveys on belowground biomass and include the belowground carbon pools in carbon emission estimation.</s>
+            <s>Fourth, future research should investigate the causes of the recent increase in shifting cultivation, which requires field surveys.</s>
+          </p>
+        </div>
+        <div xmlns="http://www.tei-c.org/ns/1.0">
+          <head n="5.">Conclusion</head>
+          <p>
+            <s>Our research provides a national-level analysis of spatial-temporal patterns and estimation of carbon dynamics of shifting cultivation in Laos.</s>
+            <s>Our analysis shows that shifting cultivation in Laos has been expanding and intensifying, particularly in the recent 5 years.</s>
+            <s>The practice of shifting cultivation has become increasingly intensive as the length of the fallow periods has been continuously shortening.</s>
+            <s>Combining GEDI data with shifting cultivation maps and area estimates, carbon emissions from shifting cultivation can be quantified.</s>
+            <s>The net carbon emissions from shifting cultivation declined in the past but increased recently.</s>
+            <s>This study not only supports REDD+ reporting for Laos but also demonstrates a method of tracking carbon dynamics in shifting cultivation landscapes.</s>
+          </p>
+          <p>
+            <s>USGS Landsat Science Team Program for Better Use of the Landsat Temporal Domain: Monitoring Land Cover Type, Condition and Change (Grant Number: G12PC00070).</s>
+            <s>The authors are grateful to the editors and two anonymous reviewers for their insightful and constructive comments, which greatly helped to improve this paper.</s>
+          </p>
+        </div>
+        <figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_0">
+          <head>Figure 1 .</head>
+          <label>1</label>
+          <figDesc>
+            <div>
+              <p>
+                <s>Figure 1.</s>
+                <s>An example of active shifting cultivation in previously cultivated land (location: 20°7′13″ N, 101°6′59″ E).</s>
+                <s>The shifting cultivation events in 2005 and 2018 were categorized as Previous Shifting Cultivation because shifting cultivation first occurred in 1991.</s>
+                <s>This place is also Active Shifting Cultivation because the latest shifting cultivation event occurred in 2018.</s>
+                <s>(a) Landsat time series.</s>
+                <s>(b) CCDC-SMA model fits.</s>
+                <s>Different colors show different segments and the model breaks in 1991, 2005, and 2018 show slash and burn events.</s>
+                <s>The colored lines show the seasonality of the forest and the drops between lines show slash-and-burn events.</s>
+                <s>(c) Landsat images and high-resolution images on Google Earth.</s>
+                <s>In the Landsat images (red-green-blue), the yellow squares show the pixel location.</s>
+                <s>In the high-resolution image, the white point shows the pixel location.</s>
+              </p>
+            </div>
+          </figDesc>
+          <graphic coords="3,129.32,72.89,414.04,288.65" type="bitmap"/>
+        </figure>
+        <figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_1">
+          <head>Figure 2 .Figure 3 .</head>
+          <label>23</label>
+          <figDesc>
+            <div>
+              <p>
+                <s>Figure 2.</s>
+                <s>An example of reference data (location: 20°15′8″ N, 100°39′51″ E).</s>
+                <s>This shifting cultivation is New Shifting Cultivation.</s>
+                <s>The time series shows that no shifting cultivation occurred before 2019.</s>
+                <s>The new shifting cultivation event occurred in 2019 and it can be verified by examining high-resolution images and Landsat images.</s>
+                <s>(In the time series figure, the blue points are Landsat observations.</s>
+                <s>In the Landsat images (red-green-blue), the yellow squares show the pixel location.</s>
+                <s>In the high-resolution image, the white point shows the pixel location).</s>
+              </p>
+            </div>
+          </figDesc>
+          <graphic coords="3,129.32,467.97,414.04,194.32" type="bitmap"/>
+        </figure>
+        <figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_2">
+          <head>Figure 4 .</head>
+          <label>4</label>
+          <figDesc>
+            <div>
+              <p>
+                <s>Figure 4. Area estimates and uncertainties of New Shifting Cultivation and total (new and previous) shifting cultivation by 5 year intervals. The y-axis is the area proportions of the total area of Laos (230 405 km²).</s>
+                <s>Any pixel that was newly cultivated at any time within a specified 5 year period would be counted and added to the total height of the corresponding pink bars.</s>
+              </p>
+            </div>
+          </figDesc>
+          <graphic coords="6,200.85,72.89,270.04,212.12" type="bitmap"/>
+        </figure>
+        <figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_3">
+          <head>Figure 5 .</head>
+          <label>5</label>
+          <figDesc>
+            <div>
+              <p>
+                <s>Figure 5. Annual proportion of slash-and-burn areas in previously and newly cultivated regions.</s>
+              </p>
+            </div>
+          </figDesc>
+          <graphic coords="6,200.93,355.44,269.88,140.93" type="bitmap"/>
+        </figure>
+        <figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_4">
+          <head>Figure 6 .</head>
+          <label>6</label>
+          <figDesc>
+            <div>
+              <p>
+                <s>Figure 6.</s>
+                <s>Average fallow length by year calculated from sample interpretation.</s>
+              </p>
+            </div>
+          </figDesc>
+          <graphic coords="7,129.32,72.89,414.04,187.04" type="bitmap"/>
+        </figure>
+        <figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_5">
+          <head>Figure 7 .</head>
+          <label>7</label>
+          <figDesc>
+            <div>
+              <p>
+                <s>Figure 7. Growth curve of aboveground biomass density.</s>
+              </p>
+            </div>
+          </figDesc>
+          <graphic coords="7,200.85,315.86,270.04,261.63" type="bitmap"/>
+        </figure>
+        <figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_6">
+          <head/>
+          <label/>
+          <figDesc>
+            <div>
+              <p>
+                <s>Carbon emissions from shifting cultivation were estimated by period from 2001 to 2020 (tables 2-4, and figure 9).</s>
+                <s>The net carbon emissions of shifting cultivation declined during 2001-2015, but significantly increased during 2016-2020.</s>
+                <s>The decline in net carbon emissions during 2001-2015 is mostly because the area of new shifting cultivation decreased in this period.</s>
+                <s>The increase during 2016-2020 is mostly due to the increase in new shifting cultivation activities</s>
+              </p>
+            </div>
+          </figDesc>
+        </figure>
+        <figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_7">
+          <head>Figure 8 .</head>
+          <label>8</label>
+          <figDesc>
+            <div>
+              <p>
+                <s>Figure 8. Spatially explicit emission (+)/removal (-) factors for different activities in a region (the background image is the high-resolution image): (a) fallow land -&gt; fallow land; (b) fallow land -&gt; cleared land; (c) cleared land -&gt; fallow land; (d) total of (a)-(c).</s>
+              </p>
+            </div>
+          </figDesc>
+          <graphic coords="8,200.85,72.89,270.04,254.35" type="bitmap"/>
+        </figure>
+        <figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_8">
+          <head>Figure 9 .</head>
+          <label>9</label>
+          <figDesc>
+            <div>
+              <p>
+                <s>Figure 9. Carbon dynamics by period.</s>
+              </p>
+            </div>
+          </figDesc>
+          <graphic coords="9,200.85,247.38,270.04,154.19" type="bitmap"/>
+        </figure>
+        <figure xmlns="http://www.tei-c.org/ns/1.0">
+          <head/>
+          <label/>
+          <figDesc>
+            <div>
+              <p/>
+            </div>
+          </figDesc>
+          <graphic coords="4,129.32,72.89,414.04,284.93" type="bitmap"/>
+        </figure>
+        <figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_1">
+          <head>Table 1 .</head>
+          <label>1</label>
+          <figDesc>
+            <div>
+              <p>
+                <s>Activity classes, definitions, and carbon emission/removal factors for each 5 year period (CF: conversion factor to convert biomass to carbon equivalents, CF = 0.5).</s>
+              </p>
+            </div>
+          </figDesc>
+          <table>
+            <row>
+              <cell>Activity class</cell>
+              <cell>Definition</cell>
+              <cell>Emission/removal factors</cell>
+            </row>
+            <row>
+              <cell>Intact forest -&gt; shifting cultivation</cell>
+              <cell>No shifting cultivation before. Previous intact</cell>
+              <cell>Biomass of forest before</cell>
+            </row>
+            <row>
+              <cell>(New Shifting Cultivation)</cell>
+              <cell>forests began to be used for new shifting</cell>
+              <cell>new shifting</cell>
+            </row>
+            <row>
+              <cell/>
+              <cell>cultivation.</cell>
+              <cell>cultivation × CF (75.95</cell>
+            </row>
+            <row>
+              <cell/>
+              <cell/>
+              <cell>Mg C ha -1 )</cell>
+            </row>
+            <row>
+              <cell>Fallow land -&gt; fallow land</cell>
+              <cell>Shifting cultivation occurred before. The start</cell>
+              <cell>(Fallow land biomass in the</cell>
+            </row>
+            <row>
+              <cell/>
+              <cell>and end land cover were both fallow lands.</cell>
+              <cell>start -fallow land biomass</cell>
+            </row>
+            <row>
+              <cell/>
+              <cell/>
+              <cell>in the end) × CF</cell>
+            </row>
+            <row>
+              <cell>Fallow land -&gt; cleared land</cell>
+              <cell>In previously cultivated land, fallow land became</cell>
+              <cell>(Fallow land</cell>
+            </row>
+            <row>
+              <cell/>
+              <cell>cleared land.</cell>
+              <cell>biomass -cleared land</cell>
+            </row>
+            <row>
+              <cell/>
+              <cell/>
+              <cell>biomass) × CF</cell>
+            </row>
+            <row>
+              <cell>Cleared land -&gt; fallow land</cell>
+              <cell>In previously cultivated land, cleared land became</cell>
+              <cell>(Cleared land</cell>
+            </row>
+            <row>
+              <cell/>
+              <cell>fallow land.</cell>
+              <cell>biomass -fallow land</cell>
+            </row>
+            <row>
+              <cell/>
+              <cell/>
+              <cell>biomass) × CF</cell>
+            </row>
+            <row>
+              <cell>Cleared land -&gt; cleared land</cell>
+              <cell>In previously cultivated land, cleared land became</cell>
+              <cell/>
+            </row>
+            <row>
+              <cell/>
+              <cell>cleared land.</cell>
+              <cell/>
+            </row>
+          </table>
+          <note>
+            <p>
+              <s>Zero and other activity classes in table 1 were estimated from the maps. This is because the sampling-based area estimates of New Shifting Cultivation adjusted errors in mapping and are more accurate than pixel-counting from the maps (Olofsson et al 2013, 2014).</s>
+              <s>The area estimates of New Shifting Cultivation were calculated by 5-year periods with low uncertainty.</s>
+              <s>For other activity classes, it is difficult to get area estimates from the reference data while including the dynamics of biomass of fallow land, and thus we used a spatially explicit method.</s>
+              <s>In table 1, the biomass of the forest before disturbance was the biomass of Intact Forest estimated from GEDI.</s>
+              <s>The biomass of fallow land was estimated from the growth curve developed from GEDI based on years since disturbance.</s>
+              <s>Years since disturbance for each pixel was obtained from the annual maps of shifting cultivation.</s>
+              <s>The cleared land biomass was estimated as the biomass of non-forest by the Department of Forestry (2020) based on field surveys.</s>
+              <s>The emission factor of New Shifting Cultivation is 75.95 Mg C ha⁻¹.</s>
+              <s>The emission factor of Cleared land -&gt; Cleared land is zero.</s>
+            </p>
+          </note>
+        </figure>
+        <figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_2">
+          <head>Table 2 .</head>
+          <label>2</label>
+          <figDesc>
+            <div>
+              <p>
+                <s>Area of different land use activities for each period (5 years).</s>
+              </p>
+            </div>
+          </figDesc>
+          <table>
+            <row>
+              <cell>Area (ha)</cell>
+              <cell>2001-2005</cell>
+              <cell>2006-2010</cell>
+              <cell>2011-2015</cell>
+              <cell>2016-2020</cell>
+            </row>
+            <row>
+              <cell>Fallow land -&gt; fallow land</cell>
+              <cell>2379 847</cell>
+              <cell>3809 008</cell>
+              <cell>5213 561</cell>
+              <cell>6009 880</cell>
+            </row>
+            <row>
+              <cell>Fallow land -&gt; cleared land</cell>
+              <cell>226 240</cell>
+              <cell>361 992</cell>
+              <cell>397 236</cell>
+              <cell>630 467</cell>
+            </row>
+            <row>
+              <cell>Cleared land -&gt; fallow land</cell>
+              <cell>441 757</cell>
+              <cell>768 342</cell>
+              <cell>748 692</cell>
+              <cell>696 501</cell>
+            </row>
+            <row>
+              <cell>New shifting cultivation</cell>
+              <cell>1198 106</cell>
+              <cell>806 418</cell>
+              <cell>714 256</cell>
+              <cell>1036 823</cell>
+            </row>
+          </table>
+        </figure>
+        <figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_3">
+          <head>Table 3 .</head>
+          <label>3</label>
+          <figDesc>
+            <div>
+              <p>
+                <s>The country-average emissions or removal factors for each period (5 years).</s>
+                <s>The original emission or removal factors except for new shifting cultivation are spatially explicit.</s>
+                <s>This table shows the country averages of the spatially explicit emission or removal factors.</s>
+              </p>
+            </div>
+          </figDesc>
+          <table>
+            <row>
+              <cell>Average emission/removal factors (Mg C ha⁻¹)</cell>
+              <cell>2001-2005</cell>
+              <cell>2006-2010</cell>
+              <cell>2011-2015</cell>
+              <cell>2016-2020</cell>
+            </row>
+            <row>
+              <cell>Fallow land -&gt; fallow land</cell>
+              <cell>-8.06</cell>
+              <cell>-7.57</cell>
+              <cell>-5.65</cell>
+              <cell>-1.56</cell>
+            </row>
+            <row>
+              <cell>Fallow land -&gt; cleared land</cell>
+              <cell>18.70</cell>
+              <cell>19.26</cell>
+              <cell>23.58</cell>
+              <cell>26.10</cell>
+            </row>
+            <row>
+              <cell>Cleared land -&gt; fallow land</cell>
+              <cell>-23.14</cell>
+              <cell>-24.28</cell>
+              <cell>-23.67</cell>
+              <cell>-21.36</cell>
+            </row>
+            <row>
+              <cell>New shifting cultivation</cell>
+              <cell>75.95</cell>
+              <cell>75.95</cell>
+              <cell>75.95</cell>
+              <cell>75.95</cell>
+            </row>
+          </table>
+        </figure>
+        <figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_4">
+          <head>Table 4 .</head>
+          <label>4</label>
+          <figDesc>
+            <div>
+              <p>
+                <s>Carbon emissions (+) and removals (-) of different activities for each period (5 years).</s>
+              </p>
+            </div>
+          </figDesc>
+          <table>
+            <row>
+              <cell>Carbon emission/removal</cell>
+              <cell/>
+              <cell/>
+              <cell/>
+              <cell/>
+            </row>
+            <row>
+              <cell>(Mg C)</cell>
+              <cell>2001-2005</cell>
+              <cell>2006-2010</cell>
+              <cell>2011-2015</cell>
+              <cell>2016-2020</cell>
+            </row>
+            <row>
+              <cell>Fallow land -&gt; fallow</cell>
+              <cell>-19 175 009</cell>
+              <cell>-28 833 216</cell>
+              <cell>-29 440 602</cell>
+              <cell>-9348 118</cell>
+            </row>
+            <row>
+              <cell>land</cell>
+              <cell/>
+              <cell/>
+              <cell/>
+              <cell/>
+            </row>
+            <row>
+              <cell>Fallow land -&gt; cleared</cell>
+              <cell>4230 290</cell>
+              <cell>6970 956</cell>
+              <cell>9366 236</cell>
+              <cell>16 452 893</cell>
+            </row>
+            <row>
+              <cell>land</cell>
+              <cell/>
+              <cell/>
+              <cell/>
+              <cell/>
+            </row>
+            <row>
+              <cell>Cleared land -&gt; fallow</cell>
+              <cell>-10 222 046</cell>
+              <cell>-18 657 539</cell>
+              <cell>-17 717 827</cell>
+              <cell>-14 879 752</cell>
+            </row>
+            <row>
+              <cell>land</cell>
+              <cell/>
+              <cell/>
+              <cell/>
+              <cell/>
+            </row>
+            <row>
+              <cell>New shifting cultivation</cell>
+              <cell>90 996 151</cell>
+              <cell>61 247 409</cell>
+              <cell>54 247 705</cell>
+              <cell>78 746 669</cell>
+            </row>
+            <row>
+              <cell>Period total (net</cell>
+              <cell>65 829 387</cell>
+              <cell>20 727 610</cell>
+              <cell>16 455 512</cell>
+              <cell>70 971 692</cell>
+            </row>
+            <row>
+              <cell>emission/removal)</cell>
+              <cell/>
+              <cell/>
+              <cell/>
+              <cell/>
+            </row>
+            <row>
+              <cell>Annual average</cell>
+              <cell>13 165 877</cell>
+              <cell>4145 522</cell>
+              <cell>3291 103</cell>
+              <cell>14 194 339</cell>
+            </row>
+          </table>
+        </figure>
+        <figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_5">
+          <head/>
+          <label/>
+          <figDesc>
+            <div>
+              <p>
+                <s>Saphangthong T and Kono Y 2009 Continuity and discontinuity in land use changes: a case study in Northern Lao villages J. Southeast Asian Stud.</s>
+                <s>47 263-86 Silva J M N, Carreiras J M B, Rosa I and Pereira J M C 2011 Greenhouse gas emissions from shifting cultivation in the tropics, including uncertainty and sensitivity analysis J. Geophys.</s>
+                <s>Res.</s>
+                <s>Atmos.</s>
+                <s>116 1-21 Souza C M, Roberts D A and Cochrane M A 2005 Combining spectral and spatial information to map canopy damage from selective logging and forest fires Remote Sens. Environ.</s>
+                <s>98 329-43 Tang X, Hutyra L R, Arévalo P, Baccini A, Woodcock C E and Olofsson P 2020 Spatiotemporal tracking of carbon emissions and uptake using time series analysis of Landsat data: a spatially explicit carbon bookkeeping model Sci.</s>
+                <s>Total Environ.</s>
+                <s>720 137409 The Government of Lao PDR 2005 Forestry Strategy to the year 2020 of the Lao PDR (available at: https://faolex.fao.org/ docs/pdf/lao144178.pdf)</s>
+                <s>van Vliet N et al 2012 Trends, drivers and impacts of changes in swidden cultivation in tropical forest-agriculture frontiers: a global assessment Glob.</s>
+                <s>Environ.</s>
+                <s>Change 22 418-29 World Bank 2023 World Bank open data (available at: https:// data.worldbank.org/) Zhu Z and Woodcock C E 2014 Continuous change detection and classification of land cover using all available Landsat data Remote Sens. Environ.</s>
+                <s>144 152-71 Ziegler A D et al 2012 Carbon outcomes of major land-cover transitions in SE Asia: great uncertainties and REDD+ policy implications Glob.</s>
+                <s>Change Biol.</s>
+                <s>18 3087-99</s>
+              </p>
+            </div>
+          </figDesc>
+          <table/>
+        </figure>
+      </body>
+      <back>
+        <div type="acknowledgement">
+          <div>
+            <head>Acknowledgments</head>
+            <p>This research was funded by the <rs type="funder">NASA</rs>
+            <rs type="programName">Land-Cover and Land-Use Change Program</rs> (Grant Number: <rs type="grantNumber">80NSSC18K0315</rs>),        the <rs type="funder">NASA Carbon Monitoring System</rs> (Grant Number: <rs                type="grantNumber">80NSSC20K0022</rs>), and Turubanova S A A, Tyukavina A and Kommareddy A 2013        High-resolution global maps of 21st-century forest cover change Science 342 850-3 Healey S P, Yang Z, <rs                type="projectName">Gorelick N and Ilyushchenko S 2020 Highly local model calibration with a new GEDI LiDAR</rs> asset        on Google Earth Engine reduces Landsat forest height signal saturation Remote Sens. 12 2840 <rs                type="affiliation">Heinimann A, Hett C, Hurni K, Messerli P</rs>, Epprecht M, Jørgensen L and Breu T        2013 Socio-economic perspectives on shifting cultivation landscapes in <rs type="affiliation">Northern Laos Hum.Ecol.41 51-62 Heinimann A, Mertz O, Frolking S, Christensen A E, Hurni K, Sedano F, Chini L P, Sahajpal R, Hansen</rs> M        and Hurtt G 2017 A global view of shifting cultivation: recent, current, and future extent PLoS One 12 1-22 Hett        C, Castella J C, Heinimann A, Messerli P and Pfund J L 2012 A landscape mosaics approach for characterizing        swidden systems from a REDD+ perspective Appl.Geogr.32 608-18 Hurni K, Hett C, Epprecht M, Messerli P and        Heinimann A 2013a A texture-based land cover classification for the delineation of a shifting cultivation        landscape in the Lao PDR using landscape metrics Remote Sens. 5 3377-96 Jiang N, Li P and Feng Z 2022 Remote        sensing of swidden agriculture in the tropics: a review Int.J. 
Appl.Earth Obs.Geoinf.112 102876 <rs                type="person">Kurien A J, Lele</rs> S and Nagendra H 2019 Farms or forests?Understanding and mapping        shifting cultivation using the case study of <rs type="person">West Garo Hills</rs>, <rs type="person">India Land 8 133 Leisz S J and Rasmussen M S 2012 Mapping</rs> fallow        lands in Vietnam's north-central mountains using yearly Landsat imagery and a land-cover succession model Int.J.        Remote Sens. 33 6281-303 Manivong V and <rs type="person">Cramb R 2020 From</rs> subsistence to commercial rice        production in Laos White Gold: The <rs type="funder">Commercialisation of Rice Farming in the Lower Mekong Basin (Palgrave Macmillan</rs>)        pp 103-19 Messerli P, Heinimann A and Epprecht M 2009 Finding homogeneity in heterogeneity-a new approach to        quantifying landscape mosaics developed for the <rs type="institution">Lao PDR Hum</rs>.Ecol.37 291-304        Molinario G, Hansen M C and Potapov P V 2015 Forest cover dynamics of shifting cultivation in the Democratic        Republic of Congo: a remote sensing-based assessment for 2000-2010 Environ.Res.Lett. 10 094009 Olofsson P, <rs                type="person">Foody G M</rs>, <rs type="person">Herold M, Stehman S V</rs>, Woodcock C E and Wulder M A        2014 Good practices for estimating area and assessing accuracy of land change Remote Sens. Environ.148 42-57        Olofsson P, <rs type="person">Foody G M, Stehman S V</rs> and Woodcock C E 2013 Making better use of accuracy        data in land change studies: estimating accuracy and area and quantifying uncertainty using stratified        estimation Remote Sens. Environ.129 122-31 Potapov P et al 2019 Annual continuous fields of woody vegetation        structure in the Lower Mekong region from 2000-2017 Landsat time-series Remote Sens. 
Environ.232 111278 Rasul G        and Thapa G B 2003 Shifting cultivation in the mountains of South and Southeast Asia: regional patterns and        factors influencing the change <rs type="person">Land Degrad.Dev</rs>. <rs type="grantNumber">14 495-508</rs> Roder        W 2001 <rs type="projectName">Slash-And-Burn Rice Systems in The Hills of Northern Lao PDR: Description, Challenges, And Opportunities (International Rice Research Institute</rs>)        (available at:http://lad.nafri.org.la/fulltext/231-0.pdf)<rs type="projectName">Salinas-Melgoza M A, Skutsch M, Lovett J C and Borrego A 2017 Carbon emissions from dryland shifting cultivation: a case study of Mexican tropical dry forest Silva Fenn</rs>.<rs                type="grantNumber">51</rs> 1553</p>
+          </div>
+        </div>
+        <listOrg type="funding">
+          <org type="funding" xml:id="_JUak8EW">
+            <idno type="grant-number">80NSSC18K0315</idno>
+            <orgName type="program" subtype="full">Land-Cover and Land-Use Change Program</orgName>
+          </org>
+          <org type="funded-project" xml:id="_37TZA8z">
+            <idno type="grant-number">80NSSC20K0022</idno>
+            <orgName type="project" subtype="full">Gorelick N and Ilyushchenko S 2020 Highly local model calibration with a new GEDI LiDAR</orgName>
+          </org>
+          <org type="funded-project" xml:id="_dR9zdmy">
+            <idno type="grant-number">14 495-508</idno>
+            <orgName type="project" subtype="full">Slash-And-Burn Rice Systems in The Hills of Northern Lao PDR: Description, Challenges, And Opportunities (International Rice Research Institute</orgName>
+          </org>
+          <org type="funded-project" xml:id="_6cR6r98">
+            <idno type="grant-number">51</idno>
+            <orgName type="project" subtype="full">Salinas-Melgoza M A, Skutsch M, Lovett J C and Borrego A 2017 Carbon emissions from dryland shifting cultivation: a case study of Mexican tropical dry forest Silva Fenn</orgName>
+          </org>
+        </listOrg>
+        <div type="availability">
+          <div xmlns="http://www.tei-c.org/ns/1.0">
+            <head>Data availability statements</head>
+            <p>
+              <s>Google Earth Engine applications to visualize the datasets:<ref type="url"                                                                          target="https://github.com/shijuanchen/shift_cult">https://github.com/shijuanchen/shift_cult</ref> Map        products visualization:        <ref type="url" target="https://sites.google.com/view/shijuanchen/research/shift_cult">https://sites.google. com/view/shijuanchen/research/shift_cult</ref>
+            </s>
+          </p>
+          <p>
+            <s>The data that support the findings of this study are openly available at the following URL/DOI:<ref type="url"                                                                                                              target="https://doi.org/10.5281/zenodo.7782782">https:// doi.org/10.5281/zenodo.7782782</ref>.</s>
+          </p>
+        </div>
+      </div>
+      <div type="references">
+        <listBibl>
+          <biblStruct xml:id="b0">
+            <analytic>
+              <title level="a" type="main">Land use and land cover dynamics with special emphasis on shifting cultivation in Eastern Ghats Highlands of India using remote sensing data and GIS Environ</title>
+              <author>
+                <persName>
+                  <forename type="first">P</forename>
+                  <forename type="middle">P</forename>
+                  <surname>Adhikary</surname>
+                </persName>
+              </author>
+              <author>
+                <persName>
+                  <forename type="first">D</forename>
+                  <surname>Barman</surname>
+                </persName>
+              </author>
+              <author>
+                <persName>
+                  <forename type="first">M</forename>
+                  <surname>Madhu</surname>
+                </persName>
+              </author>
+              <author>
+                <persName>
+                  <forename type="first">C J</forename>
+                  <surname>Dash</surname>
+                </persName>
+              </author>
+              <author>
+                <persName>
+                  <forename type="first">P</forename>
+                  <surname>Jakhar</surname>
+                </persName>
+              </author>
+              <author>
+                <persName>
+                  <forename type="first">H</forename>
+                  <forename type="middle">C</forename>
+                  <surname>Hombegowda</surname>
+                </persName>
+              </author>
+              <author>
+                <persName>
+                  <forename type="first">B</forename>
+                  <forename type="middle">S</forename>
+                  <surname>Naik</surname>
+                </persName>
+              </author>
+              <author>
+                <persName>
+                  <forename type="first">D</forename>
+                  <surname>Sahoo</surname>
+                </persName>
+              </author>
+              <author>
+                <persName>
+                  <forename type="first">K</forename>
+                  <surname>Beer</surname>
+                </persName>
+              </author>
+              <idno type="DOI">10.1007/s10661-019-7447-7</idno>
+            </analytic>
+            <monogr>
+              <title level="j">Monit. Assess</title>
+              <imprint>
+                <biblScope unit="volume">191</biblScope>
+                <biblScope unit="page" from="1" to="15"/>
+                <date type="published" when="2019">2019</date>
+              </imprint>
+            </monogr>
+          </biblStruct>
+          <biblStruct xml:id="b1">
+            <monogr>
+              <author>
+                <persName>
+                  <forename type="first">J</forename>
+                  <surname>Beck</surname>
+                </persName>
+              </author>
+              <author>
+                <persName>
+                  <forename type="first">J</forename>
+                  <surname>Armston</surname>
+                </persName>
+              </author>
+              <author>
+                <persName>
+                  <forename type="first">M</forename>
+                  <surname>Hofton</surname>
+                </persName>
+              </author>
+              <author>
+                <persName>
+                  <forename type="first">S</forename>
+                  <surname>Luthcke</surname>
+                </persName>
+              </author>
+              <ptr target="https://daac.ornl.gov/GEDI/guides/GEDI_L4A_AGB_Density_V2_1"/>
+              <title level="m">Global Ecosystem Dynamics Investigation (GEDI) Level 02 User Guide</title>
+              <imprint>
+                <publisher>EROS Center, US Geological Survey</publisher>
+                <date type="published" when="2020">2020</date>
+              </imprint>
+            </monogr>
+          </biblStruct>
+          <biblStruct xml:id="b2">
+            <analytic>
+              <title level="a" type="main">Quantifying carbon stocks in shifting cultivation landscapes under divergent management scenarios relevant to</title>
+              <author>
+                <persName>
+                  <forename type="first">J</forename>
+                  <forename type="middle">R</forename>
+                  <surname>Borah</surname>
+                </persName>
+              </author>
+              <author>
+                <persName>
+                  <forename type="first">K</forename>
+                  <surname>Evans</surname>
+                </persName>
+              </author>
+              <author>
+                <persName>
+                  <forename type="first">D P</forename>
+                  <surname>Edwards</surname>
+                </persName>
+              </author>
+              <idno type="DOI">10.1002/eap.1764</idno>
+            </analytic>
+            <monogr>
+              <title level="j">REDD+ Ecol. Appl</title>
+              <imprint>
+                <biblScope unit="volume">28</biblScope>
+                <biblScope unit="page" from="1581" to="1593"/>
+                <date type="published" when="2018">2018</date>
+              </imprint>
+            </monogr>
+          </biblStruct>
+          <biblStruct xml:id="b3">
+            <monogr>
+              <title level="m" type="main">Monitoring tropical forest degradation using spectral unmixing and Landsat time series analysis Remote Sens</title>
+              <author>
+                <persName>
+                  <forename type="first">E</forename>
+                  <surname>Bullock</surname>
+                </persName>
+              </author>
+              <author>
+                <persName>
+                  <forename type="first">L</forename>
+                  <surname>Woodcock</surname>
+                </persName>
+              </author>
+              <author>
+                <persName>
+                  <forename type="first">C</forename>
+                  <surname>Olofsson</surname>
+                </persName>
+              </author>
+              <author>
+                <persName>
+                  <forename type="first">P</forename>
+                </persName>
+              </author>
+              <idno type="DOI">10.1016/j.rse.2018.11.011</idno>
+              <imprint>
+                <date type="published" when="2020">2020</date>
+                <publisher>Environ</publisher>
+                <biblScope unit="volume">238</biblScope>
+                <biblScope unit="page">110968</biblScope>
+              </imprint>
+            </monogr>
+          </biblStruct>
+          <biblStruct xml:id="b4">
+            <monogr>
+              <title level="m" type="main">Satellite-based monitoring, attribution, and analysis of forest degradation Doctoral Dissertation</title>
+              <author>
+                <persName>
+                  <forename type="first">S</forename>
+                  <surname>Chen</surname>
+                </persName>
+              </author>
+              <ptr target="https://open.bu.edu/handle/2144/46368"/>
+              <imprint>
+                <date type="published" when="2022">2022</date>
+              </imprint>
+            </monogr>
+          </biblStruct>
+          <biblStruct xml:id="b5">
+            <monogr>
+              <title level="m"               type="main">Monitoring shifting cultivation in Laos with Landsat time series Remote Sens</title>
+              <author>
+                <persName>
+                  <forename type="first">S</forename>
+                  <surname>Chen</surname>
+                </persName>
+              </author>
+              <author>
+                <persName>
+                  <forename type="first">P</forename>
+                  <surname>Olofsson</surname>
+                </persName>
+              </author>
+              <author>
+                <persName>
+                  <forename type="first">T</forename>
+                  <surname>Saphangthong</surname>
+                </persName>
+              </author>
+              <author>
+                <persName>
+                  <forename type="first">C E</forename>
+                  <surname>Woodcock</surname>
+                </persName>
+              </author>
+              <idno type="DOI">10.1016/j.rse.2023.113507</idno>
+              <imprint>
+                <date type="published" when="2023">2023</date>
+                <publisher>Environ</publisher>
+                <biblScope unit="volume">288</biblScope>
+                <biblScope unit="page">113507</biblScope>
+              </imprint>
+            </monogr>
+          </biblStruct>
+          <biblStruct xml:id="b6">
+            <monogr>
+              <title level="m" type="main">Monitoring temperate forest degradation on Google Earth Engine using Landsat time series analysis Remote Sens</title>
+              <author>
+                <persName>
+                  <forename type="first">S</forename>
+                  <surname>Chen</surname>
+                </persName>
+              </author>
+              <author>
+                <persName>
+                  <forename type="first">C</forename>
+                  <forename type="middle">E</forename>
+                  <surname>Woodcock</surname>
+                </persName>
+              </author>
+              <author>
+                <persName>
+                  <forename type="first">E L</forename>
+                  <surname>Bullock</surname>
+                </persName>
+              </author>
+              <author>
+                <persName>
+                  <forename type="first">P</forename>
+                  <surname>Arévalo</surname>
+                </persName>
+              </author>
+              <author>
+                <persName>
+                  <forename type="first">P</forename>
+                  <surname>Torchinava</surname>
+                </persName>
+              </author>
+              <author>
+                <persName>
+                  <forename type="first">S</forename>
+                  <surname>Peng</surname>
+                </persName>
+              </author>
+              <author>
+                <persName>
+                  <forename type="first">P</forename>
+                  <surname>Olofsson</surname>
+                </persName>
+              </author>
+              <idno type="DOI">10.1016/j.rse.2021.112648</idno>
+              <imprint>
+                <date type="published" when="2021">2021</date>
+                <publisher>Environ</publisher>
+                <biblScope unit="volume">265</biblScope>
+                <biblScope unit="page">112648</biblScope>
+              </imprint>
+            </monogr>
+          </biblStruct>
+          <biblStruct xml:id="b7">
+            <analytic>
+              <title/>
+              <author>
+                <persName>
+                  <forename type="first">P G</forename>
+                  <surname>Curtis</surname>
+                </persName>
+              </author>
+              <author>
+                <persName>
+                  <forename type="first">C</forename>
+                  <forename type="middle">M</forename>
+                  <surname>Slay</surname>
+                </persName>
+              </author>
+              <author>
+                <persName>
+                  <forename type="first">N L</forename>
+                  <surname>Harris</surname>
+                </persName>
+              </author>
+              <author>
+                <persName>
+                  <forename type="first">A</forename>
+                  <surname>Tyukavina</surname>
+                </persName>
+              </author>
+              <author>
+                <persName>
+                  <forename type="first">M C</forename>
+                  <surname>Hansen</surname>
+                </persName>
+              </author>
+              <idno type="DOI">10.1126/science.aau3445</idno>
+            </analytic>
+            <monogr>
+              <title level="j">Classifying drivers of global forest loss Science</title>
+              <imprint>
+                <biblScope unit="volume">361</biblScope>
+                <biblScope unit="page" from="1108" to="1111"/>
+                <date type="published" when="2018">2018</date>
+              </imprint>
+            </monogr>
+          </biblStruct>
+          <biblStruct xml:id="b8">
+            <monogr>
+              <title level="m" type="main">Department of Forestry, Ministry of Agriculture and Forestry, Lao PDR 2018 Lao PDR's forest reference emission level and forest reference level for REDD+ results payment under the UNFCCC</title>
+              <ptr target="https://redd.unfccc.int/files/2018_frel_submission_laopdr.pdf"/>
+              <imprint/>
+            </monogr>
+          </biblStruct>
+          <biblStruct xml:id="b9">
+            <monogr>
+              <title level="m" type="main">Lao People's Democratic Republic 1st national REDD+ results report for REDD+ results-based-payment under the UNFCCC</title>
+              <ptr target="http://dof.maf.gov.la/download/1st_national_redd_results_and_supporting_docs/LaoPDR_REDD-resultsUNFCCC_20200720_combined.pdf"/>
+              <imprint>
+                <date type="published" when="2020">2020</date>
+              </imprint>
+              <respStmt>
+                <orgName>Department of Forestry, Ministry of Agriculture and Forestry, Lao PDR</orgName>
+              </respStmt>
+            </monogr>
+          </biblStruct>
+          <biblStruct xml:id="b10">
+            <analytic>
+              <title level="a" type="main">Food security of shifting cultivation systems: case studies from Luang Prabang and Oudomxay Provinces</title>
+              <author>
+                <persName>
+                  <forename type="first">L</forename>
+                  <surname>Douangsavanh</surname>
+                </persName>
+              </author>
+              <author>
+                <persName>
+                  <forename type="first">A</forename>
+                  <surname>Polthanee</surname>
+                </persName>
+              </author>
+              <author>
+                <persName>
+                  <forename type="first">R</forename>
+                  <surname>Katawatin</surname>
+                </persName>
+              </author>
+              <idno type="DOI">10.1007/s11629-006-0048-2.pdf</idno>
+            </analytic>
+            <monogr>
+              <title level="j">Lao PDR J. Mt. Sci</title>
+              <imprint>
+                <biblScope unit="volume">3</biblScope>
+                <biblScope unit="page" from="48" to="57"/>
+                <date type="published" when="2006">2006</date>
+              </imprint>
+            </monogr>
+          </biblStruct>
+          <biblStruct xml:id="b11">
+            <monogr>
+              <author>
+                <persName>
+                  <forename type="first">R</forename>
+                  <surname>Dubayah</surname>
+                </persName>
+              </author>
+              <idno type="DOI">10.3334/ORNLDAAC/2056</idno>
+              <ptr target="https://doi.org/10.3334/ORNLDAAC/2056"/>
+              <title level="m">GEDI L4A Footprint Level Aboveground Biomass Density, Version</title>
+              <meeting>
+                <address>
+                  <addrLine>Oak Ridge, Tennessee, USA</addrLine>
+                </address>
+              </meeting>
+              <imprint>
+                <date type="published" when="2022">2022</date>
+              </imprint>
+              <respStmt>
+                <orgName>ORNL DAAC</orgName>
+              </respStmt>
+            </monogr>
+          </biblStruct>
+          <biblStruct xml:id="b12">
+            <analytic>
+              <title level="a" type="main">Atlas of agriculture in the Lao PDR: patterns and trends between</title>
+              <author>
+                <persName>
+                  <forename type="first">M</forename>
+                  <surname>Epprecht</surname>
+                </persName>
+              </author>
+            </analytic>
+            <monogr>
+              <title level="m">and Ministry of Agriculture and Forestry</title>
+              <imprint>
+                <date type="published" when="1999">2018. 1999. 2011</date>
+              </imprint>
+              <respStmt>
+                <orgName>Centre for Development and Environment (CDE)University of Bern, Switzerland</orgName>
+              </respStmt>
+            </monogr>
+            <note>with Bern Open Publishing (BOP)) p 70 (available at: www.decide.k4d.la/files/en/5%20Crops%20-%20annual.pdf</note>
+          </biblStruct>
+          <biblStruct xml:id="b13">
+            <analytic>
+              <title level="a" type="main">Vegetation and ecosystem carbon recovery following shifting cultivation in Mizoram-Manipur-Kachin rainforest eco-region</title>
+              <author>
+                <persName>
+                  <forename type="first">A</forename>
+                  <surname>Gogoi</surname>
+                </persName>
+              </author>
+              <author>
+                <persName>
+                  <forename type="first">U</forename>
+                  <surname>Sahoo</surname>
+                </persName>
+              </author>
+              <author>
+                <persName>
+                  <forename type="first">H</forename>
+                  <surname>Saikia</surname>
+                </persName>
+              </author>
+              <idno type="DOI">10.1186/s13717-020-00225-w</idno>
+            </analytic>
+            <monogr>
+              <title level="j">Southern Asia Ecol. Process</title>
+              <imprint>
+                <biblScope unit="volume">9</biblScope>
+                <biblScope unit="page" from="1" to="13"/>
+                <date type="published" when="2020">2020</date>
+              </imprint>
+            </monogr>
+          </biblStruct>
+          <biblStruct xml:id="b14">
+            <monogr>
+              <title/>
+              <author>
+                <persName>
+                  <forename type="first">M</forename>
+                  <surname>Hansen</surname>
+                </persName>
+              </author>
+              <author>
+                <persName>
+                  <forename type="first">C</forename>
+                  <surname>Potapov</surname>
+                </persName>
+              </author>
+              <author>
+                <persName>
+                  <forename type="first">P</forename>
+                  <forename type="middle">V</forename>
+                  <surname>Moore</surname>
+                </persName>
+              </author>
+              <author>
+                <persName>
+                  <forename type="first">R</forename>
+                  <surname>Hancher</surname>
+                </persName>
+              </author>
+              <author>
+                <persName>
+                  <forename type="first">M</forename>
+                </persName>
+              </author>
+              <imprint/>
+            </monogr>
+          </biblStruct>
+        </listBibl>
+      </div>
+    </back>
+  </text>
+</TEI>
diff --git a/src/test/resources/org/pub2tei/document/document1.tei.xml b/src/test/resources/org/pub2tei/document/document1.tei.xml
new file mode 100644
index 0000000..f116e5c
--- /dev/null
+++ b/src/test/resources/org/pub2tei/document/document1.tei.xml
@@ -0,0 +1,540 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd" xmlns:xlink="http://www.w3.org/1999/xlink">
+	<teiHeader xml:lang="en">
+		<fileDesc>
+			<titleStmt>
+				<title level="a" type="main">Satellite data reveals a recent increase in shifting cultivation and associated carbon emissions in Laos</title>
+				<funder ref="#_H9GH7GZ">
+					<orgName type="full">unknown</orgName>
+				</funder>
+				<funder ref="#_Em9my3H">
+					<orgName type="full">Commercialisation of Rice Farming in the Lower Mekong Basin (Palgrave Macmillan</orgName>
+				</funder>
+				<funder ref="#_Ya8bhFM">
+					<orgName type="full">NASA</orgName>
+				</funder>
+				<funder ref="#_aAjTpRR">
+					<orgName type="full">NASA Carbon Monitoring System</orgName>
+				</funder>
+			</titleStmt>
+			<publicationStmt>
+				<publisher/>
+				<availability status="unknown">
+					<licence/>
+				</availability>
+				<date type="published" when="2023-10-13">13 October 2023</date>
+			</publicationStmt>
+			<sourceDesc>
+				<biblStruct>
+					<analytic>
+						<author role="corresp">
+							<persName><forename type="first">Shijuan</forename><surname>Chen</surname></persName>
+							<email>shijuan.chen@yale.edu</email>
+							<affiliation key="aff0">
+								<orgName type="institution" key="instit1">Yale School of the Environment</orgName>
+								<orgName type="institution" key="instit2">Yale University</orgName>
+								<address>
+									<settlement>New Haven</settlement>
+									<region>CT</region>
+									<country key="US">United States of America</country>
+								</address>
+							</affiliation>
+							<affiliation key="aff1">
+								<orgName type="department">Department of Earth and Environment</orgName>
+								<orgName type="institution">Boston University</orgName>
+								<address>
+									<settlement>Boston</settlement>
+									<region>MA</region>
+									<country key="US">United States of America</country>
+								</address>
+							</affiliation>
+						</author>
+						<author>
+							<persName><forename type="first">Curtis</forename><forename type="middle">E</forename><surname>Woodcock</surname></persName>
+							<affiliation key="aff1">
+								<orgName type="department">Department of Earth and Environment</orgName>
+								<orgName type="institution">Boston University</orgName>
+								<address>
+									<settlement>Boston</settlement>
+									<region>MA</region>
+									<country key="US">United States of America</country>
+								</address>
+							</affiliation>
+						</author>
+						<author>
+							<persName><forename type="first">Thatheva</forename><surname>Saphangthong</surname></persName>
+							<affiliation key="aff2">
+								<orgName type="department" key="dep1">Department of Agriculture Land Management</orgName>
+								<orgName type="department" key="dep2">Ministry of Agriculture and Forestry</orgName>
+								<address>
+									<settlement>Vientiane</settlement>
+									<country>Laos</country>
+								</address>
+							</affiliation>
+						</author>
+						<author>
+							<persName><forename type="first">Pontus</forename><surname>Olofsson</surname></persName>
+							<affiliation key="aff1">
+								<orgName type="department">Department of Earth and Environment</orgName>
+								<orgName type="institution">Boston University</orgName>
+								<address>
+									<settlement>Boston</settlement>
+									<region>MA</region>
+									<country key="US">United States of America</country>
+								</address>
+							</affiliation>
+							<affiliation key="aff3">
+								<orgName type="institution">NASA Marshall Space Flight Center</orgName>
+								<address>
+									<settlement>Huntsville</settlement>
+									<region>AL</region>
+									<country key="US">United States of America</country>
+								</address>
+							</affiliation>
+						</author>
+						<title level="a" type="main">Satellite data reveals a recent increase in shifting cultivation and associated carbon emissions in Laos</title>
+					</analytic>
+					<monogr>
+						<imprint>
+							<date type="published" when="2023-10-13">13 October 2023</date>
+						</imprint>
+					</monogr>
+					<idno type="MD5">17112CCE7BFA5F63FB9BFE897A9E1A85</idno>
+					<idno type="DOI">10.1088/1748-9326/acffdd</idno>
+				</biblStruct>
+			</sourceDesc>
+		</fileDesc>
+		<encodingDesc>
+			<appInfo>
+				<application version="project.version" ident="GROBID" when="2024-04-26T10:06+0000">
+					<desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
+					<ref target="https://github.com/kermitt2/grobid"/>
+				</application>
+			</appInfo>
+		</encodingDesc>
+		<profileDesc>
+			<textClass>
+				<keywords>
+					<term>shifting cultivation</term>
+					<term>shifting agriculture</term>
+					<term>slash and burn</term>
+					<term>swidden agriculture</term>
+					<term>forest degradation</term>
+					<term>carbon emissions</term>
+					<term>deforestation</term>
+				</keywords>
+			</textClass>
+			<abstract>
+<div xmlns="http://www.tei-c.org/ns/1.0"><p>Although shifting cultivation is the major land use type in Laos, the spatial-temporal patterns and the associated carbon emissions of shifting cultivation in Laos are largely unknown. This study provides a nationwide analysis of the spatial-temporal patterns of shifting cultivation and estimations of the associated carbon emissions in Laos over the last three decades. This study found that shifting cultivation has been expanding and intensifying in Laos, especially in the last 5 years. The newly cultivated land from 2016 to 2020 accounted for 4.5% (±1.2%) of the total land area of Laos. Furthermore, the length of fallow periods has been continuously declining, indicating that shifting cultivation is becoming increasingly intensive. Combining biomass derived from Global Ecosystem Dynamics Investigation and shifting cultivation maps and area estimates, we found that the net carbon emissions from shifting cultivation declined in 2001-2015 but increased in 2016-2020. The largest carbon source is conversion from intact forests to shifting cultivation, which contributed to 89% of the total emissions from 2001 to 2020. In addition, there were increased emissions from intensified use of fallow lands. This research provides useful information for policymakers in Laos to understand the changes in shifting cultivation and improve land use management. This study not only supports Reducing Emissions from Deforestation and Forest Degradation reporting for Laos but also provides a methodology for tracking carbon emissions and removals of shifting cultivation.</p></div>
+			</abstract>
+		</profileDesc>
+	</teiHeader>
+	<text xml:lang="en">
+		<body>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1.">Introduction</head><p>Shifting cultivation is an agricultural practice where farmers routinely move from one plot to another for cultivation. It begins with the practice of 'slash-andburn' , where trees and woody plants are cut down and burnt to prepare an ash-fertilized plot for temporary cultivation. After short-term cultivation, the plot is abandoned, which allows the vegetation to recover. Shifting cultivation is the predominant land use and a major cause of forest degradation and deforestation in some tropical countries <ref type="bibr">(Heinimann et al 2017</ref><ref type="bibr" target="#b7">, Curtis et al 2018</ref><ref type="bibr">, Jiang et al 2022)</ref>, such as Laos <ref type="bibr" target="#b5">(Chen et al 2023)</ref>, and the Democratic Republic of Congo <ref type="bibr">(Molinario et al 2015)</ref>. Monitoring shifting cultivation is complicated, because it is highly dynamic, and the area affected by each slash-and-burn event is small. Due to the difficulty of monitoring shifting cultivation, spatially and temporally explicit information on shifting cultivation is scarce.</p><p>Shifting cultivation has both short-term and long-term effects on carbon emissions <ref type="bibr">(Ziegler et al 2012)</ref>. In the short term, the slash-and-burn activities cause immediate release of carbon. In the long term, encroachment of shifting cultivation into primary forest and intensified use of secondary forest both lead to long-term increases in net carbon emissions and degradation of ecosystems. Carbon emissions from shifting cultivation have not been well quantified, because of the lack of methodology for monitoring shifting cultivation and tracking the associated carbon dynamics. 
In contrast to deforestation (such as urbanization), which does not involve carbon sequestration, shifting cultivation involves both carbon emissions associated with slash-and-burn activities and carbon sequestration during the fallow period. Due to the complexity of monitoring shifting cultivation and tracking the associated carbon dynamics, estimates of carbon emissions or sequestration from shifting cultivation are usually unavailable in REDD+ (Reducing Emissions from Deforestation and Forest Degradation) reporting.</p><p>In Laos, officially the Lao People's Democratic Republic (Lao PDR), shifting cultivation is an important agricultural system <ref type="bibr">(Roder 2001</ref><ref type="bibr" target="#b10">, Douangsavanh et al 2006</ref><ref type="bibr" target="#b12">, Epprecht et al 2018</ref><ref type="bibr">, Manivong and Cramb 2020)</ref> and the major driver of forest dynamics <ref type="bibr" target="#b7">(Curtis et al 2018</ref><ref type="bibr" target="#b5">, Chen et al 2023)</ref>. It is estimated that shifting cultivation affected 32.9 ± 1.9% of Laos from 1991 to 2020, and the shifting cultivation activities increased in the most recent 5 years <ref type="bibr" target="#b5">(Chen et al 2023)</ref>. Laos' population has been increasing steadily from 4.314 million in 1990 to 7.319 million in 2020 (World Bank 2023), whereas upland rice yields did not distinctly improve between 1990 and 2020. Shifting cultivation activities are expected to increase due to the increasing demand for rice. Monitoring shifting cultivation and analyzing its patterns are important to understand the forest cover change in Laos and relevant to achieving Laos' goal of increasing forest cover to 70% (The current forest cover is 62%) (The Government of Lao PDR 2005). 
Since there were few spatially and temporally explicit maps and estimates of shifting cultivation before <ref type="bibr" target="#b5">Chen et al (2023)</ref>, carbon emissions from shifting cultivation have not been accurately estimated in the REDD+ reporting of Laos (Department of Forestry, Ministry of Agriculture and Forestry, Lao PDR 2018).</p><p>Spatially and temporally explicit information about shifting cultivation in Laos was unavailable until recently <ref type="bibr" target="#b5">(Chen et al 2023)</ref>, and a comprehensive national-scale analysis of the spatial and temporal patterns of shifting cultivation has not been conducted to date. A traditional approach for mapping shifting cultivation is to create landscape mosaics based on a land cover map of a single year <ref type="bibr">(Messerli et al 2009</ref><ref type="bibr">, Silva et al 2011</ref><ref type="bibr">, Hett et al 2012</ref><ref type="bibr">, Hurni et al 2013a)</ref>. It is impossible to analyze the temporal patterns of shifting cultivation using this traditional approach. Another approach is to use multi-temporal land cover data to map shifting cultivation <ref type="bibr">(Leisz and Rasmussen 2012</ref><ref type="bibr">, Molinario et al 2015</ref><ref type="bibr">, Department of Forestry, Ministry of Agriculture and Forestry, Lao PDR 2018</ref><ref type="bibr" target="#b0">, Adhikary et al 2019</ref><ref type="bibr">, Kurien et al 2019)</ref>. In previous studies, the temporal resolution of the land cover maps was not high enough to support the analysis of temporal patterns <ref type="bibr">(Heinimann et al 2013)</ref>. Recently, <ref type="bibr" target="#b5">Chen et al (2023)</ref> used satellite data to create shifting cultivation products for Laos with sufficient temporal frequency (annual) and spatial resolution (30 m) to support a nationalscale spatial-temporal analysis. 
The recently launched GEDI (Global Ecosystem Dynamics Investigation) mission provides new opportunities for estimating biomass at a large scale <ref type="bibr">(Tang et al 2020)</ref>.</p><p>This study used the map products and reference data in <ref type="bibr" target="#b5">Chen et al (2023)</ref>, combined with GEDI, to conduct a national-scale analysis of the spatial and temporal patterns and carbon dynamics of shifting cultivation in Laos. The goal is to understand the spatial and temporal patterns of shifting cultivation and the associated carbon emissions, in support of decision-making to reduce carbon emissions and promote sustainable livelihoods depending on shifting cultivation.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.">Method</head></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.1.">Mapping shifting cultivation</head><p>Shifting cultivation was mapped using Landsat data from 1987 to 2020 on Google Earth Engine <ref type="bibr" target="#b5">(Chen et al 2023)</ref>. CCDC-SMA (continuous change detection and classification-spectral mixture analysis) (Zhu and Woodcock 2014, Bullock et al 2020, Chen et al 2021) was used to detect forest disturbances in Laos. CCDC-SMA fits harmonic models to fractions of endmembers and NDFI (Normalized Difference Fraction Index) <ref type="bibr">(Souza et al 2005)</ref> to monitor forest disturbances (figure <ref type="figure" target="#fig_0">1</ref>). Annual maps of Shifting Cultivation from 1991 to 2020 were created by combining time series analysis, object-based image analysis, and post-disturbed land-cover classification. A total of 1000 sample units under simple random sampling were used as reference data for accuracy assessment and area estimation. For each sample unit, at least two interpreters interpreted the land change class and the year of each slash-and-burn event by examining high-resolution satellite imagery and Landsat time series (figures 1(a), (c) and 2). During 1991-2020, shifting cultivation was the main type of forest disturbance in Laos, affecting 32.9 ± 1.9% of Laos <ref type="bibr" target="#b5">(Chen et al 2023)</ref>. Shifting cultivation was mapped with a producer's accuracy of 88% and a user's accuracy of 80% <ref type="bibr" target="#b5">(Chen et al 2023)</ref>. <ref type="bibr" target="#b5">Chen et al (2023)</ref> describes more details of the monitoring method. Both the map products and the reference data from <ref type="bibr" target="#b5">Chen et al (2023)</ref> were used in this study.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.2.">Spatial-temporal patterns of shifting cultivation</head><p>The annual maps of shifting cultivation  and the reference sample units interpreted as Shifting Cultivation were used to investigate the patterns of shifting cultivation. We estimated the area of shifting cultivation at different fallow and disturbance   55 ′ 27 ′′ E. In the time series plot, the blue points are the Landsat observations. In the Landsat images (Red-green-blue), the yellow squares show the pixel location. In the high-resolution image, the white point shows the pixel location.).</p><p>to explore whether the extent of shifting cultivation expanded, the newly and previously cultivated areas of shifting cultivation were estimated using reference sample units and maps in Chen et al (2023) for every 5 year period from 2000 to 2020. In the reference sample points, whether a pixel is newly or previously cultivated is determined by the year of slash-andburn recorded by the interpreters (e.g. figures 1(a), (c) and 2).</p><p>Furthermore, to investigate the change patterns in fallow length and cultivation length (length of cropping period), we visually interpreted Landsat time series, Landsat imagery, and high-resolution images for 196 sample points (figure <ref type="figure">3</ref> as an example). These sample points are the points with at least two cultivation events in the aforementioned reference data with 1000 simple random sample points. For each point, the year of slash and burn (land clearing), cultivation length, and fallow length are recorded for every event.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.3.">Carbon emission/removal</head><p>The GEDI mission provides space-borne LiDAR data to estimate aboveground biomass (Healey et al 2020). GEDI's L4A Footprint Level Aboveground Biomass Density (AGBD) (version 2.1) 25 m data <ref type="bibr" target="#b1">(Beck et al 2020</ref><ref type="bibr" target="#b11">, Dubayah et al 2022)</ref> were used to explore the effect of shifting cultivation on biomass. GEDI data collected in 2020 was used because it was the only year of data with good spatial coverage when the study was conducted. To overlay the GEDI footprint and Landsat, for each GEDI footprint, we extracted the value of the 30 m pixel in the Landsat-based map that has the largest overlap with the 25 m footprint. Only lidar observations with good quality (using the 'quality_flag' band and the 'degrade_flag' band) and collected at places with a slope less than 20 • and in the interior of shifting cultivation sites (excluding a two-pixel edge) were used, to eliminate the effect of terrain and possible misregistration at the edges of slash-and-burn events. The reason why we excluded lidar points with slopes larger than 20 • is that GEDIbased biomass estimates tend to be overestimated at steep terrain. AGBD was calculated for Active Shifting Cultivation, Inactive Shifting Cultivation, Intact Forest, and Others. Intact Forest here is defined as forests without significant anthropogenic disturbances. The relationship between AGBD and years of regrowth since the latest slash-and-burn events was analyzed. The hypothesis was that AGBD has a positive relationship with years of regrowth since the latest slash-andburn activity. From this relationship, a country-level growth curve of AGBD can be developed and used to estimate the biomass of fallow lands.</p><p>Carbon emissions from shifting cultivation were estimated for every 5 year period from 2001 to 2020. 
Table <ref type="table" target="#tab_1">1</ref> shows the activity classes, definitions, and emission factors. New Shifting Cultivation area was estimated from a sampling-based method The emission factors for activities other than New Shifting Cultivation are spatially explicit and were determined by the map of the latest year of slash and burn and the growth curve. Figure <ref type="figure" target="#fig_7">8</ref> shows an example of the spatially explicit emission factors for different activities. Specifically, this was how the carbon emissions and removals of Fallow land -&gt; Fallow land, Fallow land -&gt; Cleared land, and Cleared land -&gt; Fallow land were calculated: The latest year of disturbance of Fallow land was determined using the annual shifting cultivation maps. Then, the AGBD of fallow lands was calculated using equation (1). Using AGBD of fallow land in the end year minus AGBD in the start year of each period, the differences in AGBD were obtained. Multiply the differences in AGBD by the area of different activities and then multiply it by the conversion factor (0.5), and the carbon emissions and removals of each activity were calculated. The average emission/removal factors were calculated using the emissions and removals divided by the total area of activities in different categories.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.">Results</head></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1.">Spatial-temporal patterns of shifting cultivation</head><p>A large proportion of the land used for shifting cultivation in Laos remains in use. During our study period, the estimated area of Active Shifting Cultivation (19.1 ± 1.6%) exceeded the area of Inactive Shifting Cultivation (13.7 ± 1.8%). In the future, there is a possibility of reusing Inactive Shifting Cultivation and further increasing the area of Active Shifting Cultivation, given the increasing demand for crops. New Shifting Cultivation, defined as shifting cultivation that first occurred in each period, was estimated from 2001 to 2020 by period (figure <ref type="figure" target="#fig_2">4</ref>). The area estimates were aggregated into 5 year periods instead of calculating annual to reduce uncertainties of the area estimates. From 1991 to 2000, it is difficult to tell whether the shifting cultivation areas were new or old, and thus this analysis started in 2001. In all 5 year periods, the area of New Shifting Cultivation is higher than 3% of Laos, implying that on average, over 0.6% of Laos' land area is converted from intact forest to shifting cultivation each year. Our results indicate that the extent of shifting cultivation has been expanding.</p><p>During 2001-2015, there was a decrease in the area of New Shifting Cultivation. However, both the area of New Shifting Cultivation and the total area of Shifting Cultivation have increased significantly in 2016-2020. The proportion of previously and newly cultivated to the total area of shifting cultivation was calculated for every year using the annual maps (figure <ref type="figure" target="#fig_3">5</ref>). Before 2007, the newly cultivated  areas were larger than the previously cultivated, and the trend reversed after 2007. There was a general decreasing trend in the proportion of New Shifting Cultivation, but increases were observed in 2019 and 2020. 
We suppose that the general decreasing trend is because intact forests available for cultivation decreased over time and previously cultivated land is easier to clear for future cultivation.</p><p>Based on the sample interpretation results, most cultivation lengths are either one year or two years. Although there are variations across the years, we have not seen major changes in average cultivation length (figures S1 and S2). The mean length of the fallow periods of shifting cultivation in Laos is 6.5 years, which is close to the length of fallow periods reported in the literature (7 years) (Department of Forestry, Ministry of Agriculture and Forestry, Lao PDR 2018). The fallow length has been continuously declining (figure <ref type="figure" target="#fig_4">6</ref>). The reduction in the length of fallow periods indicates that shifting cultivation has intensified.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.2.">Growth curve of fallow lands</head><p>The AGBD was lower in shifting cultivation regions than in the intact forests. The median AGBD of Intact Forest, Inactive Shifting Cultivation, Active Shifting Cultivation, and Others are 151.9 Mg ha -1 , 87.9 Mg ha -1 , 39.5 Mg ha -1 , and 22.8 Mg ha -1 , respectively. The biomass of Inactive Shifting Cultivation only reached about 60% of that of the intact forest. In the literature (Department of Forestry, Ministry of Agriculture and Forestry, Lao PDR 2018), the regions of Inactive Shifting Cultivation were considered to be 'recovered' , whereas our results show that the AGBD is not recovered even if these regions have been left for fallow for at least seven years.  To investigate the relationship between median AGBD and disturbance history, a logarithmic regression was conducted on years of regrowth since the latest slash-and-burn events and median AGBD of GEDI footprints (figure <ref type="figure" target="#fig_5">7</ref>). The logarithmic model of years of regrowth (x) and AGBD (y) is (R square is 0.93): y = 29.129 ln (x) + 9.907</p><p>(1)</p><p>AGBD was strongly correlated with years of regrowth. Equation ( <ref type="formula">1</ref>) and the maps of years of regrowth were used to calculate the biomass of fallow lands and spatially explicit emission/removal factors (figure <ref type="figure" target="#fig_7">8</ref>).    in this period (figure <ref type="figure" target="#fig_3">5</ref>) and the decrease in carbon sink of fallow lands in this period. For every period, New Shifting Cultivation is the largest carbon source, contributing to more than 80% of the total emissions. From 2001 to 2020, New Shifting Cultivation contributed to 89% of the total emissions. Fallow lands are important carbon sinks and sequestered about 70% of the total emissions during 2006-2015. 
However, carbon sequestration of fallow lands also decreased in recent years because of the intensified use of fallow land. To summarize, the increase in emissions from shifting cultivation encroachment to intact forests (New Shifting Cultivation) and intensified use of secondary forests both led to the recent increase in net emissions from shifting cultivation.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.3.">Carbon emissions from shifting cultivation</head></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.">Discussion</head><p>In this study, the spatial-temporal patterns and the carbon dynamics of shifting cultivation in Laos were analyzed. The results showed that shifting cultivation has been expanding and intensifying. The area of shifting cultivation has increased significantly over the last 5 years. The fallow length has been declining continuously, which indicates the intensification of shifting cultivation. Our finding of a reduction of fallow length is consistent with previous local studies <ref type="bibr">(Rasul and Thapa 2003</ref><ref type="bibr">, Saphangthong and Kono 2009</ref><ref type="bibr">, van Vliet et al 2012)</ref>. We found that AGBD was strongly correlated with years of regrowth since the latest year of slash-and-burn activities, which can be   <ref type="formula">2019</ref>) is understandable since their major focus is forest loss instead of shifting cultivation. This comparison is not a criticism of the aforementioned studies. Instead, it highlights the benefits of using shifting cultivation maps and reference samples with better spatial resolution and high temporal frequency for the analysis of spatial-temporal patterns.</p><p>We compared our area estimates of New Shifting Cultivation with the official forest change statistics from Laos (table <ref type="table" target="#tab_1">S1</ref>). The Laos official forest change maps (https://nfms.maf.gov.la/) are created from the land cover classification maps from the start year and end year for each period (see the periods in table <ref type="table" target="#tab_1">S1</ref>). Since shifting cultivation is the major driver of forest degradation and deforestation in Laos, we expect that there are some consistencies between the areas of New Shifting cultivation and the areas of forest degradation and deforestation. 
There are consistencies in the period <ref type="bibr">2006-2010 and 2011-2015, with</ref> the differences between our estimates and the official statistics both less than 1% of Laos. Our estimates of New Shifting Cultivation are generally higher than the Laos official estimates of deforestation and forest degradation, except for 2006-2010. This was partly due to the different monitoring approaches. Without using dense time series, the shifting cultivation events that occurred over five years may be difficult to detect using two classification maps from the start and the end. In the period <ref type="bibr">2001-2005 and 2016-2020, our</ref> estimates are about 2%-3% higher than the official estimates. For 2016-2020, the discrepancy is partly because the 2019 and 2020 changes are included in our estimates but not in the official statistics. Overall, our results and area estimates provide valuable information regarding the forest dynamics of Laos.</p><p>Furthermore, we compared the shifting cultivation map with the field survey data in the Laos National Forest Monitoring System https://nfms. maf.gov.la/. The shifting cultivation map was compared with 39 field points identified as <ref type="bibr">'Regenerating Vegetation' or 'Upland crop' in 2010</ref><ref type="bibr" target="#b12">, 2011</ref><ref type="bibr">, 2012</ref><ref type="bibr" target="#b0">, or 2019</ref>, since these two land cover classes are generally considered to have an association with shifting cultivation practices (Department of Forestry 2020). The 31 out of 39 (80%) points are correctly mapped as shifting cultivation.</p><p>As a national-level analysis of spatial-temporal patterns and estimation of carbon dynamics of shifting cultivation in Laos, our research is valuable to sustainable land resource management. 
The sustainability of the land is negatively impacted by the recent expansion and intensification of shifting cultivation, indicated by an increase in newly cultivated areas in 2016-2020 and a reduction of fallow length in 1991-2020. Moreover, our research provides a quantitative analysis of carbon emissions of shifting cultivation, which is crucial for REDD+ reporting in Laos. Our research indicates that carbon emissions from shifting cultivation can be quantified by combining GEDI data with shifting cultivation maps and area estimates. The fallow land sequestrated a significant amount of carbon in the past, but this carbon sink declined in recent years. The recent increase in new shifting cultivation events also led to an increase in net carbon emissions. This highlights the importance of protecting the primary forest from the encroachment of new shifting cultivation and the restoration of old fallow lands.</p><p>Our study has several limitations and future research can make improvements by using more sophisticated models and integration with other data. The first limitation is the usage of GEDI data. Our research only used GEDI in one year (2020), because GEDI is a new mission and 2020 was the only year with good coverage data when the study was conducted. Future studies can use GEDI for multiple years as more data will be collected. In addition, we excluded GEDI points where the slope is larger than 20 • to avoid overestimation of biomass in steep terrain. This would introduce regional bias on the growth curve and emission factors. Based on our map, 69% of the shifting cultivation area is in places with slopes less than 20 • <ref type="bibr" target="#b4">(Chen 2022)</ref>. Future research should improve GEDI biomass estimates in steep terrain. Second, although we compared our map with some field survey data in Laos, the field data information for each location is limited. 
Future studies should collect more detailed information on shifting cultivation in field surveys, especially biomass in shifting cultivation landscapes (e.g. Salinas-Melgoza et al 2017, <ref type="bibr" target="#b2">Borah et al 2018</ref><ref type="bibr" target="#b13">, Gogoi et al 2020 )</ref>. Third, the carbon estimation only considered aboveground biomass change and no other carbon pools due to a lack of field survey data on those carbon pools. Future research can conduct field surveys on belowground biomass and include the belowground carbon pools in carbon emission estimation. Fourth, future research should investigate the causes of the recent increase in shifting cultivation, which requires field surveys.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="5.">Conclusion</head><p>Our research provides a national-level analysis of spatial-temporal patterns and estimation of carbon dynamics of shifting cultivation in Laos. Our analysis shows that shifting cultivation in Laos has been expanding and intensifying, particularly in the recent 5 years. The practice of shifting cultivation has become increasingly intensive as the length of the fallow periods has been continuously shortening. Combining GEDI data with shifting cultivation maps and area estimates, carbon emissions from shifting cultivation can be quantified. The net carbon emissions from shifting cultivation declined in the past but increased recently. This study not only supports REDD+ reporting for Laos but also demonstrates a method of tracking carbon dynamics in shifting cultivation landscapes.</p><p>USGS Landsat Science Team Program for Better Use of the Landsat Temporal Domain: Monitoring Land Cover Type, Condition and Change (Grant Number: G12PC00070). The authors are grateful to the editors and two anonymous reviewers for their insightful and constructive comments, which greatly helped to improve this paper. </p></div><figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_0"><head>Figure 1 .</head><label>1</label><figDesc>Figure 1. An example of active shifting cultivation in previously cultivated land (location: 20 • 7 ′ 13 ′′ N, 101 • 6 ′ 59 ′′ E). The shifting cultivation events in 2005 and 2018 were categorized as Previous Shifting Cultivation because shifting cultivation first occurred in 1991. This place is also Active Shifting Cultivation because the latest shifting cultivation event occurred in 2018. (a) Landsat time series. (b) CCDC-SMA model fits. Different colors show different segments and the model breaks in 1991, 2005, and 2018 show slash and burn events. The colored lines show the seasonality of the forest and the drops between lines show slash-and-burn events. 
(c) Landsat images and high-resolution images on Google Earth. In the Landsat images (red-green-blue), the yellow squares show the pixel location. In the high-resolution image, the white point shows the pixel location.</figDesc><graphic coords="3,129.32,72.89,414.04,288.65" type="bitmap"/></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_1"><head>Figure 2 .Figure 3 .</head><label>23</label><figDesc>Figure 2. An example of reference data (location: 20 • 15 ′ 8 ′′ N, 100• 39 ′ 51 ′′ E). This shifting cultivation is New Shifting Cultivation. The time series shows that no shifting cultivation occurred before 2019. The new shifting cultivation event occurred in 2019 and it can be verified by examining high-resolution images and Landsat images. (In the time series figure, the blue points are Landsat observations. In the Landsat images (red-green-blue), the yellow squares show the pixel location. In the high-resolution image, the white point shows the pixel location).</figDesc><graphic coords="3,129.32,467.97,414.04,194.32" type="bitmap"/></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_2"><head>Figure 4 .</head><label>4</label><figDesc>Figure 4. Area estimates and uncertainties of New Shifting Cultivation and total (new and previous) shifting cultivation by 5 year intervals.The y-axis is the area proportions of the total area of Laos (230 405 km 2 ). Any pixel that was newly cultivated at any time within a specified 5 year period would be counted and added to the total height of the corresponding pink bars.</figDesc><graphic coords="6,200.85,72.89,270.04,212.12" type="bitmap"/></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_3"><head>Figure 5 .</head><label>5</label><figDesc>Figure 5. Annual proportion of slash-and-burn areas in previously and newly cultivated regions.</figDesc><graphic coords="6,200.93,355.44,269.88,140.93" type="bitmap"/></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_4"><head>Figure 6 .</head><label>6</label><figDesc>Figure 6. Average fallow length by year calculated from sample interpretation.</figDesc><graphic coords="7,129.32,72.89,414.04,187.04" type="bitmap"/></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_5"><head>Figure 7 .</head><label>7</label><figDesc>Figure 7. Growth curve of aboveground biomass density.</figDesc><graphic coords="7,200.85,315.86,270.04,261.63" type="bitmap"/></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_6"><head/><label/><figDesc>Carbon emissions from shifting cultivation were estimated by period from 2001 to 2020 (tables 2-4, and figure 9). The net carbon emissions of shifting cultivation declined during 2001-2015, but significantly increased during 2016-2020. The decline in net carbon emissions during 2001-2015 is mostly because the area of new shifting cultivation decreased in this period. The increase during 2016-2020 is mostly due to the increase in new shifting cultivation activities</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_7"><head>Figure 8 .</head><label>8</label><figDesc>Figure 8. Spatially explicit emission (+)/removal (-) factors for different activities in a region (the background image is the high-resolution image): (a) fallow land -&gt; fallow land; (b) fallow land -&gt; cleared land; (c) cleared land -&gt; fallow land; (d) total of (a)-(c).</figDesc><graphic coords="8,200.85,72.89,270.04,254.35" type="bitmap"/></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_8"><head>Figure 9 .</head><label>9</label><figDesc>Figure 9. Carbon dynamics by period.</figDesc><graphic coords="9,200.85,247.38,270.04,154.19" type="bitmap"/></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0"><head/><label/><figDesc/><graphic coords="4,129.32,72.89,414.04,284.93" type="bitmap"/></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_1"><head>Table 1 .</head><label>1</label><figDesc>Activity classes, definitions, and carbon emission/removal factors for each 5 year period (CF: conversion factor to convert biomass to carbon equivalents, CF = 0.5).</figDesc><table><row><cell>Activity class</cell><cell>Definition</cell><cell>Emission/removal factors</cell></row><row><cell>Intact forest -&gt; shifting cultivation</cell><cell>No shifting cultivation before. Previous intact</cell><cell>Biomass of forest before</cell></row><row><cell>(New Shifting Cultivation)</cell><cell>forests began to be used for new shifting</cell><cell>new shifting</cell></row><row><cell/><cell>cultivation.</cell><cell>cultivation × CF (75.95</cell></row><row><cell/><cell/><cell>Mg C ha -1 )</cell></row><row><cell>Fallow land -&gt; fallow land</cell><cell>Shifting cultivation occurred before. The start</cell><cell>(Fallow land biomass in the</cell></row><row><cell/><cell>and end land cover were both fallow lands.</cell><cell>start -fallow land biomass</cell></row><row><cell/><cell/><cell>in the end) × CF</cell></row><row><cell>Fallow land -&gt; cleared land</cell><cell>In previously cultivated land, fallow land became</cell><cell>(Fallow land</cell></row><row><cell/><cell>cleared land.</cell><cell>biomass -cleared land</cell></row><row><cell/><cell/><cell>biomass) × CF</cell></row><row><cell>Cleared land -&gt; fallow land</cell><cell>In previously cultivated land, cleared land became</cell><cell>(Cleared land</cell></row><row><cell/><cell>fallow land.</cell><cell>biomass -fallow land</cell></row><row><cell/><cell/><cell>biomass) × CF</cell></row><row><cell>Cleared land -&gt; cleared land</cell><cell>In previously cultivated land, cleared land became</cell><cell/></row><row><cell/><cell>cleared land.</cell><cell/></row></table><note><p><p><p><p>Zero</p>and other activity classes in table 1 were estimated from the maps. 
This is because the samplingbased area estimates of New Shifting Cultivation adjusted errors in mapping and are more accurate than pixel-counting from the maps</p>(Olofsson  et al 2013(Olofsson  et al  , 2014))</p>. The area estimates of New Shifting Cultivation were calculated by 5-year periods with low uncertainty. For other activity classes, it is difficult to get area estimates from the reference data while including the dynamics of biomass of fallow land, and thus we used a spatially explicit method. In table 1, the biomass of the forest before disturbance was the biomass of Intact Forest estimated from GEDI. The biomass of fallow land was estimated from the growth curve developed from GEDI based on years since disturbance. Years since disturbance for each pixel was obtained from the annual maps of shifting cultivation. The cleared land biomass was estimated as the biomass of non-forest by the Department of Forestry (2020) based on field surveys. The emission factor of New Shifting Cultivation is 75.95 Mg C ha -1 . The emission factor of Cleared land -&gt; Cleared land is zero.</p></note></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_2"><head>Table 2 .</head><label>2</label><figDesc>Area of difference land use activities for each period (5 years).</figDesc><table><row><cell>Area (ha)</cell><cell>2001-2005</cell><cell>2006-2010</cell><cell>2011-2015</cell><cell>2016-2020</cell></row><row><cell>Fallow land -&gt; fallow land</cell><cell>2379 847</cell><cell>3809 008</cell><cell>5213 561</cell><cell>6009 880</cell></row><row><cell>Fallow land -&gt; cleared land</cell><cell>226 240</cell><cell>361 992</cell><cell>397 236</cell><cell>630 467</cell></row><row><cell>Cleared land -&gt; fallow land</cell><cell>441 757</cell><cell>768 342</cell><cell>748 692</cell><cell>696 501</cell></row><row><cell>New shifting cultivation</cell><cell>1198 106</cell><cell>806 418</cell><cell>714 256</cell><cell>1036 823</cell></row></table></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_3"><head>Table 3 .</head><label>3</label><figDesc>The country-average emissions or removal factors for each period (5 years). The original emission or removal factors except for new shifting cultivation are spatially explicit. This table shows the country averages of the spatial explicit emission or removal factors.</figDesc><table><row><cell>Average emission/removal factors (Mg C/ha -1 )</cell><cell>2001-2005</cell><cell>2006-2010</cell><cell>2011-2015</cell><cell>2016-2020</cell></row><row><cell>Fallow land -&gt; fallow land</cell><cell>-8.06</cell><cell>-7.57</cell><cell>-5.65</cell><cell>-1.56</cell></row><row><cell>Fallow land -&gt; cleared land</cell><cell>18.70</cell><cell>19.26</cell><cell>23.58</cell><cell>26.10</cell></row><row><cell>Cleared land -&gt; fallow land</cell><cell>-23.14</cell><cell>-24.28</cell><cell>-23.67</cell><cell>-21.36</cell></row><row><cell>New shifting cultivation</cell><cell>75.95</cell><cell>75.95</cell><cell>75.95</cell><cell>75.95</cell></row></table></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_4"><head>Table 4 .</head><label>4</label><figDesc>Carbon emissions (+) and removals (-) of different activities for each period (5 years).</figDesc><table><row><cell>Carbon emission/removal</cell><cell/><cell/><cell/><cell/></row><row><cell>(Mg C)</cell><cell>2001-2005</cell><cell>2006-2010</cell><cell>2011-2015</cell><cell>2016-2020</cell></row><row><cell>Fallow land -&gt; fallow</cell><cell>-19 175 009</cell><cell>-28 833 216</cell><cell>-29 440 602</cell><cell>-9348 118</cell></row><row><cell>land</cell><cell/><cell/><cell/><cell/></row><row><cell>Fallow land -&gt; cleared</cell><cell>4230 290</cell><cell>6970 956</cell><cell>9366 236</cell><cell>16 452 893</cell></row><row><cell>land</cell><cell/><cell/><cell/><cell/></row><row><cell>Cleared land -&gt; fallow</cell><cell>-10 222 046</cell><cell>-18 657 539</cell><cell>-17 717 827</cell><cell>-14 879 752</cell></row><row><cell>land</cell><cell/><cell/><cell/><cell/></row><row><cell>New shifting cultivation</cell><cell>90 996 151</cell><cell>61 247 409</cell><cell>54 247 705</cell><cell>78 746 669</cell></row><row><cell>Period total (net</cell><cell>65 829 387</cell><cell>20 727 610</cell><cell>16 455 512</cell><cell>70 971 692</cell></row><row><cell>emission/removal)</cell><cell/><cell/><cell/><cell/></row><row><cell>Annual average</cell><cell>13 165 877</cell><cell>4145 522</cell><cell>3291 103</cell><cell>14 194 339</cell></row></table></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_5"><head/><label/><figDesc>Saphangthong T and Kono Y 2009 Continuity and discontinuity in land use changes: a case study in Northern Lao villages J. Southeast Asian Stud. 47 263-86 Silva J M N, Carreiras J M B, Rosa I and Pereira J M C 2011 Greenhouse gas emissions from shifting cultivation in the tropics, including uncertainty and sensitivity analysis J. Geophys. Res. Atmos. 116 1-21 Souza C M, Roberts D A and Cochrane M A 2005 Combining spectral and spatial information to map canopy damage from selective logging and forest fires Remote Sens. Environ. 98 329-43 Tang X, Hutyra L R, Arévalo P, Baccini A, Woodcock C E and Olofsson P 2020 Spatiotemporal tracking of carbon emissions and uptake using time series analysis of Landsat data: a spatially explicit carbon bookkeeping model Sci. Total Environ. 720 137409 The Government of Lao PDR 2005 Forestry Strategy to the year 2020 of the Lao PDR (available at: https://faolex.fao.org/ docs/pdf/lao144178.pdf) van Vliet N et al 2012 Trends, drivers and impacts of changes in swidden cultivation in tropical forest-agriculture frontiers: a global assessment Glob. Environ. Change 22 418-29 World Bank 2023 World Bank open data (available at: https:// data.worldbank.org/) Zhu Z and Woodcock C E 2014 Continuous change detection and classification of land cover using all available Landsat data Remote Sens. Environ. 144 152-71 Ziegler A D et al 2012 Carbon outcomes of major land-cover transitions in SE Asia: great uncertainties and REDD+ policy implications Glob. Change Biol. 18 3087-99</figDesc><table/></figure>
+		</body>
+		<back>
+
+			<div type="acknowledgement">
+<div><head>Acknowledgments</head><p>This research was funded by the <rs type="funder">NASA</rs> <rs type="programName">Land-Cover and Land-Use Change Program</rs> (Grant Number: <rs type="grantNumber">80NSSC18K0315</rs>), the <rs type="funder">NASA Carbon Monitoring System</rs> (Grant Number: <rs type="grantNumber">80NSSC20K0022</rs>), and Turubanova S A A, Tyukavina A and Kommareddy A 2013 High-resolution global maps of 21st-century forest cover change Science 342 850-3 Healey S P, Yang Z, <rs type="projectName">Gorelick N and Ilyushchenko S 2020 Highly local model calibration with a new GEDI LiDAR</rs> asset on Google Earth Engine reduces Landsat forest height signal saturation Remote Sens. 12 2840 <rs type="affiliation">Heinimann A, Hett C, Hurni K, Messerli P</rs>, Epprecht M, Jørgensen L and Breu T 2013 Socio-economic perspectives on shifting cultivation landscapes in <rs type="affiliation">Northern Laos Hum. Ecol. 41 51-62 Heinimann A, Mertz O, Frolking S, Christensen A E, Hurni K, Sedano F, Chini L P, Sahajpal R, Hansen</rs> M and Hurtt G 2017 A global view of shifting cultivation: recent, current, and future extent PLoS One 12 1-22 Hett C, Castella J C, Heinimann A, Messerli P and Pfund J L 2012 A landscape mosaics approach for characterizing swidden systems from a REDD+ perspective Appl. Geogr. 32 608-18 Hurni K, Hett C, Epprecht M, Messerli P and Heinimann A 2013a A texture-based land cover classification for the delineation of a shifting cultivation landscape in the Lao PDR using landscape metrics Remote Sens. 5 3377-96 Jiang N, Li P and Feng Z 2022 Remote sensing of swidden agriculture in the tropics: a review Int. J. Appl. Earth Obs. Geoinf. 112 102876 <rs type="person">Kurien A J, Lele</rs> S and Nagendra H 2019 Farms or forests? 
Understanding and mapping shifting cultivation using the case study of <rs type="person">West Garo Hills</rs>, <rs type="person">India Land 8 133 Leisz S J and Rasmussen M S 2012 Mapping</rs> fallow lands in Vietnam's north-central mountains using yearly Landsat imagery and a land-cover succession model Int. J. Remote Sens. 33 6281-303 Manivong V and <rs type="person">Cramb R 2020 From</rs> subsistence to commercial rice production in Laos White Gold: The <rs type="funder">Commercialisation of Rice Farming in the Lower Mekong Basin (Palgrave Macmillan</rs>) pp 103-19 Messerli P, Heinimann A and Epprecht M 2009 Finding homogeneity in heterogeneity-a new approach to quantifying landscape mosaics developed for the <rs type="institution">Lao PDR Hum</rs>. Ecol. 37 291-304 Molinario G, Hansen M C and Potapov P V 2015 Forest cover dynamics of shifting cultivation in the Democratic Republic of Congo: a remote sensing-based assessment for 2000-2010 Environ. Res. Lett. 10 094009 Olofsson P, <rs type="person">Foody G M</rs>, <rs type="person">Herold M, Stehman S V</rs>, Woodcock C E and Wulder M A 2014 Good practices for estimating area and assessing accuracy of land change Remote Sens. Environ. 148 42-57 Olofsson P, <rs type="person">Foody G M, Stehman S V</rs> and Woodcock C E 2013 Making better use of accuracy data in land change studies: estimating accuracy and area and quantifying uncertainty using stratified estimation Remote Sens. Environ. 129 122-31 Potapov P et al 2019 Annual continuous fields of woody vegetation structure in the Lower Mekong region from 2000-2017 Landsat time-series Remote Sens. Environ. 232 111278 Rasul G and Thapa G B 2003 Shifting cultivation in the mountains of South and Southeast Asia: regional patterns and factors influencing the change <rs type="person">Land Degrad. Dev</rs>. 
<rs type="grantNumber">14 495-508</rs> Roder W 2001 <rs type="projectName">Slash-And-Burn Rice Systems in The Hills of Northern Lao PDR: Description, Challenges, And Opportunities (International Rice Research Institute</rs>) (available at: http://lad.nafri.org.la/fulltext/231-0.pdf) <rs type="projectName">Salinas-Melgoza M A, Skutsch M, Lovett J C and Borrego A 2017 Carbon emissions from dryland shifting cultivation: a case study of Mexican tropical dry forest Silva Fenn</rs>. <rs type="grantNumber">51</rs> 1553</p></div>
+			</div>
+			<listOrg type="funding">
+				<org type="funding" xml:id="_Ya8bhFM">
+					<idno type="grant-number">80NSSC18K0315</idno>
+					<orgName type="program" subtype="full">Land-Cover and Land-Use Change Program</orgName>
+				</org>
+				<org type="funded-project" xml:id="_aAjTpRR">
+					<idno type="grant-number">80NSSC20K0022</idno>
+					<orgName type="project" subtype="full">Gorelick N and Ilyushchenko S 2020 Highly local model calibration with a new GEDI LiDAR</orgName>
+				</org>
+				<org type="funded-project" xml:id="_Em9my3H">
+					<idno type="grant-number">14 495-508</idno>
+					<orgName type="project" subtype="full">Slash-And-Burn Rice Systems in The Hills of Northern Lao PDR: Description, Challenges, And Opportunities (International Rice Research Institute</orgName>
+				</org>
+				<org type="funded-project" xml:id="_H9GH7GZ">
+					<idno type="grant-number">51</idno>
+					<orgName type="project" subtype="full">Salinas-Melgoza M A, Skutsch M, Lovett J C and Borrego A 2017 Carbon emissions from dryland shifting cultivation: a case study of Mexican tropical dry forest Silva Fenn</orgName>
+				</org>
+			</listOrg>
+
+			<div type="availability">
+<div xmlns="http://www.tei-c.org/ns/1.0"><head>Data availability statements</head><p>Google Earth Engine applications to visualize the datasets: https://github.com/shijuanchen/shift_cult Map products visualization: https://sites.google. com/view/shijuanchen/research/shift_cult</p><p>The data that support the findings of this study are openly available at the following URL/DOI: https:// doi.org/10.5281/zenodo.7782782.</p></div>
+			</div>
+
+			<div type="references">
+
+				<listBibl>
+
+<biblStruct xml:id="b0">
+	<analytic>
+		<title level="a" type="main">Land use and land cover dynamics with special emphasis on shifting cultivation in Eastern Ghats Highlands of India using remote sensing data and GIS Environ</title>
+		<author>
+			<persName><forename type="first">P</forename><forename type="middle">P</forename><surname>Adhikary</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">D</forename><surname>Barman</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">M</forename><surname>Madhu</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">C J</forename><surname>Dash</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">P</forename><surname>Jakhar</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">H</forename><forename type="middle">C</forename><surname>Hombegowda</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">B</forename><forename type="middle">S</forename><surname>Naik</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">D</forename><surname>Sahoo</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">K</forename><surname>Beer</surname></persName>
+		</author>
+		<idno type="DOI">10.1007/s10661-019-7447-7</idno>
+	</analytic>
+	<monogr>
+		<title level="j">Monit. Assess</title>
+		<imprint>
+			<biblScope unit="volume">191</biblScope>
+			<biblScope unit="page" from="1" to="15"/>
+			<date type="published" when="2019">2019</date>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b1">
+	<monogr>
+		<author>
+			<persName><forename type="first">J</forename><surname>Beck</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">J</forename><surname>Armston</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">M</forename><surname>Hofton</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">S</forename><surname>Luthcke</surname></persName>
+		</author>
+		<ptr target="https://daac.ornl.gov/GEDI/guides/GEDI_L4A_AGB_Density_V2_1"/>
+		<title level="m">Global Ecosystem Dynamics Investigation (GEDI) Level 02 User Guide</title>
+		<imprint>
+			<publisher>EROS Center, US Geological Survey</publisher>
+			<date type="published" when="2020">2020</date>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b2">
+	<analytic>
+		<title level="a" type="main">Quantifying carbon stocks in shifting cultivation landscapes under divergent management scenarios relevant to</title>
+		<author>
+			<persName><forename type="first">J</forename><forename type="middle">R</forename><surname>Borah</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">K</forename><surname>Evans</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">D P</forename><surname>Edwards</surname></persName>
+		</author>
+		<idno type="DOI">10.1002/eap.1764</idno>
+	</analytic>
+	<monogr>
+		<title level="j">REDD+ Ecol. Appl</title>
+		<imprint>
+			<biblScope unit="volume">28</biblScope>
+			<biblScope unit="page" from="1581" to="1593"/>
+			<date type="published" when="2018">2018</date>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b3">
+	<monogr>
+		<title level="m" type="main">Monitoring tropical forest degradation using spectral unmixing and Landsat time series analysis Remote Sens</title>
+		<author>
+			<persName><forename type="first">E</forename><surname>Bullock</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">L</forename><surname>Woodcock</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">C</forename><surname>Olofsson</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">P</forename></persName>
+		</author>
+		<idno type="DOI">10.1016/j.rse.2018.11.011</idno>
+		<imprint>
+			<date type="published" when="2020">2020</date>
+			<publisher>Environ</publisher>
+			<biblScope unit="volume">238</biblScope>
+			<biblScope unit="page">110968</biblScope>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b4">
+	<monogr>
+		<title level="m" type="main">Satellite-based monitoring, attribution, and analysis of forest degradation Doctoral Dissertation</title>
+		<author>
+			<persName><forename type="first">S</forename><surname>Chen</surname></persName>
+		</author>
+		<ptr target="https://open.bu.edu/handle/2144/46368"/>
+		<imprint>
+			<date type="published" when="2022">2022</date>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b5">
+	<monogr>
+		<title level="m" type="main">Monitoring shifting cultivation in Laos with Landsat time series Remote Sens</title>
+		<author>
+			<persName><forename type="first">S</forename><surname>Chen</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">P</forename><surname>Olofsson</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">T</forename><surname>Saphangthong</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">C E</forename><surname>Woodcock</surname></persName>
+		</author>
+		<idno type="DOI">10.1016/j.rse.2023.113507</idno>
+		<imprint>
+			<date type="published" when="2023">2023</date>
+			<publisher>Environ</publisher>
+			<biblScope unit="volume">288</biblScope>
+			<biblScope unit="page">113507</biblScope>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b6">
+	<monogr>
+		<title level="m" type="main">Monitoring temperate forest degradation on Google Earth Engine using Landsat time series analysis Remote Sens</title>
+		<author>
+			<persName><forename type="first">S</forename><surname>Chen</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">C</forename><forename type="middle">E</forename><surname>Woodcock</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">E L</forename><surname>Bullock</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">P</forename><surname>Arévalo</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">P</forename><surname>Torchinava</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">S</forename><surname>Peng</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">P</forename><surname>Olofsson</surname></persName>
+		</author>
+		<idno type="DOI">10.1016/j.rse.2021.112648</idno>
+		<imprint>
+			<date type="published" when="2021">2021</date>
+			<publisher>Environ</publisher>
+			<biblScope unit="volume">265</biblScope>
+			<biblScope unit="page">112648</biblScope>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b7">
+	<analytic>
+		<title/>
+		<author>
+			<persName><forename type="first">P G</forename><surname>Curtis</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">C</forename><forename type="middle">M</forename><surname>Slay</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">N L</forename><surname>Harris</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">A</forename><surname>Tyukavina</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">M C</forename><surname>Hansen</surname></persName>
+		</author>
+		<idno type="DOI">10.1126/science.aau3445</idno>
+	</analytic>
+	<monogr>
+		<title level="j">Classifying drivers of global forest loss Science</title>
+		<imprint>
+			<biblScope unit="volume">361</biblScope>
+			<biblScope unit="page" from="1108" to="1111"/>
+			<date type="published" when="2018">2018</date>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b8">
+	<monogr>
+		<title level="m" type="main">Department of Forestry, Ministry of Agriculture and Forestry, Lao PDR 2018 Lao PDR's forest reference emission level and forest reference level for REDD+ results payment under the UNFCCC</title>
+		<ptr target="https://redd.unfccc.int/files/2018_frel_submission_laopdr.pdf"/>
+		<imprint/>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b9">
+	<monogr>
+		<title level="m" type="main">Lao People's Democratic Republic 1st national REDD+ results report for REDD+ results-based-payment under the UNFCCC</title>
+		<ptr target="http://dof.maf.gov.la/download/1st_national_redd_results_and_supporting_docs/LaoPDR_REDD-resultsUNFCCC_20200720_combined.pdf"/>
+		<imprint>
+			<date type="published" when="2020">2020</date>
+		</imprint>
+		<respStmt>
+			<orgName>Department of Forestry, Ministry of Agriculture and Forestry, Lao PDR</orgName>
+		</respStmt>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b10">
+	<analytic>
+		<title level="a" type="main">Food security of shifting cultivation systems: case studies from Luang Prabang and Oudomxay Provinces</title>
+		<author>
+			<persName><forename type="first">L</forename><surname>Douangsavanh</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">A</forename><surname>Polthanee</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">R</forename><surname>Katawatin</surname></persName>
+		</author>
+		<idno type="DOI">10.1007/s11629-006-0048-2.pdf</idno>
+	</analytic>
+	<monogr>
+		<title level="j">Lao PDR J. Mt. Sci</title>
+		<imprint>
+			<biblScope unit="volume">3</biblScope>
+			<biblScope unit="page" from="48" to="57"/>
+			<date type="published" when="2006">2006</date>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b11">
+	<monogr>
+		<author>
+			<persName><forename type="first">R</forename><surname>Dubayah</surname></persName>
+		</author>
+		<idno type="DOI">10.3334/ORNLDAAC/2056</idno>
+		<ptr target="https://doi.org/10.3334/ORNLDAAC/2056"/>
+		<title level="m">GEDI L4A Footprint Level Aboveground Biomass Density, Version</title>
+		<meeting><address><addrLine>Oak Ridge, Tennessee, USA</addrLine></address></meeting>
+		<imprint>
+			<date type="published" when="2022">2022</date>
+		</imprint>
+		<respStmt>
+			<orgName>ORNL DAAC</orgName>
+		</respStmt>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b12">
+	<analytic>
+		<title level="a" type="main">Atlas of agriculture in the Lao PDR: patterns and trends between</title>
+		<author>
+			<persName><forename type="first">M</forename><surname>Epprecht</surname></persName>
+		</author>
+	</analytic>
+	<monogr>
+		<title level="m">and Ministry of Agriculture and Forestry</title>
+		<imprint>
+			<date type="published" when="1999">2018. 1999. 2011</date>
+		</imprint>
+		<respStmt>
+			<orgName>Centre for Development and Environment (CDE)University of Bern, Switzerland</orgName>
+		</respStmt>
+	</monogr>
+	<note>with Bern Open Publishing (BOP)) p 70 (available at: www.decide.k4d.la/files/en/5%20Crops%20-%20annual.pdf</note>
+</biblStruct>
+
+<biblStruct xml:id="b13">
+	<analytic>
+		<title level="a" type="main">Vegetation and ecosystem carbon recovery following shifting cultivation in Mizoram-Manipur-Kachin rainforest eco-region</title>
+		<author>
+			<persName><forename type="first">A</forename><surname>Gogoi</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">U</forename><surname>Sahoo</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">H</forename><surname>Saikia</surname></persName>
+		</author>
+		<idno type="DOI">10.1186/s13717-020-00225-w</idno>
+	</analytic>
+	<monogr>
+		<title level="j">Southern Asia Ecol. Process</title>
+		<imprint>
+			<biblScope unit="volume">9</biblScope>
+			<biblScope unit="page" from="1" to="13"/>
+			<date type="published" when="2020">2020</date>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b14">
+	<monogr>
+		<title/>
+		<author>
+			<persName><forename type="first">M</forename><surname>Hansen</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">C</forename><surname>Potapov</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">P</forename><forename type="middle">V</forename><surname>Moore</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">R</forename><surname>Hancher</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">M</forename></persName>
+		</author>
+		<imprint/>
+	</monogr>
+</biblStruct>
+
+				</listBibl>
+			</div>
+		</back>
+	</text>
+</TEI>
\ No newline at end of file
diff --git a/src/test/resources/org/pub2tei/document/document2.segmented.tei.xml b/src/test/resources/org/pub2tei/document/document2.segmented.tei.xml
new file mode 100644
index 0000000..bc0e4b8
--- /dev/null
+++ b/src/test/resources/org/pub2tei/document/document2.segmented.tei.xml
@@ -0,0 +1,760 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd" xmlns:xlink="http://www.w3.org/1999/xlink">
+	<teiHeader xml:lang="en">
+		<fileDesc>
+			<titleStmt>
+				<title level="a" type="main">Science and Technology of Advanced Materials: Methods</title>
+				<funder>
+					<orgName type="full">MEXT</orgName>
+				</funder>
+				<funder ref="#_zeDNERp">
+					<orgName type="full">unknown</orgName>
+				</funder>
+				<funder>
+					<orgName type="full">Utilization-Type Material Research and Development Project (Digital Transformation Initiative Center for Magnetic Materials)</orgName>
+				</funder>
+			</titleStmt>
+			<publicationStmt>
+				<publisher/>
+				<availability status="unknown">
+					<licence/>
+				</availability>
+				<date type="published" when="2023-12-14">14 Dec 2023.</date>
+			</publicationStmt>
+			<sourceDesc>
+				<biblStruct>
+					<analytic>
+						<author>
+							<persName><forename type="first">Luca</forename><surname>Foppiano</surname></persName>
+							<idno type="ORCID">0000-0002-6114-6164</idno>
+							<affiliation key="aff0">
+								<orgName type="department">Materials Modelling Group</orgName>
+								<orgName type="laboratory">Centre for Basic Research on Materials</orgName>
+								<orgName type="institution" key="instit1">Data-driven Materials Research Field</orgName>
+								<orgName type="institution" key="instit2">NIMS</orgName>
+								<address>
+									<region>Tsukuba</region>
+									<country>Japan;</country>
+								</address>
+							</affiliation>
+							<affiliation key="aff1">
+								<orgName type="department" key="dep1">Knowledge and Data Engineering</orgName>
+								<orgName type="department" key="dep2">Centre for Computational Sciences</orgName>
+								<orgName type="institution">University of Tsukuba</orgName>
+								<address>
+									<settlement>Tsukuba</settlement>
+									<country>Japan;</country>
+								</address>
+							</affiliation>
+						</author>
+						<author>
+							<persName><forename type="first">Tomoya</forename><surname>Mato</surname></persName>
+							<idno type="ORCID">0000-0002-0918-6468</idno>
+							<affiliation key="aff0">
+								<orgName type="department">Materials Modelling Group</orgName>
+								<orgName type="laboratory">Centre for Basic Research on Materials</orgName>
+								<orgName type="institution" key="instit1">Data-driven Materials Research Field</orgName>
+								<orgName type="institution" key="instit2">NIMS</orgName>
+								<address>
+									<region>Tsukuba</region>
+									<country>Japan;</country>
+								</address>
+							</affiliation>
+						</author>
+						<author>
+							<persName><forename type="first">Kensei</forename><surname>Terashima</surname></persName>
+							<idno type="ORCID">0000-0003-0375-3043</idno>
+							<affiliation key="aff2">
+								<orgName type="department">Frontier Superconducting Materials Group</orgName>
+								<orgName type="institution" key="instit1">MANA</orgName>
+								<orgName type="institution" key="instit2">NIMS</orgName>
+								<address>
+									<region>Tsukuba</region>
+									<country>Japan;</country>
+								</address>
+							</affiliation>
+						</author>
+						<author>
+							<persName><forename type="first">Pedro</forename><surname>Ortiz Suarez</surname></persName>
+							<idno type="ORCID">0000-0003-0343-8852</idno>
+							<affiliation key="aff3">
+								<orgName type="department">GmbH DFKI</orgName>
+								<orgName type="institution">CONTACT Luca Foppiano</orgName>
+								<address>
+									<addrLine>Luca Foppiano http://orcid.org/0000-0002-6114-6164 Tomoya Mato http://orcid.org/0000-0002-0918-6468 Kensei Terashima http://orcid.org 3043 Pedro Ortiz Suarez http://orcid.org/0000-0003-0343- 8852 Wei-Sheng Wang http://orcid.org/0009-0001-3572-5736 Toshiyuki Amagasa http://orcid.org/0000-0003-0595- 2230 Yoshihiko Takano http://orcid.org/0000-0002-1541- 6928 Masashi Ishii</addrLine>
+									<postCode>0000-0003-0375</postCode>
+									<settlement>Berlin</settlement>
+									<region>DE</region>
+								</address>
+							</affiliation>
+						</author>
+						<author>
+							<persName><forename type="first">Taku</forename><surname>Tou</surname></persName>
+							<affiliation key="aff2">
+								<orgName type="department">Frontier Superconducting Materials Group</orgName>
+								<orgName type="institution" key="instit1">MANA</orgName>
+								<orgName type="institution" key="instit2">NIMS</orgName>
+								<address>
+									<region>Tsukuba</region>
+									<country>Japan;</country>
+								</address>
+							</affiliation>
+						</author>
+						<author>
+							<persName><forename type="first">Chikako</forename><surname>Sakai</surname></persName>
+							<affiliation key="aff2">
+								<orgName type="department">Frontier Superconducting Materials Group</orgName>
+								<orgName type="institution" key="instit1">MANA</orgName>
+								<orgName type="institution" key="instit2">NIMS</orgName>
+								<address>
+									<region>Tsukuba</region>
+									<country>Japan;</country>
+								</address>
+							</affiliation>
+						</author>
+						<author>
+							<persName><forename type="first">Wei-Sheng</forename><surname>Wang</surname></persName>
+							<idno type="ORCID">0009-0001-3572-5736</idno>
+							<affiliation key="aff2">
+								<orgName type="department">Frontier Superconducting Materials Group</orgName>
+								<orgName type="institution" key="instit1">MANA</orgName>
+								<orgName type="institution" key="instit2">NIMS</orgName>
+								<address>
+									<region>Tsukuba</region>
+									<country>Japan;</country>
+								</address>
+							</affiliation>
+						</author>
+						<author>
+							<persName><forename type="first">Toshiyuki</forename><surname>Amagasa</surname></persName>
+							<idno type="ORCID">0000-0003-0595-2230</idno>
+							<affiliation key="aff1">
+								<orgName type="department" key="dep1">Knowledge and Data Engineering</orgName>
+								<orgName type="department" key="dep2">Centre for Computational Sciences</orgName>
+								<orgName type="institution">University of Tsukuba</orgName>
+								<address>
+									<settlement>Tsukuba</settlement>
+									<country>Japan;</country>
+								</address>
+							</affiliation>
+						</author>
+						<author>
+							<persName><forename type="first">Yoshihiko</forename><surname>Takano</surname></persName>
+							<idno type="ORCID">0000-0002-1541-6928</idno>
+							<affiliation key="aff2">
+								<orgName type="department">Frontier Superconducting Materials Group</orgName>
+								<orgName type="institution" key="instit1">MANA</orgName>
+								<orgName type="institution" key="instit2">NIMS</orgName>
+								<address>
+									<region>Tsukuba</region>
+									<country>Japan;</country>
+								</address>
+							</affiliation>
+						</author>
+						<author role="corresp">
+							<persName><forename type="first">Masashi</forename><surname>Ishii</surname></persName>
+							<email>ishii.masashi@nims.go.jp</email>
+							<idno type="ORCID">0000-0003-0357-2832</idno>
+							<affiliation key="aff0">
+								<orgName type="department">Materials Modelling Group</orgName>
+								<orgName type="laboratory">Centre for Basic Research on Materials</orgName>
+								<orgName type="institution" key="instit1">Data-driven Materials Research Field</orgName>
+								<orgName type="institution" key="instit2">NIMS</orgName>
+								<address>
+									<region>Tsukuba</region>
+									<country>Japan;</country>
+								</address>
+							</affiliation>
+						</author>
+						<author>
+							<persName><surname>Masashi</surname></persName>
+						</author>
+						<title level="a" type="main">Science and Technology of Advanced Materials: Methods</title>
+					</analytic>
+					<monogr>
+						<idno type="ISSN">Print</idno>
+						<imprint>
+							<date type="published" when="2023-12-14">14 Dec 2023.</date>
+						</imprint>
+					</monogr>
+					<idno type="MD5">DCB0425EE18794E34CC3A3075E3E3975</idno>
+					<idno type="DOI">10.1080/27660400.2023.2286219</idno>
+					<note type="submission">Received 8 September 2023 Revised 9 November 2023 Accepted 16 November 2023</note>
+				</biblStruct>
+			</sourceDesc>
+		</fileDesc>
+		<encodingDesc>
+			<appInfo>
+				<application version="project.version" ident="GROBID" when="2024-04-26T11:25+0000">
+					<desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
+					<ref target="https://github.com/kermitt2/grobid"/>
+				</application>
+			</appInfo>
+		</encodingDesc>
+		<profileDesc>
+			<textClass>
+				<keywords>
+					<term>Materials informatics</term>
+					<term>superconductors</term>
+					<term>machine learning</term>
+					<term>database</term>
+					<term>TDM</term>
+				</keywords>
+			</textClass>
+			<abstract>
+<div xmlns="http://www.tei-c.org/ns/1.0"><p><s>We propose a semi-automatic staging area for efficiently building an accurate database of experimental physical properties of superconductors from literature, called SuperCon 2 , to enrich the existing manually-built superconductor database SuperCon.</s><s>Here we report our curation interface (SuperCon 2 Interface) and a workflow managing the state transitions of each examined record, to validate the dataset of superconductors from PDF documents collected using Grobidsuperconductors in a previous work.</s><s>This curation workflow allows both automatic and manual operations, the former contains 'anomaly detection' that scans new data identifying outliers, and a 'training data collector' mechanism that collects training data examples based on manual corrections.</s><s>Such training data collection policy is effective in improving the machine-learning models with a reduced number of examples.</s><s>For manual operations, the interface (SuperCon 2 interface) is developed to increase efficiency during manual correction by providing a smart interface and an enhanced PDF document viewer.</s><s>We show that our interface significantly improves the curation quality by boosting precision and recall as compared with the traditional 'manual correction'.</s><s>Our semi-automatic approach would provide a solution for achieving a reliable database with text-data mining of scientific documents.</s></p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head>IMPACT STATEMENT</head><p><s>This work makes a contribution to the realms of materials informatics and superconductors research, achieved through the evolution and update of SuperCon.</s><s>We provide results from experiments that support the utilisation of computational analysis and machine learning for collecting experimental data from scientific articles.</s></p></div>
+			</abstract>
+		</profileDesc>
+	</teiHeader>
+	<text xml:lang="en">
+		<body>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1.">Introduction</head><p><s>The emergence of new methodologies using machine learning for materials exploration has given rise to a growing research area called materials informatics (MI) <ref type="bibr" target="#b0">[1,</ref><ref type="bibr" target="#b1">2]</ref>.</s><s>This field leverages the knowledge of the materials data accumulated in the past to efficiently screen candidates of the materials with desired properties.</s><s>As a matter of course, such an approach requires a larger amount of material-related data for training models.</s><s>Researchers have been developing large aggregated databases of physical properties generated by first-principles calculations based on Density Functional Theory (DFT), such as Materials Project <ref type="bibr" target="#b2">[3]</ref>, JARVIS (Joint Automated Repository for Various Integrated Simulations) <ref type="bibr" target="#b3">[4]</ref>, NOMAD (Novel Materials Discovery) <ref type="bibr" target="#b4">[5]</ref>, that played a role of a strong driving force for the development of materials informatics.</s><s>Using DFT data for machine learning (ML) in materials science has become popular since, in principle, it allows researchers to simulate and obtain various types of physical properties of the target materials only by knowing the crystal structures of the subjects.</s><s>Those DFT codes are designed to reproduce/simulate the physical properties that should be observed by experiments in reality.</s><s>Nonetheless, caution must be exercised while utilising these computed figures for constructing ML models aimed at steering experiments.</s><s>This caution arises due to the potential lack of validity in their predictions when dealing with specific simplifications of the interactions between atoms and electrons in solids, such as electron-electron Coulomb correlation, spinorbit coupling, and similar factors.</s></p><p><s>On the contrary, accumulated datasets of 
experimental data from scientific publications are still scarce, despite abundant publication availability, and exponential growth in materials science <ref type="bibr" target="#b5">[6]</ref>.</s><s>Currently, only a few limited resources exist, such as the Pauling File <ref type="bibr" target="#b6">[7]</ref> and SuperCon <ref type="bibr" target="#b7">[8]</ref>, necessitating reliance on manual extraction methods.</s><s>This scarcity can be attributed to inadequate infrastructure and a shortage of expertise in computer science within the materials science field.</s></p><p><s>The SuperCon database was built manually from 1987 <ref type="bibr" target="#b7">[8]</ref> by the National Institute for Materials Science (NIMS) in Japan and it is considered a reliable source of experimental data on superconductors <ref type="bibr" target="#b8">[9]</ref><ref type="bibr" target="#b9">[10]</ref><ref type="bibr" target="#b10">[11]</ref><ref type="bibr" target="#b11">[12]</ref>.</s><s>However, the updates of SuperCon have become increasingly challenging due to the high publication rate.</s><s>In response to the need for a more efficient approach to sustain productivity, we embarked on the development of an automated system for extracting material and property information from the text contained in relevant scientific publications.</s><s>This automated process enabled the rapid creation of 'SuperCon 2 Database', a comprehensive database of superconductors containing around 40,000 entries, within an operational duration of just a few days <ref type="bibr" target="#b0">[1]</ref>.</s><s>Matching the level of quality seen in SuperCon while simultaneously automating the extraction of organised data can be achieved with a properly designed curation process.</s><s>We use the term curation to describe the overall process of reviewing and validating database records, while correction refers to the specific action of altering the values of one or more properties within an individual 
record.</s><s>At the moment of writing this article, we are not aware of any other curation tool focusing on structured databases of extracted information.</s><s>There are several tools for data annotation, such as Inception <ref type="bibr" target="#b12">[13]</ref>, and Doccano <ref type="bibr" target="#b13">[14]</ref> which concentrate on text labelling and classification.</s></p><p><s>In this work, we designed and developed a workflow with a user interface, 'SuperCon 2 Interface', crafted to produce structured data of superior quality and efficiency to the one obtained by the 'traditional' manual approach consisting of reading documents and noting records, usually on an Excel file.</s><s>We developed this framework around the specific use case of SuperCon, however, our goal is to be adapted to alternative data frameworks.</s></p><p><s>Our contributions can be summarised as follows:</s></p><p><s>• We developed a workflow and a user interface that allow the curation of a machine-collected database.</s><s>We demonstrate that using it for data correction resulted in higher quality than the 'traditional' (manual) approach.</s><s>The subsequent sections, Section 2 describes the curation workflow and Section 3 the user interface on top of it.</s><s>Finally, we discuss our evaluation experiments and results in Section 4.</s></p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.">Curation workflow</head><p><s>The curation of the SuperCon 2 Database acts as a workflow where user actions result in database records state transitions (Figure <ref type="figure" target="#fig_0">1</ref>).</s><s>Allowed manual actions include a) mark as valid (validation) when a record is considered correct or corrected by someone else.</s><s>When a record is not valid, users can: b) mark as invalid when considered 'potentially' invalid (or the curator is not confident), c) perform manual correction to update it according to the information from the original PDF document, and d) remove the record when it was not supposed to be extracted.</s></p><p><s>Besides manual operations from users, this workflow supports also automatic actions: 'anomaly detection' for pre-screening records (Section 2.2) and the 'training data collector' for accumulating training data for improving ML models (Section 2.3).</s></p><p><s>Although only the most recent version of a record can be viewed on this system, the correction history is recorded (Section 3.3).</s></p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.1.">Workflow control</head><p><s>The workflow state is determined by the 'curation status' (Section 2.1.1),</s><s>the user action, and the error type (Section 2.1.2).</s></p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.1.1.">Curation status</head><p><s>The curation status (Figure <ref type="figure" target="#fig_0">1</ref>) is defined by type of action, manual or automatic, and status, which can assume the following values:</s></p><p><s>• new: default status when a new record is created.</s></p><p><s>• curated: the record has been amended manually.</s></p><p><s>• validated: the record was manually marked as valid.</s></p><p><s>• invalid: the record is wrong or inappropriate for the situation (e.g.</s><s>T m or T curie extracted as superconducting critical temperature).</s></p><p><s>• obsolete: the record has been updated and the updated values are stored in a new record (internal status 1 ).</s><s>• removed: the record has been removed by a curator (internal status).</s></p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.1.2.">Error types</head><p><s>We first introduced error type in <ref type="bibr" target="#b0">[1]</ref> and extended their scope in this work to consider data curation and anomaly detection.</s><s>Users are required to select one Error Type at every record update or removal.</s><s>This information is stored in the 'original' record and can be different at every record modification.</s><s>The error type values can be summarised as follows:  • Composition resolution: The exact composition cannot be resolved (e.g. the stoichiometric values cannot be resolved).</s></p><p><s>• Value resolution: The extracted formula contains variables that cannot be resolved, even after having read the paper.</s><s>This includes when data is from tables • Anomaly detection: The data has been modified by anomaly detection, which facilitates their retrieval from the interface.</s><s>• Curation amends: The curator is updating the data which does not present issues due to the automatic system.</s></p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.2.">Anomaly detection</head><p><s>Anomaly detection is the process of identifying unusual events or patterns in data.</s><s>In our context, this means identifying data that are greatly different from the expected values.</s><s>This post-process was introduced in a limited scope to draw attention to certain cases during the curation.</s></p><p><s>The anomaly detection uses a rule-based approach and marks any record that matches the following conditions</s></p><p><s>• the extracted T c is greater than room temperature (273 K), negative, or contains invalid characters and cannot be parsed (e.g.</s><s>'41]') • the chemical formula cannot be processed by an ensemble composition parser that combines Pymatgen <ref type="bibr" target="#b14">[15]</ref>, and text2chem <ref type="bibr" target="#b15">[16]</ref> • the extracted applied pressure cannot be parsed or falls outside the range 0-250 GPa.</s></p><p><s>Records identified as anomalies have status 'invalid' and error type 'anomaly detection' for easy identification.</s><s>Since this process may find false positives, its output requires validation from curators.</s><s>For example, in certain contexts, T c values above room temperature or applied pressure up to 500 GPa may be valid in researchers' hypotheses, calculations, or simulated predictions.</s></p><p><s>We ran the anomaly detection on the full SuperCon 2 Database (40324 records <ref type="bibr" target="#b0">[1]</ref>).</s><s>The anomaly detection identified 1506 records with invalid T c , 5021 records with an incomplete chemical formula, 304 records with invalid applied pressure, and 1440 materials linked to multiple T c values.</s><s>Further analysis and cross-references with contrasting information may be added in future.</s></p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.3.">Automatic training data collector</head><p><s>The curation process is a valuable endeavour demanding significant knowledge and human effort.</s><s>To maximise the use of this time for collecting as much information as possible.</s><s>We integrated an automatic procedure in the curation process that, for every correction, accumulates the related data examples that can be used to improve the underlying ML models.</s></p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.3.1.">Training data collection</head><p><s>In the event of a correction (update, removal) in a database record, this process retrieves the corresponding raw data: the text passage, the recognised entities (spans), and the layout tokens information.</s><s>This information is sufficient to be exported as training examples, which can be examined and corrected, and feedback to the ML model.</s></p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.3.2.">Training data management</head><p><s>We designed a specific page of the interface (Section 3) to manage the collected data (Figure <ref type="figure" target="#fig_1">2</ref>) in which each row corresponds to a training example composed by the decorated text showing the identified entities, the document identifier, and the status.</s><s>The users can examine the data, delete it, send it to the annotation tool to be corrected, and then export them.</s><s>We integrated our interface with Labelstudio <ref type="bibr" target="#b16">[17]</ref> for the correction of the collected training examples.</s><s>Label-studio is an open-source, python-based, and modern interface supporting many different TDM tasks (NER, topic modelling, image recognition, etc.).</s></p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.">Curation interface</head><p><s>The workflow is operated through the user interface, which offers several key features to facilitate the data curation process (Figure <ref type="figure" target="#fig_0">1</ref>).</s><s>It provides a comprehensive view of materials and their related properties as a table which includes search, filtering, and sorting functionality (Figure <ref type="figure" target="#fig_2">3</ref>).</s><s>The detailed schema, including examples, is reported in our previous work <ref type="bibr" target="#b0">[1]</ref>.</s></p><p><s>During the curation process, it is often necessary to switch back and forth between the database record and the related context in the paper (the related paragraph or sentence).</s><s>Our interface provides a viewer for individual documents, which visualises in the same window a table with the extracted records and the original PDF document decorated with annotations that identify the extracted materials and properties (Figure <ref type="figure" target="#fig_4">4</ref>).</s></p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1.">Manual curation approach</head><p><s>In this section, we discuss our strategy concerning manual curation, which is still indispensable for developing high-quality structures.</s></p><p><s>We selected curators from domain experts in the field, to certify sufficient data quality.</s><s>Nevertheless, as confirmed from our experiment in Section 4.3, the experience of each individual may have an impact on the final result.</s><s>We followed two principles to guarantee robustness in the curation process.</s><s>First, we built solid curation documentation as a form of example-driven guidelines with an iterative approach we first introduced in <ref type="bibr" target="#b17">[18]</ref>.</s><s>Then, we used a double-round validation approach, in which the data was initially corrected by one person, and validated in a second round, by a different individual.</s></p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.2.">Curation guidelines</head><p><s>The guidelines consist mainly of two parts: the general principles and the correction rules with examples of solutions.</s><s>The guidelines are designed to provide general information applied to corrections and very basic explanations containing illustrations for a faster understanding (e.g. the meaning of the colours of the annotations).</s></p><p><s>Differently from our previous work <ref type="bibr" target="#b17">[18]</ref>, these guidelines are divided into examples for different scenarios based on the error types mentioned in Section 2.1.2.</s><s>Each example described the initial record, its context, the expected corrected record and a brief explanation, as illustrated in Figure <ref type="figure" target="#fig_3">5</ref>.</s></p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.3.">Curation and processing logs</head><p><s>The Supercon 2 interface gives access to information regarding the ingestion (processing log) and the .</s><s>Each row contains one potential training data example.</s><s>Each example is composed of a sentence and its extracted entities (highlighted in colour) with potential annotation mistakes that need to be corrected using an external tool: we used label-studio <ref type="bibr" target="#b16">[17]</ref>.</s><s>The column 'status' indicate whether the example has been sent or not to the external tool.</s><s>curation process (curation log).</s><s>The processing log is filled up when the new data is ingested, it was built to have minimal functions able to explain why certain documents haven't been processed (Figure <ref type="figure" target="#fig_6">6 top</ref>).</s><s>For example, sometimes documents fail because they don't contain any text (image PDF documents) or they are too big (more than 100 pages).</s></p><p><s>The curation log provides a view of what, when and how a record has been corrected (Figure <ref type="figure" target="#fig_6">6</ref> bottom).</s></p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.">Results and evaluation</head><p><s>In this section, we illustrate the experiments we have run to evaluate our work.</s><s>The evaluation is composed of three sets of results.</s><s>The anomaly detection rejection rate (Section 4.1) indicates how many anomalies were rejected by curators after validation.</s><s>Then, we demonstrate that the training data automatically selected contributed to improving the ML model with a small set of examples (Section 4.2) Finally, we evaluated the quality of the data extraction using the interface (and the semi-automatic TDM process) against the classical method of reading the PDF articles and noting the experimental information in an Excel file.</s><s>In Section 4.3 we find out that using the interface improves the quality of the curated data by reducing missing experimental data.</s></p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.1.">Anomaly detection rejection rate</head><p><s>We evaluated the anomaly detection by observing the 'rejection rate' which consists of the number of detected anomalies that were rejected by human validation.</s><s>Running the anomaly detection on a database subset with 667 records, it found 17 anomalies in T c , 1 anomaly in applied pressure, and 16 anomalies in the chemical  formulas.</s><s>Curators examined each reported record and rejected 4 (23%) anomalies in T c , 6 anomalies (37%) in chemical formulas and 0 anomalies in applied pressure.</s><s>This indicates an appropriate low rate of false positives although a study with a larger dataset might be necessary.</s></p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.2.">Training data generation</head><p><s>We selected around 400 records in the Supercon 2 Database that were marked as invalid by the anomaly detection process and we corrected them following the curation guidelines (Section 3.2).</s><s>Then, we examined the corresponding training data corrected by the interface (Section 2.3) and obtained a set of 352 training data examples for our ML models.</s><s>We call the obtained dataset curation to be distinguished from the original SuperMat dataset which is referred to as base.</s></p><p><s>We prepared our experiment using SciBERT [19] that we fine-tuned for our downstream task as in <ref type="bibr" target="#b0">[1]</ref>.</s><s>We trained five models that we evaluated using a fixed holdout dataset from SuperMat averaging the results to smooth out the fluctuations.</s><s>We use the DeLFT (Deep Learning For Text) <ref type="bibr" target="#b19">[20]</ref> library for training, evaluating, and managing the models for prediction.</s><s>A model can be trained with two different strategies:</s></p><p><s>(1) 'from scratch': when the model is initialised randomly.</s><s>We denote this strategy with an (s).</s><s>(2) 'incremental': when the initial model weights are taken from an already existing model.</s><s>We denote this strategy with an (i).</s></p><p><s>The latter can be seen as a way to 'continue' the training from a specific checkpoint.</s><s>We thus define three different training protocols: We merge 'curation' with the base dataset because the curation dataset is very small compared to 'base', and we want to avoid catastrophic forgetting <ref type="bibr" target="#b20">[21]</ref> or overfitting.</s><s>The trained models are then tested using a fixed holdout dataset that we designed in our previous work <ref type="bibr" target="#b0">[1]</ref> and the evaluation scores are shown in Table <ref type="table">1</ref>.</s></p><p><s>This experiment demonstrates that with only 352 
examples (2% of the SuperMat dataset) comprising 1846 additional entities (11% of the entities from the SuperMat dataset) (Table <ref type="table">2</ref>), we obtain an improvement of F1-score from 76.67% 2 to values between Table <ref type="table">1</ref>.</s><s>F1-score from the evaluation of the fine-tuned SciBERT models.</s><s>The training is performed with three different approaches.</s><s>The base dataset is the original dataset described in <ref type="bibr" target="#b17">[18]</ref>, and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected.</s><s>s indicate 'training from scratch', while i indicate 'incremental training'.</s><s>The evaluation is performed using the same holdout dataset from SuperMat <ref type="bibr" target="#b17">[18]</ref>.</s><s>The results are averaged over five runs or train and evaluation.</s><s>77.44% (+0.77) and 77.48% (+0.81) for (base+curation)(s) and base(s)+(base+curation)(i), respectively.</s><s>This experiment gives interesting insight relative to the positive impact on the way we select the training data.</s><s>However, there are some limitations: the curation dataset is small compared to the base dataset.</s><s>This issue could be verified by correcting all the available training data, repeating this experiment, and studying the interpolation between the size of the two datasets and the obtained evaluation scores.</s><s>A second limitation is that the hyperparameters we chose for our model, in particular, the learning rate and batch size could be still better tuned to obtain better results with the second and third training protocols.</s></p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.3.">Data quality</head><p><s>We conducted an experiment to evaluate the effectiveness and accuracy of data curation using two methods: a) the user interface (interface), and b) the 'traditional' manual approach consisting of reading PDF documents and populating an Excel file (PDF documents).</s></p><p><s>We selected a dataset of 15 papers, which we assigned to three curators -a senior researcher (SD), a PhD student (PS), and a master's student (MS).</s><s>Each curator received 10 papers: half to be corrected with the interface and half with the PDF Document method.</s><s>Overall, each pair of curators had five papers in common which they had to process using opposite methods.</s><s>For instance, if curator A receives paper 1 to be corrected with the interface, curator B, who receives the same paper 1, will correct it with the PDF document method.</s><s>After curation, a fourth individual manually reviewed the curated content.</s><s>The raw data is available in Tables <ref type="table">A1</ref> and<ref type="table" target="#tab_6">A2</ref>.</s></p><p><s>We evaluated the curation considering a double perspective: time and correctness.</s><s>Time was calculated as the accumulated minutes required using each method.</s><s>Correctness was assessed using standard measures such as precision, recall, and the F1-score.</s><s>Precision measures the accuracy of the extracted information, while recall assesses the ability to capture all expected information.</s><s>F1-Score is a harmonic means of precision and recall.</s></p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.3.1.">Discussion</head><p><s>Overall, both methods required the same accumulated time: 185 minutes using the interface and 184 minutes using the PDF Document method.</s><s>When the experiment was carried out, not all the curators were familiar with the interface method.</s><s>Although they had access to the user documentation, they had to get acquainted with the user interface, thus the accumulated 185 minutes included such activities.</s></p><p><s>We examined the quality of the extracted data and we observed an improvement of + 5.55% in precision and a substantial + 46.69% in recall when using the interface as compared with the PDF Document method (Table <ref type="table" target="#tab_4">3</ref>).</s><s>The F1-score improved by 39.35%.</s></p><p><s>The disparity in experience significantly influenced the accuracy of curation, particularly in terms of highlevel skills.</s><s>Senior researchers consistently achieved an average F1-Score approximately 13% higher than other curators (see Table <ref type="table" target="#tab_3">4</ref>).</s><s>Furthermore, we observed a modest improvement between master's students and PhD students.</s><s>These findings indicate also that for large-scale projects, employing master students instead of PhD students may be a more costeffective choice.</s><s>Thus, using only a few senior researchers for the second round of validation (Section 3.1).</s></p><p><s>Finally, the collected data suggest that all three curators had overall more corrected results by using the interface as illustrated in Table <ref type="table" target="#tab_5">5</ref>.</s></p><p><s>The results of this experiment confirmed that our curation interface and workflow significantly improved the quality of the extracted data, with an astonishing improvement in recall, thus preventing curators from overlooking important information.</s></p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="5.">Code availability</head><p><s>This work is available at<ref type="url" target="https://github.com/lfoppiano/supercon2">https://github.com/lfoppiano/ supercon2</ref>.</s><s>The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table <ref type="table">2</ref>. Data support, the number of entities for each label in each of the datasets used for evaluating the ML models.</s><s>The base dataset is the original dataset described in <ref type="bibr" target="#b17">[18]</ref>, and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected.</s></p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="6.">Conclusions</head><p><s>We built a semi-automatic staging area, called SuperCon 2 , to validate efficiently new experimental records automatically collected from superconductor research articles (SuperCon 2 Database <ref type="bibr" target="#b0">[1]</ref>) before they are ingested into the existing, manually-build database of superconductors, SuperCon <ref type="bibr" target="#b7">[8]</ref>.</s><s>The system provides a curation workflow and a user interface (SuperCon 2 Interface) tailored to efficiently support domain experts in data correction and validation with fast context switching and an enhanced PDF viewer.</s><s>Under the hood, the workflow ran 'anomaly detection' to automatically identify outliers and a 'training data collector' based on human corrections, to efficiently accumulate training data to be feedback to the ML model.</s><s>Compared with the traditional manual approach of reading PDF documents and extracting information in an Excel file, SuperCon 2 significantly improves the curation quality by approximately 6% and + 47% for precision and recall, respectively.</s><s>In future, this work can be expanded to support other materials science domains such as magnetic materials, spintronic and thermoelectric research and expanding the evaluation to a larger <ref type="bibr" target="#b21">[22]</ref> dataset.</s></p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head>Notes</head><p><s>1. 'internal status' indicates that their records should be hidden in the interface.</s><s>2. In our previous work <ref type="bibr" target="#b0">[1]</ref> we reported 77.03% F1score.</s><s>There is a slight decrease in absolute scores between DeLFT 0.2.8 and DeLFT 0.3.0.</s><s>One cause may be the use of different hyperparameters in version 0.3.0</s><s>such as batch size and learning rate.</s><s>However, the most probable cause could be the impact of using the Huggingface tokenizers library which is suffering from quality issues<ref type="url" target="https://github.com/kermitt2/delft/issues/150">https://github.com/kermitt2/delft/issues/150</ref>.</s></p></div><figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_0"><head>Figure 1 .</head><label>1</label><figDesc><div><p><s>Figure 1.</s><s>Schema of the curation workflow.</s><s>Each node has two properties: type and status (Section 2.1.1).</s><s>Each edge indicates one action.</s><s>The workflow starts on the left side of the figure.</s><s>The new records begin with 'automatic, new'.</s><s>Changes of state are triggered by automatic (Section 2.2) or manual operations (update, mark as valid, etc. Section 3.1) and results in changes of the properties in the node.</s><s>Each combination of property values identifies each state.</s><s>'(*)' indicates a transition for which the training data are collected (Section 2.3).</s></p></div></figDesc><graphic coords="4,85.04,52.87,425.16,296.76" type="bitmap"/></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_1"><head>Figure 2 .</head><label>2</label><figDesc><div><p><s>Figure 2. Screenshot of the training data management page in the SuperCon 2 interface.</s><s>Each row contains one potential training data example.</s><s>Each example is composed of a sentence and its extracted entities (highlighted in colour) with potential annotation mistakes that need to be corrected using an external tool: we used label-studio<ref type="bibr" target="#b16">[17]</ref>.</s><s>The column 'status' indicate whether the example has been sent or not to the external tool.</s></p></div></figDesc><graphic coords="6,60.94,52.85,473.40,190.08" type="bitmap"/></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_2"><head>Figure 3 .</head><label>3</label><figDesc><div><p><s>Figure 3. Screenshot of SuperCon 2 interface showing the database.</s><s>Each row corresponds to one material-T c pair.</s><s>On top, there are searches by attribute, sorting and other filtering operations.</s><s>On the right there are curation controls (mark as valid, update, etc.).</s><s>Records are grouped by document with alternating light yellow and white.</s></p></div></figDesc><graphic coords="6,60.94,311.02,473.40,269.40" type="bitmap"/></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_3"><head>Figure 5 .</head><label>5</label><figDesc><div><p><s>Figure 5. Sample curation sheet from the curation guidelines.</s><s>The sheet is composed of the following information: (a) Sample input data: a screenshot of the record from the 'SuperCon 2 interface', (b) Context represented by the related part of the annotated document referring to the record in exams.</s><s>(c) The Motivation, describing the issue, (d) The Action to be taken, and the expected output.</s></p></div></figDesc><graphic coords="7,94.28,255.74,406.68,223.20" type="bitmap"/></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_4"><head>Figure 4 .</head><label>4</label><figDesc><div><p><s>Figure 4. PDF document viewer showing an annotated document.</s><s>The table on top is linked through the annotated entities.</s><s>The user can navigate from the record to the exact point in the PDF, with a pointer (the red bulb light) identifying the context of the entities being examined.</s></p></div></figDesc><graphic coords="7,60.94,52.87,473.40,133.08" type="bitmap"/></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_5"><head>( 1 )</head><label>1</label><figDesc><div><p><s>base(s): using the base dataset and training from scratch (s).</s><s>(2) (base+curation)(s): using both the base and curation datasets and training from scratch (s).</s><s>(3) base(s)+(base+curation)(i): Using the base dataset to train from scratch (s), and then continuing the training with the curation dataset (i).</s></p></div></figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_6"><head>Figure 6 .</head><label>6</label><figDesc><div><p><s>Figure 6.</s><s>Top: Processing log, showing the output of each ingestion operation and the outcome with the detailed error that may have occurred.</s><s>Bottom: Correction log, indicating each record, the number of updates, and the date/time of the last updates.</s><s>By clicking on the 'record id', is possible to visualise the latest record values.</s></p></div></figDesc><graphic coords="8,60.94,52.86,473.40,195.00" type="bitmap"/></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0"><head/><label/><figDesc><div><p/></div></figDesc><graphic coords="2,100.86,391.72,297.60,233.04" type="bitmap"/></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_1"><head>c classification: The temperature is not correctly classified</head><label/><figDesc><div><p><s>The material is incorrectly linked to the T c given that the entities are correctly recognised.</s></p></div></figDesc><table><row><cell>• From table: the entities Material ! T c !</cell></row><row><cell>Pressure are identified in a table. At the moment,</cell></row><row><cell>table extraction is not performed</cell></row><row><cell>• Extraction: The material, temperature, and pressure</cell></row><row><cell>are not extracted (no box) or extracted incorrectly.</cell></row></table><note><p><s>• Linking: • T as 'superconductors critical temperature' (e.g.</s><s>Curie temperature, Magnetic temperature. .</s><s>.).</s></p></note></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_3"><head>Table 4 .</head><label>4</label><figDesc><div><p><s>Evaluation</s></p></div></figDesc><table><row><cell/><cell>base</cell><cell>base+curation</cell><cell>Δ</cell></row><row><cell>&lt;class&gt;</cell><cell>1646</cell><cell>1732</cell><cell>86</cell></row><row><cell>&lt;material&gt;</cell><cell>6943</cell><cell>7580</cell><cell>637</cell></row><row><cell>&lt;me_method&gt;</cell><cell>1883</cell><cell>1934</cell><cell>51</cell></row><row><cell>&lt;pressure&gt;</cell><cell>274</cell><cell>361</cell><cell>87</cell></row><row><cell>&lt;tc&gt;</cell><cell>3741</cell><cell>4269</cell><cell>528</cell></row><row><cell>&lt;tcValue&gt;</cell><cell>1099</cell><cell>1556</cell><cell>457</cell></row><row><cell>Total</cell><cell>15586</cell><cell>17432</cell><cell>1846</cell></row></table><note><p><s>scores (P: precision, R: recall, F1: F1-score) aggregated by experience (MS: master student, PD: PhD student, SR: senior researcher).</s><s>Each person corrected 10 documents.</s></p></note></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_4"><head>Table 3 .</head><label>3</label><figDesc><div><p><s>Evaluation scores (P: precision, R: recall, F1: F1-score) between the curation using the SuperCon 2 interface (Interface) and the traditional method of reading the PDF document (PDF document.).</s></p></div></figDesc><table><row><cell>Method</cell><cell>P (%)</cell><cell>R (%)</cell><cell>F1%)</cell><cell># docs</cell></row><row><cell>PDF document</cell><cell>87.83</cell><cell>45.61</cell><cell>52.67</cell><cell>15</cell></row><row><cell>Interface</cell><cell>93.38</cell><cell>92.51</cell><cell>92.02</cell><cell>15</cell></row></table></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_5"><head>Table 5 .</head><label>5</label><figDesc><div><p><s>Evaluation scores (P: precision, R: recall, F1: F1-score) listed by experience (MS: master student, PD: PhD student, SR: senior researcher), and method (PDF document, interface).</s></p></div></figDesc><table><row><cell>Experience</cell><cell>Method</cell><cell>P (%)</cell><cell>R (%)</cell><cell>F1%)</cell><cell># docs</cell><cell># pages</cell></row><row><cell>MS</cell><cell>PDF Document</cell><cell>94.58</cell><cell>36.55</cell><cell>48.67</cell><cell>6</cell><cell>46</cell></row><row><cell/><cell>Interface</cell><cell>83.19</cell><cell>95.83</cell><cell>88.25</cell><cell>4</cell><cell>50</cell></row><row><cell>PD</cell><cell>PDF Document</cell><cell>70.00</cell><cell>48.51</cell><cell>50.78</cell><cell>5</cell><cell>49</cell></row><row><cell/><cell>Interface</cell><cell>96.67</cell><cell>82.86</cell><cell>88.11</cell><cell>5</cell><cell>51</cell></row><row><cell>SR</cell><cell>PDF Document</cell><cell>100.00</cell><cell>55.56</cell><cell>61.03</cell><cell>4</cell><cell>51</cell></row><row><cell/><cell>Interface</cell><cell>97.42</cell><cell>98.33</cell><cell>97.78</cell><cell>6</cell><cell>45</cell></row></table></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_6"><head>Table A2 .</head><label>A2</label><figDesc><div><p><s>Evaluation scores obtained for each document and method (I: interface, P: PDF) combination.</s><s>TP: true positive, FP: false positive, FN: false negative.</s><s>P: precision, R: recall, F1: F1-score.</s></p></div></figDesc><table><row><cell>Document ID</cell><cell># pages</cell><cell>Method</cell><cell># TP</cell><cell># FP</cell><cell># FN</cell><cell>P</cell><cell>R</cell><cell>F1</cell></row><row><cell>Senior Researcher (SR)</cell><cell/><cell/><cell/><cell/><cell/><cell/><cell/><cell/></row><row><cell>0454e07f64</cell><cell>4</cell><cell>I</cell><cell>6</cell><cell>0</cell><cell>0</cell><cell>100.00</cell><cell>100.00</cell><cell>100.00</cell></row><row><cell>00c32076f4</cell><cell>13</cell><cell>P</cell><cell>8</cell><cell>0</cell><cell>0</cell><cell>100.00</cell><cell>100.00</cell><cell>100.00</cell></row><row><cell>0c7d3163ea</cell><cell>9</cell><cell>I</cell><cell>13</cell><cell>1</cell><cell>0</cell><cell>92.86</cell><cell>100.00</cell><cell>96.30</cell></row><row><cell>0da5febabf</cell><cell>11</cell><cell>P</cell><cell>8</cell><cell>0</cell><cell>1</cell><cell>100.00</cell><cell>88.89</cell><cell>94.12</cell></row><row><cell>0012333581</cell><cell>13</cell><cell>I</cell><cell>11</cell><cell>0</cell><cell>0</cell><cell>100.00</cell><cell>100.00</cell><cell>100.00</cell></row><row><cell>0aa1b3161f</cell><cell>5</cell><cell>I</cell><cell>9</cell><cell>0</cell><cell>1</cell><cell>100.00</cell><cell>90.00</cell><cell>94.74</cell></row><row><cell>0021fd339f</cell><cell>14</cell><cell>P</cell><cell>4</cell><cell>0</cell><cell>8</cell><cell>100.00</cell><cell>33.33</cell><cell>50.00</cell></row><row><cell>039105663f</cell><cell>9</cell><cell>I</cell><cell>11</cell><cell>1</cell><cell>0</cell><cell>91.67</cell><cell>100.00</cell><cell>95.65</cell></row><row><cell>02c4f00127</cell><cell>13</cell><cell>P</cell><cell>0</cell><cell>0</
cell><cell>3</cell><cell>100.00</cell><cell>0.00</cell><cell>0.00</cell></row><row><cell>021c413172</cell><cell>5</cell><cell>I</cell><cell>15</cell><cell>0</cell><cell>0</cell><cell>100.00</cell><cell>100.00</cell><cell>100.00</cell></row><row><cell>PhD Student (PS)</cell><cell/><cell/><cell/><cell/><cell/><cell/><cell/><cell/></row><row><cell>02bf1b3db9</cell><cell>7</cell><cell>I</cell><cell>5</cell><cell>0</cell><cell>2</cell><cell>100.00</cell><cell>71.43</cell><cell>83.33</cell></row><row><cell>00b50fc0a8</cell><cell>11</cell><cell>P</cell><cell>2</cell><cell>0</cell><cell>7</cell><cell>100.00</cell><cell>22.22</cell><cell>36.36</cell></row><row><cell>02cbc58819</cell><cell>4</cell><cell>I</cell><cell>4</cell><cell>0</cell><cell>3</cell><cell>100.00</cell><cell>57.14</cell><cell>72.73</cell></row><row><cell>044939701d</cell><cell>12</cell><cell>P</cell><cell>4</cell><cell>0</cell><cell>2</cell><cell>100.00</cell><cell>66.67</cell><cell>80.00</cell></row><row><cell>08e1cb8f4f</cell><cell>16</cell><cell>I</cell><cell>5</cell><cell>1</cell><cell>1</cell><cell>83.33</cell><cell>85.71</cell><cell>84.51</cell></row><row><cell>0454e07f64</cell><cell>4</cell><cell>P</cell><cell>0</cell><cell>1</cell><cell>5</cell><cell>0.00</cell><cell>16.67</cell><cell>0.00</cell></row><row><cell>00c32076f4</cell><cell>13</cell><cell>I</cell><cell>8</cell><cell>0</cell><cell>0</cell><cell>100.00</cell><cell>100.00</cell><cell>100.00</cell></row><row><cell>0c7d3163ea</cell><cell>9</cell><cell>P</cell><cell>9</cell><cell>0</cell><cell>5</cell><cell>100.00</cell><cell>64.29</cell><cell>78.26</cell></row><row><cell>0da5febabf</cell><cell>11</cell><cell>I</cell><cell>9</cell><cell>0</cell><cell>0</cell><cell>100.00</cell><cell>100.00</cell><cell>100.00</cell></row><row><cell>0012333581</cell><cell>13</cell><cell>P</cell><cell>4</cell><cell>4</cell><cell>3</cell><cell>50.00</cell><cell>72.73</cell><cell>59.26</cell></row><row><cell>Master Student 
(MS)</cell><cell/><cell/><cell/><cell/><cell/><cell/><cell/><cell/></row><row><cell>0aa1b3161f</cell><cell>5</cell><cell>P</cell><cell>1</cell><cell>0</cell><cell>9</cell><cell>100.00</cell><cell>10.00</cell><cell>18.18</cell></row><row><cell>0021fd339f</cell><cell>14</cell><cell>I</cell><cell>12</cell><cell>3</cell><cell>3</cell><cell>80.00</cell><cell>100.00</cell><cell>88.89</cell></row><row><cell>039105663f</cell><cell>9</cell><cell>P</cell><cell>4</cell><cell>1</cell><cell>7</cell><cell>80.00</cell><cell>41.67</cell><cell>54.79</cell></row><row><cell>02c4f00127</cell><cell>13</cell><cell>I</cell><cell>3</cell><cell>1</cell><cell>1</cell><cell>75.00</cell><cell>100.00</cell><cell>85.71</cell></row><row><cell>021c413172</cell><cell>5</cell><cell>P</cell><cell>7</cell><cell>1</cell><cell>7</cell><cell>87.50</cell><cell>53.33</cell><cell>66.27</cell></row><row><cell>02bf1b3db9</cell><cell>7</cell><cell>P</cell><cell>2</cell><cell>0</cell><cell>5</cell><cell>100.00</cell><cell>28.57</cell><cell>44.44</cell></row><row><cell>00b50fc0a8</cell><cell>11</cell><cell>I</cell><cell>7</cell><cell>2</cell><cell>0</cell><cell>77.78</cell><cell>100.00</cell><cell>87.50</cell></row><row><cell>02cbc58819</cell><cell>4</cell><cell>P</cell><cell>5</cell><cell>0</cell><cell>2</cell><cell>100.00</cell><cell>71.43</cell><cell>83.33</cell></row><row><cell>044939701d</cell><cell>12</cell><cell>I</cell><cell>5</cell><cell>0</cell><cell>1</cell><cell>100.00</cell><cell>83.33</cell><cell>90.91</cell></row><row><cell>08e1cb8f4f</cell><cell>16</cell><cell>P</cell><cell>1</cell><cell>0</cell><cell>6</cell><cell>100.00</cell><cell>14.29</cell><cell>25.00</cell></row></table></figure>
+			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" xml:id="foot_0"><p><s>Sci. Technol.</s><s>Adv.</s><s>Mater.</s><s>Meth. 3 (2023) 2 L. FOPPIANO et al.</s></p></note>
+			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" xml:id="foot_1"><p><s>Sci. Technol.</s><s>Adv.</s><s>Mater.</s><s>Meth. 3 (2023) 3 L. FOPPIANO et al.</s></p></note>
+			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" xml:id="foot_2"><p><s>Sci. Technol.</s><s>Adv.</s><s>Mater.</s><s>Meth. 3 (2023) 5 L. FOPPIANO et al.</s></p></note>
+			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" xml:id="foot_3"><p><s>Sci. Technol.</s><s>Adv.</s><s>Mater.</s><s>Meth. 3 (2023) 6 L. FOPPIANO et al.</s></p></note>
+			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" xml:id="foot_4"><p><s>Sci. Technol.</s><s>Adv.</s><s>Mater.</s><s>Meth. 3 (2023) 9L.</s><s>FOPPIANO et al.</s></p></note>
+			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" xml:id="foot_5"><p><s>Sci. Technol.</s><s>Adv.</s><s>Mater.</s><s>Meth. 3 (2023) 10 L. FOPPIANO et al.</s></p></note>
+			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" xml:id="foot_6"><p><s>Sci. Technol.</s><s>Adv.</s><s>Mater.</s><s>Meth. 3 (2023) 12 L. FOPPIANO et al.</s></p></note>
+		</body>
+		<back>
+
+			<div type="acknowledgement">
+<div><head>Acknowledgements</head><p>Our warmest thanks to <rs type="person">Patrice Lopez</rs>, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions.We thank <rs type="person">Pedro Baptista de Castro</rs> for his support during this work.Special thanks to <rs type="person">Erina Fujita</rs> for useful tips on the manuscript.</p></div>
+			</div>
+			<div type="funding">
+<div><p>Materials Modelling Group, Data-driven <rs type="affiliation">Materials Research Field, Centre for Basic Research on Materials, NIMS, 1-1 Namiki, Tsukuba</rs>, Ibaraki <rs type="grantNumber">305-0044</rs>, Japan</p></div>
+			</div>
+			<div type="funding">
+<div><head>Funding</head><p>This work was partly supported by <rs type="funder">MEXT</rs> Program: Data Creation and <rs type="funder">Utilization-Type Material Research and Development Project (Digital Transformation Initiative Center for Magnetic Materials)</rs> Grant Number [JPMXP1122715503].</p></div>
+			</div>
+			<listOrg type="funding">
+				<org type="funding" xml:id="_zeDNERp">
+					<idno type="grant-number">305-0044</idno>
+				</org>
+			</listOrg>
+			<div type="annex">
+<div xmlns="http://www.tei-c.org/ns/1.0"><head>Disclosure statement</head><p><s>No potential conflict of interest was reported by the author(s).</s></p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head>Author contribution</head><p><s>LF wrote the manuscript and KT helped with the editing.</s><s>LF and POS discussed the ML results and experiments.</s><s>LF implemented the workflow as a standalone service, and TM wrote the front end of the user interface.</s><s>LF designed the user interface experiment with KT, TT and WS as curators.</s><s>KT led the materials-science work on the data with CS, TT and WS.</s><s>KT, TA, YT and MI revised the paper.</s><s>YT and MI supervised the work of the respective teams.</s></p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head>Appendix A. Evaluation</head><p><s>Table <ref type="table">A1</ref>.</s><s>Timetable recording the time spent for each of the 15 articles.</s><s>Each row indicates the time and the event (Start, Finish) from each of the curators: master student (MD), PhD student (PD), and senior researcher (SR).</s><s>Duration is expressed in minutes.</s></p></div>			</div>
+			<div type="references">
+
+				<listBibl>
+
+<biblStruct xml:id="b0">
+	<analytic>
+		<title level="a" type="main">Automatic extraction of materials and properties from superconductors scientific literature</title>
+		<author>
+			<persName><forename type="first">L</forename><surname>Foppiano</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">P</forename><forename type="middle">B</forename><surname>Castro</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">P</forename><forename type="middle">O</forename><surname>Suarez</surname></persName>
+		</author>
+		<idno type="DOI">10.1080/27660400.2022.2153633</idno>
+	</analytic>
+	<monogr>
+		<title level="j">Sci Technol Adv Mater</title>
+		<imprint>
+			<biblScope unit="volume">3</biblScope>
+			<biblScope unit="issue">1</biblScope>
+			<biblScope unit="page">2153633</biblScope>
+			<date type="published" when="2023">2023</date>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b1">
+	<analytic>
+		<title level="a" type="main">Materials discovery with machine learning and knowledge discovery</title>
+		<author>
+			<persName><forename type="first">O</forename><forename type="middle">N</forename><surname>Oliveira</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">M</forename><forename type="middle">J</forename><surname>Oliveira</surname></persName>
+		</author>
+		<idno type="DOI">10.3389/fchem.2022.930369</idno>
+	</analytic>
+	<monogr>
+		<title level="j">Front Chem</title>
+		<imprint>
+			<biblScope unit="volume">10</biblScope>
+			<biblScope unit="page">10</biblScope>
+			<date type="published" when="2022">2022</date>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b2">
+	<analytic>
+		<title level="a" type="main">Commentary: the materials project: a materials genome approach to accelerating materials innovation</title>
+		<author>
+			<persName><forename type="first">A</forename><surname>Jain</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">S</forename><forename type="middle">P</forename><surname>Ong</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">G</forename><surname>Hautier</surname></persName>
+		</author>
+		<idno type="DOI">10.1063/1.4812323</idno>
+	</analytic>
+	<monogr>
+		<title level="j">APL Mater</title>
+		<imprint>
+			<biblScope unit="volume">1</biblScope>
+			<biblScope unit="issue">1</biblScope>
+			<biblScope unit="page">11002</biblScope>
+			<date type="published" when="2013">2013</date>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b3">
+	<analytic>
+		<title level="a" type="main">Aflow: an automatic framework for high-throughput materials discovery</title>
+		<author>
+			<persName><forename type="first">S</forename><surname>Curtarolo</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">W</forename><surname>Setyawan</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">G</forename><forename type="middle">L</forename><surname>Hart</surname></persName>
+		</author>
+		<ptr target="https://www.sciencedirect.com/science/article/pii/S0927025612000717"/>
+	</analytic>
+	<monogr>
+		<title level="j">Comput Mater Sci</title>
+		<imprint>
+			<biblScope unit="volume">58</biblScope>
+			<biblScope unit="page" from="218" to="226"/>
+			<date type="published" when="2012">2012</date>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b4">
+	<analytic>
+		<title level="a" type="main">The nomad laboratory: from data sharing to artificial intelligence</title>
+		<author>
+			<persName><forename type="first">C</forename><surname>Draxl</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">M</forename><surname>Scheffler</surname></persName>
+		</author>
+		<idno type="DOI">10.1088/2515-7639/ab13bb</idno>
+	</analytic>
+	<monogr>
+		<title level="j">J Phys Mater</title>
+		<imprint>
+			<biblScope unit="volume">2</biblScope>
+			<biblScope unit="issue">3</biblScope>
+			<biblScope unit="page">36001</biblScope>
+			<date type="published" when="2019">2019</date>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b5">
+	<analytic>
+		<title level="a" type="main">Global publication productivity in materials science research: a scientometric analysis</title>
+		<author>
+			<persName><forename type="first">T</forename><surname>Pratheepan</surname></persName>
+		</author>
+		<ptr target="https://ojs.trp.org.in/index.php/ijiss/article/view/583"/>
+	</analytic>
+	<monogr>
+		<title level="j">Indian J Inf Sources Serv</title>
+		<imprint>
+			<biblScope unit="volume">9</biblScope>
+			<biblScope unit="issue">1</biblScope>
+			<biblScope unit="page" from="111" to="116"/>
+			<date type="published" when="2019-02">2019 Feb</date>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b6">
+	<monogr>
+		<title level="m" type="main">The PAULING FILE project and materials platform for data science: from big data toward materials genome</title>
+		<author>
+			<persName><forename type="first">E</forename><surname>Blokhin</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">P</forename><surname>Villars</surname></persName>
+		</author>
+		<idno type="DOI">10.1007/978-3-319-42913-7_62-1</idno>
+		<imprint>
+			<date type="published" when="2018">2018</date>
+			<publisher>Springer International Publishing</publisher>
+			<biblScope unit="page" from="1" to="26"/>
+			<pubPlace>Cham</pubPlace>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b7">
+	<analytic>
+		<title level="a" type="main">Structuring superconductor data with ontology: reproducing historical datasets as knowledge bases</title>
+		<author>
+			<persName><forename type="first">M</forename><surname>Ishii</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">K</forename><surname>Sakamoto</surname></persName>
+		</author>
+		<idno type="DOI">10.1080/27660400.2023.2223051</idno>
+	</analytic>
+	<monogr>
+		<title level="j">Sci Technol Adv Mater</title>
+		<imprint>
+			<biblScope unit="volume">3</biblScope>
+			<biblScope unit="issue">1</biblScope>
+			<biblScope unit="page">2223051</biblScope>
+			<date type="published" when="2023">2023</date>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b8">
+	<analytic>
+		<title level="a" type="main">Predicting new superconductors and their critical temperatures using machine learning</title>
+		<author>
+			<persName><forename type="first">B</forename><surname>Roter</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">S</forename><surname>Dordevic</surname></persName>
+		</author>
+		<idno type="DOI">10.1016/j.physc.2020.1353689</idno>
+	</analytic>
+	<monogr>
+		<title level="j">Phys C</title>
+		<imprint>
+			<biblScope unit="volume">575</biblScope>
+			<biblScope unit="page">1353689</biblScope>
+			<date type="published" when="2020">2020</date>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b9">
+	<analytic>
+		<title level="a" type="main">Machine learning modeling of superconducting critical temperature</title>
+		<author>
+			<persName><forename type="first">V</forename><surname>Stanev</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">C</forename><surname>Oses</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">A</forename><surname>Kusne</surname></persName>
+		</author>
+		<idno type="DOI">10.1038/s41524-018-0085-8</idno>
+	</analytic>
+	<monogr>
+		<title level="j">Npj Comput Mater</title>
+		<imprint>
+			<biblScope unit="volume">4</biblScope>
+			<biblScope unit="issue">1</biblScope>
+			<biblScope unit="page">4</biblScope>
+			<date type="published" when="2017">2017</date>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b10">
+	<monogr>
+		<title level="m" type="main">Machine-learning approach for discovery of conventional superconductors</title>
+		<author>
+			<persName><forename type="first">H</forename><surname>Tran</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">T</forename><forename type="middle">N</forename><surname>Vu</surname></persName>
+		</author>
+		<idno>arXiv:221103265. 2022</idno>
+		<imprint/>
+	</monogr>
+	<note type="report_type">arXiv preprint</note>
+</biblStruct>
+
+<biblStruct xml:id="b11">
+	<analytic>
+		<title level="a" type="main">Deep learning model for finding new superconductors</title>
+		<author>
+			<persName><forename type="first">T</forename><surname>Konno</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">H</forename><surname>Kurokawa</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">F</forename><surname>Nabeshima</surname></persName>
+		</author>
+		<idno type="DOI">10.1103/PhysRevB.103.014509</idno>
+	</analytic>
+	<monogr>
+		<title level="j">Phys Rev B</title>
+		<imprint>
+			<biblScope unit="volume">103</biblScope>
+			<biblScope unit="issue">1</biblScope>
+			<biblScope unit="page">14509</biblScope>
+			<date type="published" when="2021">2021</date>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b12">
+	<analytic>
+		<title level="a" type="main">The INCEpTION platform: machine-assisted and knowledge-oriented interactive annotation</title>
+		<author>
+			<persName><forename type="first">J</forename><forename type="middle">C</forename><surname>Klie</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">M</forename><surname>Bugert</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">B</forename><surname>Boullosa</surname></persName>
+		</author>
+		<ptr target="https://www.aclweb.org/anthology/C18-2002"/>
+	</analytic>
+	<monogr>
+		<title level="m">Proceedings of the 27th International Conference on Computational Linguistics: System Demonstrations</title>
+		<meeting>the 27th International Conference on Computational Linguistics: System Demonstrations<address><addrLine>Santa Fe, New Mexico</addrLine></address></meeting>
+		<imprint>
+			<date type="published" when="2018">2018</date>
+			<biblScope unit="page" from="5" to="9"/>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b13">
+	<analytic>
+		<title level="a" type="main">Doccano: text annotation tool for human</title>
+		<author>
+			<persName><forename type="first">H</forename><surname>Nakayama</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">T</forename><surname>Kubo</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">J</forename><surname>Kamura</surname></persName>
+		</author>
+		<ptr target="https://github.com/doccano/doccano"/>
+	</analytic>
+	<monogr>
+		<title level="j">Software</title>
+		<imprint>
+			<date type="published" when="2018">2018</date>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b14">
+	<analytic>
+		<title level="a" type="main">Python materials genomics pymatgen: a robust open-source python library for materials analysis</title>
+		<author>
+			<persName><forename type="first">S</forename><forename type="middle">P</forename><surname>Ong</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">W</forename><forename type="middle">D</forename><surname>Richards</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">A</forename><surname>Jain</surname></persName>
+		</author>
+		<idno type="DOI">10.1016/j.commatsci.2012.10.028</idno>
+	</analytic>
+	<monogr>
+		<title level="j">Comput Mater Sci</title>
+		<imprint>
+			<biblScope unit="volume">68</biblScope>
+			<biblScope unit="issue">2</biblScope>
+			<biblScope unit="page" from="314" to="319"/>
+			<date type="published" when="2013">2013</date>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b15">
+	<monogr>
+		<title level="m" type="main">Text-mined dataset of inorganic materials synthesis recipes. Sci Data</title>
+		<author>
+			<persName><forename type="first">O</forename><surname>Kononova</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">H</forename><surname>Huo</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">T</forename><surname>He</surname></persName>
+		</author>
+		<idno type="DOI">10.1038/s41597-019-0224-1</idno>
+		<idno>41597-019-0224-1</idno>
+		<ptr target="https://doi.org/10.1038/s"/>
+		<imprint>
+			<date type="published" when="2019-10">2019 Oct</date>
+			<biblScope unit="volume">6</biblScope>
+			<biblScope unit="page">203</biblScope>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b16">
+	<analytic>
+		<title level="a" type="main">Label studio: data labeling software; 2020-2022</title>
+		<author>
+			<persName><forename type="first">M</forename><surname>Tkachenko</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">M</forename><surname>Malyuk</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">A</forename><surname>Holmanyuk</surname></persName>
+		</author>
+		<ptr target="https://github.com/heartexlabs/label-studio"/>
+	</analytic>
+	<monogr>
+		<title level="m">Open source software</title>
+		<imprint/>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b17">
+	<analytic>
+		<title level="a" type="main">Supermat: construction of a linked annotated dataset from superconductors-related publications</title>
+		<author>
+			<persName><forename type="first">L</forename><surname>Foppiano</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">S</forename><surname>Dieb</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">A</forename><surname>Suzuki</surname></persName>
+		</author>
+		<idno type="DOI">10.1080/27660400.2021.1918396</idno>
+	</analytic>
+	<monogr>
+		<title level="j">Sci Technol Adv Mater: Methods</title>
+		<imprint>
+			<biblScope unit="volume">1</biblScope>
+			<biblScope unit="issue">1</biblScope>
+			<biblScope unit="page" from="34" to="44"/>
+			<date type="published" when="2021">2021</date>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b18">
+	<analytic>
+		<title level="a" type="main">SciBERT: a pretrained language model for scientific text</title>
+		<author>
+			<persName><forename type="first">I</forename><surname>Beltagy</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">K</forename><surname>Lo</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">A</forename><surname>Cohan</surname></persName>
+		</author>
+		<ptr target="https://aclanthology.org/D19-1371"/>
+	</analytic>
+	<monogr>
+		<title level="m">Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing</title>
+		<meeting>the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing<address><addrLine>Hong Kong; China</addrLine></address></meeting>
+		<imprint>
+			<publisher>Association for Computational Linguistics</publisher>
+			<date type="published" when="2019-11">Nov. 2019</date>
+			<biblScope unit="page" from="3615" to="3620"/>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b19">
+	<analytic>
+		<title/>
+		<ptr target="https://github.com/kermitt2/delft"/>
+	</analytic>
+	<monogr>
+		<title level="j">DeLFT contributors. Delft</title>
+		<imprint>
+			<date type="published" when="2018">2018-2023</date>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b20">
+	<analytic>
+		<title level="a" type="main">Overcoming catastrophic forgetting in neural networks</title>
+		<author>
+			<persName><forename type="first">J</forename><surname>Kirkpatrick</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">R</forename><surname>Pascanu</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">N</forename><forename type="middle">C</forename><surname>Rabinowitz</surname></persName>
+		</author>
+		<idno>abs/1612.00796</idno>
+		<ptr target="http://arxiv.org/abs/1612.00796"/>
+	</analytic>
+	<monogr>
+		<title level="j">CoRr</title>
+		<imprint>
+			<date type="published" when="2016">2016</date>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b21">
+	<monogr>
+		<title/>
+		<author>
+			<persName><forename type="first">G</forename><surname>Contributors</surname></persName>
+		</author>
+		<author>
+			<persName><surname>Grobid</surname></persName>
+		</author>
+		<ptr target="https://github.com/kermitt2/grobid"/>
+		<imprint>
+			<date type="published" when="2008">2008 -2023</date>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+				</listBibl>
+			</div>
+		</back>
+	</text>
+</TEI>
\ No newline at end of file
diff --git a/src/test/resources/org/pub2tei/document/document2.tei.xml b/src/test/resources/org/pub2tei/document/document2.tei.xml
new file mode 100644
index 0000000..2ab3daa
--- /dev/null
+++ b/src/test/resources/org/pub2tei/document/document2.tei.xml
@@ -0,0 +1,760 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd" xmlns:xlink="http://www.w3.org/1999/xlink">
+	<teiHeader xml:lang="en">
+		<fileDesc>
+			<titleStmt>
+				<title level="a" type="main">Science and Technology of Advanced Materials: Methods</title>
+				<funder>
+					<orgName type="full">MEXT</orgName>
+				</funder>
+				<funder ref="#_thsdDye">
+					<orgName type="full">unknown</orgName>
+				</funder>
+				<funder>
+					<orgName type="full">Utilization-Type Material Research and Development Project (Digital Transformation Initiative Center for Magnetic Materials)</orgName>
+				</funder>
+			</titleStmt>
+			<publicationStmt>
+				<publisher/>
+				<availability status="unknown">
+					<licence/>
+				</availability>
+				<date type="published" when="2023-12-14">14 Dec 2023.</date>
+			</publicationStmt>
+			<sourceDesc>
+				<biblStruct>
+					<analytic>
+						<author>
+							<persName><forename type="first">Luca</forename><surname>Foppiano</surname></persName>
+							<idno type="ORCID">0000-0002-6114-6164</idno>
+							<affiliation key="aff0">
+								<orgName type="department">Materials Modelling Group</orgName>
+								<orgName type="laboratory">Centre for Basic Research on Materials</orgName>
+								<orgName type="institution" key="instit1">Data-driven Materials Research Field</orgName>
+								<orgName type="institution" key="instit2">NIMS</orgName>
+								<address>
+									<region>Tsukuba</region>
+									<country>Japan;</country>
+								</address>
+							</affiliation>
+							<affiliation key="aff1">
+								<orgName type="department" key="dep1">Knowledge and Data Engineering</orgName>
+								<orgName type="department" key="dep2">Centre for Computational Sciences</orgName>
+								<orgName type="institution">University of Tsukuba</orgName>
+								<address>
+									<settlement>Tsukuba</settlement>
+									<country>Japan;</country>
+								</address>
+							</affiliation>
+						</author>
+						<author>
+							<persName><forename type="first">Tomoya</forename><surname>Mato</surname></persName>
+							<idno type="ORCID">0000-0002-0918-6468</idno>
+							<affiliation key="aff0">
+								<orgName type="department">Materials Modelling Group</orgName>
+								<orgName type="laboratory">Centre for Basic Research on Materials</orgName>
+								<orgName type="institution" key="instit1">Data-driven Materials Research Field</orgName>
+								<orgName type="institution" key="instit2">NIMS</orgName>
+								<address>
+									<region>Tsukuba</region>
+									<country>Japan;</country>
+								</address>
+							</affiliation>
+						</author>
+						<author>
+							<persName><forename type="first">Kensei</forename><surname>Terashima</surname></persName>
+							<idno type="ORCID">0000-0003-0375-3043</idno>
+							<affiliation key="aff2">
+								<orgName type="department">Frontier Superconducting Materials Group</orgName>
+								<orgName type="institution" key="instit1">MANA</orgName>
+								<orgName type="institution" key="instit2">NIMS</orgName>
+								<address>
+									<region>Tsukuba</region>
+									<country>Japan;</country>
+								</address>
+							</affiliation>
+						</author>
+						<author>
+							<persName><forename type="first">Pedro</forename><surname>Ortiz Suarez</surname></persName>
+							<idno type="ORCID">0000-0003-0343-8852</idno>
+							<affiliation key="aff3">
+								<orgName type="department">GmbH DFKI</orgName>
+								<orgName type="institution">CONTACT Luca Foppiano</orgName>
+								<address>
+									<addrLine>Luca Foppiano http://orcid.org/0000-0002-6114-6164 Tomoya Mato http://orcid.org/0000-0002-0918-6468 Kensei Terashima http://orcid.org 3043 Pedro Ortiz Suarez http://orcid.org/0000-0003-0343- 8852 Wei-Sheng Wang http://orcid.org/0009-0001-3572-5736 Toshiyuki Amagasa http://orcid.org/0000-0003-0595- 2230 Yoshihiko Takano http://orcid.org/0000-0002-1541- 6928 Masashi Ishii</addrLine>
+									<postCode>0000-0003-0375</postCode>
+									<settlement>Berlin</settlement>
+									<region>DE</region>
+								</address>
+							</affiliation>
+						</author>
+						<author>
+							<persName><forename type="first">Taku</forename><surname>Tou</surname></persName>
+							<affiliation key="aff2">
+								<orgName type="department">Frontier Superconducting Materials Group</orgName>
+								<orgName type="institution" key="instit1">MANA</orgName>
+								<orgName type="institution" key="instit2">NIMS</orgName>
+								<address>
+									<region>Tsukuba</region>
+									<country>Japan;</country>
+								</address>
+							</affiliation>
+						</author>
+						<author>
+							<persName><forename type="first">Chikako</forename><surname>Sakai</surname></persName>
+							<affiliation key="aff2">
+								<orgName type="department">Frontier Superconducting Materials Group</orgName>
+								<orgName type="institution" key="instit1">MANA</orgName>
+								<orgName type="institution" key="instit2">NIMS</orgName>
+								<address>
+									<region>Tsukuba</region>
+									<country>Japan;</country>
+								</address>
+							</affiliation>
+						</author>
+						<author>
+							<persName><forename type="first">Wei-Sheng</forename><surname>Wang</surname></persName>
+							<idno type="ORCID">0009-0001-3572-5736</idno>
+							<affiliation key="aff2">
+								<orgName type="department">Frontier Superconducting Materials Group</orgName>
+								<orgName type="institution" key="instit1">MANA</orgName>
+								<orgName type="institution" key="instit2">NIMS</orgName>
+								<address>
+									<region>Tsukuba</region>
+									<country>Japan;</country>
+								</address>
+							</affiliation>
+						</author>
+						<author>
+							<persName><forename type="first">Toshiyuki</forename><surname>Amagasa</surname></persName>
+							<idno type="ORCID">0000-0003-0595-2230</idno>
+							<affiliation key="aff1">
+								<orgName type="department" key="dep1">Knowledge and Data Engineering</orgName>
+								<orgName type="department" key="dep2">Centre for Computational Sciences</orgName>
+								<orgName type="institution">University of Tsukuba</orgName>
+								<address>
+									<settlement>Tsukuba</settlement>
+									<country>Japan;</country>
+								</address>
+							</affiliation>
+						</author>
+						<author>
+							<persName><forename type="first">Yoshihiko</forename><surname>Takano</surname></persName>
+							<idno type="ORCID">0000-0002-1541-6928</idno>
+							<affiliation key="aff2">
+								<orgName type="department">Frontier Superconducting Materials Group</orgName>
+								<orgName type="institution" key="instit1">MANA</orgName>
+								<orgName type="institution" key="instit2">NIMS</orgName>
+								<address>
+									<region>Tsukuba</region>
+									<country>Japan;</country>
+								</address>
+							</affiliation>
+						</author>
+						<author role="corresp">
+							<persName><forename type="first">Masashi</forename><surname>Ishii</surname></persName>
+							<email>ishii.masashi@nims.go.jp</email>
+							<idno type="ORCID">0000-0003-0357-2832</idno>
+							<affiliation key="aff0">
+								<orgName type="department">Materials Modelling Group</orgName>
+								<orgName type="laboratory">Centre for Basic Research on Materials</orgName>
+								<orgName type="institution" key="instit1">Data-driven Materials Research Field</orgName>
+								<orgName type="institution" key="instit2">NIMS</orgName>
+								<address>
+									<region>Tsukuba</region>
+									<country>Japan;</country>
+								</address>
+							</affiliation>
+						</author>
+						<author>
+							<persName><surname>Masashi</surname></persName>
+						</author>
+						<title level="a" type="main">Science and Technology of Advanced Materials: Methods</title>
+					</analytic>
+					<monogr>
+						<idno type="ISSN">Print</idno>
+						<imprint>
+							<date type="published" when="2023-12-14">14 Dec 2023.</date>
+						</imprint>
+					</monogr>
+					<idno type="MD5">DCB0425EE18794E34CC3A3075E3E3975</idno>
+					<idno type="DOI">10.1080/27660400.2023.2286219</idno>
+					<note type="submission">Received 8 September 2023 Revised 9 November 2023 Accepted 16 November 2023</note>
+				</biblStruct>
+			</sourceDesc>
+		</fileDesc>
+		<encodingDesc>
+			<appInfo>
+				<application version="project.version" ident="GROBID" when="2024-04-26T11:25+0000">
+					<desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
+					<ref target="https://github.com/kermitt2/grobid"/>
+				</application>
+			</appInfo>
+		</encodingDesc>
+		<profileDesc>
+			<textClass>
+				<keywords>
+					<term>Materials informatics</term>
+					<term>superconductors</term>
+					<term>machine learning</term>
+					<term>database</term>
+					<term>TDM</term>
+				</keywords>
+			</textClass>
+			<abstract>
+<div xmlns="http://www.tei-c.org/ns/1.0"><p>We propose a semi-automatic staging area for efficiently building an accurate database of experimental physical properties of superconductors from literature, called SuperCon 2 , to enrich the existing manually-built superconductor database SuperCon. Here we report our curation interface (SuperCon 2 Interface) and a workflow managing the state transitions of each examined record, to validate the dataset of superconductors from PDF documents collected using Grobidsuperconductors in a previous work. This curation workflow allows both automatic and manual operations, the former contains 'anomaly detection' that scans new data identifying outliers, and a 'training data collector' mechanism that collects training data examples based on manual corrections. Such training data collection policy is effective in improving the machine-learning models with a reduced number of examples. For manual operations, the interface (SuperCon 2 interface) is developed to increase efficiency during manual correction by providing a smart interface and an enhanced PDF document viewer. We show that our interface significantly improves the curation quality by boosting precision and recall as compared with the traditional 'manual correction'. Our semi-automatic approach would provide a solution for achieving a reliable database with text-data mining of scientific documents.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head>IMPACT STATEMENT</head><p>This work makes a contribution to the realms of materials informatics and superconductors research, achieved through the evolution and update of SuperCon. We provide results from experiments that support the utilisation of computational analysis and machine learning for collecting experimental data from scientific articles.</p></div>
+			</abstract>
+		</profileDesc>
+	</teiHeader>
+	<text xml:lang="en">
+		<body>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1.">Introduction</head><p>The emergence of new methodologies using machine learning for materials exploration has given rise to a growing research area called materials informatics (MI) <ref type="bibr" target="#b0">[1,</ref><ref type="bibr" target="#b1">2]</ref>. This field leverages the knowledge of the materials data accumulated in the past to efficiently screen candidates of the materials with desired properties. As a matter of course, such an approach requires a larger amount of material-related data for training models. Researchers have been developing large aggregated databases of physical properties generated by first-principles calculations based on Density Functional Theory (DFT), such as Materials Project <ref type="bibr" target="#b2">[3]</ref>, JARVIS (Joint Automated Repository for Various Integrated Simulations) <ref type="bibr" target="#b3">[4]</ref>, NOMAD (Novel Materials Discovery) <ref type="bibr" target="#b4">[5]</ref>, that played a role of a strong driving force for the development of materials informatics. Using DFT data for machine learning (ML) in materials science has become popular since, in principle, it allows researchers to simulate and obtain various types of physical properties of the target materials only by knowing the crystal structures of the subjects. Those DFT codes are designed to reproduce/simulate the physical properties that should be observed by experiments in reality. Nonetheless, caution must be exercised while utilising these computed figures for constructing ML models aimed at steering experiments. 
This caution arises due to the potential lack of validity in their predictions when dealing with specific simplifications of the interactions between atoms and electrons in solids, such as electron-electron Coulomb correlation, spin-orbit coupling, and similar factors.</p><p>On the contrary, accumulated datasets of experimental data from scientific publications are still scarce, despite abundant publication availability, and exponential growth in materials science <ref type="bibr" target="#b5">[6]</ref>. Currently, only a few limited resources exist, such as the Pauling File <ref type="bibr" target="#b6">[7]</ref> and SuperCon <ref type="bibr" target="#b7">[8]</ref>, necessitating reliance on manual extraction methods. This scarcity can be attributed to inadequate infrastructure and a shortage of expertise in computer science within the materials science field.</p><p>The SuperCon database was built manually from 1987 <ref type="bibr" target="#b7">[8]</ref> by the National Institute for Materials Science (NIMS) in Japan and it is considered a reliable source of experimental data on superconductors <ref type="bibr" target="#b8">[9]</ref><ref type="bibr" target="#b9">[10]</ref><ref type="bibr" target="#b10">[11]</ref><ref type="bibr" target="#b11">[12]</ref>. However, the updates of SuperCon have become increasingly challenging due to the high publication rate. In response to the need for a more efficient approach to sustain productivity, we embarked on the development of an automated system for extracting material and property information from the text contained in relevant scientific publications. This automated process enabled the rapid creation of 'SuperCon 2 Database', a comprehensive database of superconductors containing around 40,000 entries, within an operational duration of just a few days <ref type="bibr" target="#b0">[1]</ref>. 
Matching the level of quality seen in SuperCon while simultaneously automating the extraction of organised data can be achieved with a properly designed curation process. We use the term curation to describe the overall process of reviewing and validating database records, while correction refers to the specific action of altering the values of one or more properties within an individual record. At the moment of writing this article, we are not aware of any other curation tool focusing on structured databases of extracted information. There are several tools for data annotation, such as Inception <ref type="bibr" target="#b12">[13]</ref>, and Doccano <ref type="bibr" target="#b13">[14]</ref> which concentrate on text labelling and classification.</p><p>In this work, we designed and developed a workflow with a user interface, 'SuperCon 2 Interface', crafted to produce structured data of superior quality and efficiency to the one obtained by the 'traditional' manual approach consisting of reading documents and noting records, usually on an Excel file. We developed this framework around the specific use case of SuperCon, however, our goal is to be adapted to alternative data frameworks.</p><p>Our contributions can be summarised as follows:</p><p>• We developed a workflow and a user interface that allow the curation of a machine-collected database. We demonstrate that using it for data correction resulted in higher quality than the 'traditional' (manual) approach. The subsequent sections, Section 2 describes the curation workflow and Section 3 the user interface on top of it. Finally, we discuss our evaluation experiments and results in Section 4.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.">Curation workflow</head><p>The curation of the SuperCon 2 Database acts as a workflow where user actions result in database records state transitions (Figure <ref type="figure" target="#fig_0">1</ref>). Allowed manual actions include a) mark as valid (validation) when a record is considered correct or corrected by someone else. When a record is not valid, users can: b) mark as invalid when considered 'potentially' invalid (or the curator is not confident), c) perform manual correction to update it according to the information from the original PDF document, and d) remove the record when it was not supposed to be extracted.</p><p>Besides manual operations from users, this workflow supports also automatic actions: 'anomaly detection' for pre-screening records (Section 2.2) and the 'training data collector' for accumulating training data for improving ML models (Section 2.3).</p><p>Although only the most recent version of a record can be viewed on this system, the correction history is recorded (Section 3.3).</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.1.">Workflow control</head><p>The workflow state is determined by the 'curation status' (Section 2.1.1), the user action, and the error type (Section 2.1.2).</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.1.1.">Curation status</head><p>The curation status (Figure <ref type="figure" target="#fig_0">1</ref>) is defined by type of action, manual or automatic, and status, which can assume the following values:</p><p>• new: default status when a new record is created.</p><p>• curated: the record has been amended manually.</p><p>• validated: the record was manually marked as valid.</p><p>• invalid: the record is wrong or inappropriate for the situation (e.g. T m or T curie extracted as superconducting critical temperature).</p><p>• obsolete: the record has been updated and the updated values are stored in a new record (internal status 1 ). • removed: the record has been removed by a curator (internal status).</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.1.2.">Error types</head><p>We first introduced error type in <ref type="bibr" target="#b0">[1]</ref> and extended their scope in this work to consider data curation and anomaly detection. Users are required to select one Error Type at every record update or removal. This information is stored in the 'original' record and can be different at every record modification. The error type values can be summarised as follows:  • Composition resolution: The exact composition cannot be resolved (e.g. the stoichiometric values cannot be resolved).</p><p>• Value resolution: The extracted formula contains variables that cannot be resolved, even after having read the paper. This includes when data is from tables • Anomaly detection: The data has been modified by anomaly detection, which facilitates their retrieval from the interface. • Curation amends: The curator is updating the data which does not present issues due to the automatic system.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.2.">Anomaly detection</head><p>Anomaly detection is the process of identifying unusual events or patterns in data. In our context, this means identifying data that are greatly different from the expected values. This post-process was introduced in a limited scope to draw attention to certain cases during the curation.</p><p>The anomaly detection uses a rule-based approach and marks any record that matches the following conditions</p><p>• the extracted T c is greater than room temperature (273 K), negative, or contains invalid characters and cannot be parsed (e.g. '41]') • the chemical formula cannot be processed by an ensemble composition parser that combines Pymatgen <ref type="bibr" target="#b14">[15]</ref>, and text2chem <ref type="bibr" target="#b15">[16]</ref> • the extracted applied pressure cannot be parsed or falls outside the range 0-250 GPa.</p><p>Records identified as anomalies have status 'invalid' and error type 'anomaly detection' for easy identification. Since this process may find false positives, its output requires validation from curators. For example, in certain contexts, T c values above room temperature or applied pressure up to 500 GPa may be valid in researchers' hypotheses, calculations, or simulated predictions.</p><p>We ran the anomaly detection on the full SuperCon 2 Database (40324 records <ref type="bibr" target="#b0">[1]</ref>). The anomaly detection identified 1506 records with invalid T c , 5021 records with an incomplete chemical formula, 304 records with invalid applied pressure, and 1440 materials linked to multiple T c values. Further analysis and cross-references with contrasting information may be added in future.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.3.">Automatic training data collector</head><p>The curation process is a valuable endeavour demanding significant knowledge and human effort. To maximise the use of this time for collecting as much information as possible, we integrated an automatic procedure in the curation process that, for every correction, accumulates the related data examples that can be used to improve the underlying ML models.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.3.1.">Training data collection</head><p>In the event of a correction (update, removal) in a database record, this process retrieves the corresponding raw data: the text passage, the recognised entities (spans), and the layout tokens information. This information is sufficient to be exported as training examples, which can be examined and corrected, and fed back to the ML model.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.3.2.">Training data management</head><p>We designed a specific page of the interface (Section 3) to manage the collected data (Figure <ref type="figure" target="#fig_1">2</ref>) in which each row corresponds to a training example composed of the decorated text showing the identified entities, the document identifier, and the status. The users can examine the data, delete it, send it to the annotation tool to be corrected, and then export it. We integrated our interface with Label-studio <ref type="bibr" target="#b16">[17]</ref> for the correction of the collected training examples. Label-studio is an open-source, python-based, and modern interface supporting many different TDM tasks (NER, topic modelling, image recognition, etc.).</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.">Curation interface</head><p>The workflow is operated through the user interface, which offers several key features to facilitate the data curation process (Figure <ref type="figure" target="#fig_0">1</ref>). It provides a comprehensive view of materials and their related properties as a table which includes search, filtering, and sorting functionality (Figure <ref type="figure" target="#fig_2">3</ref>). The detailed schema, including examples, is reported in our previous work <ref type="bibr" target="#b0">[1]</ref>.</p><p>During the curation process, it is often necessary to switch back and forth between the database record and the related context in the paper (the related paragraph or sentence). Our interface provides a viewer for individual documents, which visualises in the same window a table with the extracted records and the original PDF document decorated with annotations that identify the extracted materials and properties (Figure <ref type="figure" target="#fig_4">4</ref>).</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1.">Manual curation approach</head><p>In this section, we discuss our strategy concerning manual curation, which is still indispensable for developing high-quality structures.</p><p>We selected curators from domain experts in the field, to certify sufficient data quality. Nevertheless, as confirmed from our experiment in Section 4.3, the experience of each individual may have an impact on the final result. We followed two principles to guarantee robustness in the curation process. First, we built solid curation documentation as a form of example-driven guidelines with an iterative approach we first introduced in <ref type="bibr" target="#b17">[18]</ref>. Then, we used a double-round validation approach, in which the data was initially corrected by one person, and validated in a second round, by a different individual.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.2.">Curation guidelines</head><p>The guidelines consist mainly of two parts: the general principles and the correction rules with examples of solutions. The guidelines are designed to provide general information applied to corrections and very basic explanations containing illustrations for a faster understanding (e.g. the meaning of the colours of the annotations).</p><p>Differently from our previous work <ref type="bibr" target="#b17">[18]</ref>, these guidelines are divided into examples for different scenarios based on the error types mentioned in Section 2.1.2. Each example described the initial record, its context, the expected corrected record and a brief explanation, as illustrated in Figure <ref type="figure" target="#fig_3">5</ref>.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.3.">Curation and processing logs</head><p>The Supercon 2 interface gives access to information regarding the ingestion (processing log) and the curation process (curation log). The processing log is filled up when the new data is ingested; it was built to have minimal functions able to explain why certain documents haven't been processed (Figure <ref type="figure" target="#fig_6">6</ref> top). For example, sometimes documents fail because they don't contain any text (image PDF documents) or they are too big (more than 100 pages).</p><p>The curation log provides a view of what, when and how a record has been corrected (Figure <ref type="figure" target="#fig_6">6</ref> bottom).</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.">Results and evaluation</head><p>In this section, we illustrate the experiments we have run to evaluate our work. The evaluation is composed of three sets of results. The anomaly detection rejection rate (Section 4.1) indicates how many anomalies were rejected by curators after validation. Then, we demonstrate that the training data automatically selected contributed to improving the ML model with a small set of examples (Section 4.2). Finally, we evaluated the quality of the data extraction using the interface (and the semi-automatic TDM process) against the classical method of reading the PDF articles and noting the experimental information in an Excel file. In Section 4.3 we find out that using the interface improves the quality of the curated data by reducing missing experimental data.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.1.">Anomaly detection rejection rate</head><p>We evaluated the anomaly detection by observing the 'rejection rate' which consists of the number of detected anomalies that were rejected by human validation. Running the anomaly detection on a database subset with 667 records, it found 17 anomalies in T c , 1 anomaly in applied pressure, and 16 anomalies in the chemical formulas. Curators examined each reported record and rejected 4 (23%) anomalies in T c , 6 anomalies (37%) in chemical formulas and 0 anomalies in applied pressure. This indicates an appropriately low rate of false positives although a study with a larger dataset might be necessary.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.2.">Training data generation</head><p>We selected around 400 records in the Supercon 2 Database that were marked as invalid by the anomaly detection process and we corrected them following the curation guidelines (Section 3.2). Then, we examined the corresponding training data corrected by the interface (Section 2.3) and obtained a set of 352 training data examples for our ML models. We call the obtained dataset curation to be distinguished from the original SuperMat dataset which is referred to as base.</p><p>We prepared our experiment using SciBERT [19] that we fine-tuned for our downstream task as in <ref type="bibr" target="#b0">[1]</ref>. We trained five models that we evaluated using a fixed holdout dataset from SuperMat averaging the results to smooth out the fluctuations. We use the DeLFT (Deep Learning For Text) <ref type="bibr" target="#b19">[20]</ref> library for training, evaluating, and managing the models for prediction. A model can be trained with two different strategies:</p><p>(1) 'from scratch': when the model is initialised randomly. We denote this strategy with an (s). (2) 'incremental': when the initial model weights are taken from an already existing model. We denote this strategy with an (i).</p><p>The latter can be seen as a way to 'continue' the training from a specific checkpoint. We thus define three different training protocols: We merge 'curation' with the base dataset because the curation dataset is very small compared to 'base', and we want to avoid catastrophic forgetting <ref type="bibr" target="#b20">[21]</ref> or overfitting. 
The trained models are then tested using a fixed holdout dataset that we designed in our previous work <ref type="bibr" target="#b0">[1]</ref> and the evaluation scores are shown in Table <ref type="table">1</ref>.</p><p>This experiment demonstrates that with only 352 examples (2% of the SuperMat dataset) comprising 1846 additional entities (11% of the entities from the SuperMat dataset) (Table <ref type="table">2</ref>), we obtain an improvement of F1-score from 76.67% 2 to values between Table <ref type="table">1</ref>. F1-score from the evaluation of the fine-tuned SciBERT models. The training is performed with three different approaches. The base dataset is the original dataset described in <ref type="bibr" target="#b17">[18]</ref>, and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. s indicate 'training from scratch', while i indicate 'incremental training'. The evaluation is performed using the same holdout dataset from SuperMat <ref type="bibr" target="#b17">[18]</ref>. The results are averaged over five runs or train and evaluation. 77.44% (+0.77) and 77.48% (+0.81) for (base+curation)(s) and base(s)+(base+curation)(i), respectively. This experiment gives interesting insight relative to the positive impact on the way we select the training data. However, there are some limitations: the curation dataset is small compared to the base dataset. This issue could be verified by correcting all the available training data, repeating this experiment, and studying the interpolation between the size of the two datasets and the obtained evaluation scores. A second limitation is that the hyperparameters we chose for our model, in particular, the learning rate and batch size could be still better tuned to obtain better results with the second and third training protocols.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.3.">Data quality</head><p>We conducted an experiment to evaluate the effectiveness and accuracy of data curation using two methods: a) the user interface (interface), and b) the 'traditional' manual approach consisting of reading PDF documents and populating an Excel file (PDF documents).</p><p>We selected a dataset of 15 papers, which we assigned to three curators - a senior researcher (SR), a PhD student (PS), and a master's student (MS). Each curator received 10 papers: half to be corrected with the interface and half with the PDF Document method. Overall, each pair of curators had five papers in common which they had to process using opposite methods. For instance, if curator A receives paper 1 to be corrected with the interface, curator B, who receives the same paper 1, will correct it with the PDF document method. After curation, a fourth individual manually reviewed the curated content. The raw data is available in Tables <ref type="table">A1</ref> and <ref type="table" target="#tab_6">A2</ref>.</p><p>We evaluated the curation considering a double perspective: time and correctness. Time was calculated as the accumulated minutes required using each method. Correctness was assessed using standard measures such as precision, recall, and the F1-score. Precision measures the accuracy of the extracted information, while recall assesses the ability to capture all expected information. F1-Score is the harmonic mean of precision and recall.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.3.1.">Discussion</head><p>Overall, both methods required the same accumulated time: 185 minutes using the interface and 184 minutes using the PDF Document method. When the experiment was carried out, not all the curators were familiar with the interface method. Although they had access to the user documentation, they had to get acquainted with the user interface, thus the accumulated 185 minutes included such activities.</p><p>We examined the quality of the extracted data and we observed an improvement of +5.55% in precision and a substantial +46.69% in recall when using the interface as compared with the PDF Document method (Table <ref type="table" target="#tab_4">3</ref>). The F1-score improved by 39.35%.</p><p>The disparity in experience significantly influenced the accuracy of curation, particularly in terms of high-level skills. Senior researchers consistently achieved an average F1-Score approximately 13% higher than other curators (see Table <ref type="table" target="#tab_3">4</ref>). Furthermore, we observed a modest improvement between master's students and PhD students. These findings indicate also that for large-scale projects, employing master students instead of PhD students may be a more cost-effective choice, thus using only a few senior researchers for the second round of validation (Section 3.1).</p><p>Finally, the collected data suggest that all three curators had overall more correct results by using the interface as illustrated in Table <ref type="table" target="#tab_5">5</ref>.</p><p>The results of this experiment confirmed that our curation interface and workflow significantly improved the quality of the extracted data, with an astonishing improvement in recall, thus preventing curators from overlooking important information.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="5.">Code availability</head><p>This work is available at <ref type="url" target="https://github.com/lfoppiano/supercon2">https://github.com/lfoppiano/supercon2</ref>. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table <ref type="table">2</ref>. Data support, the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in <ref type="bibr" target="#b17">[18]</ref>, and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. </p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="6.">Conclusions</head><p>We built a semi-automatic staging area, called SuperCon 2 , to validate efficiently new experimental records automatically collected from superconductor research articles (SuperCon 2 Database <ref type="bibr" target="#b0">[1]</ref>) before they are ingested into the existing, manually-built database of superconductors, SuperCon <ref type="bibr" target="#b7">[8]</ref>. The system provides a curation workflow and a user interface (SuperCon 2 Interface) tailored to efficiently support domain experts in data correction and validation with fast context switching and an enhanced PDF viewer. Under the hood, the workflow ran 'anomaly detection' to automatically identify outliers and a 'training data collector' based on human corrections, to efficiently accumulate training data to be fed back to the ML model. Compared with the traditional manual approach of reading PDF documents and extracting information in an Excel file, SuperCon 2 significantly improves the curation quality by approximately +6% and +47% for precision and recall, respectively. In future, this work can be expanded to support other materials science domains such as magnetic materials, spintronic and thermoelectric research and expanding the evaluation to a larger <ref type="bibr" target="#b21">[22]</ref> dataset.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head>Notes</head><p>1. 'internal status' indicates that their records should be hidden in the interface. 2. In our previous work <ref type="bibr" target="#b0">[1]</ref> we reported 77.03% F1-score. There is a slight decrease in absolute scores between DeLFT 0.2.8 and DeLFT 0.3.0. One cause may be the use of different hyperparameters in version 0.3.0 such as batch size and learning rate. However, the most probable cause could be the impact of using the Huggingface tokenizers library which is suffering from quality issues <ref type="url" target="https://github.com/kermitt2/delft/issues/150">https://github.com/kermitt2/delft/issues/150</ref>. </p></div><figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_0"><head>Figure 1 .</head><label>1</label><figDesc>Figure 1. Schema of the curation workflow. Each node has two properties: type and status (Section 2.1.1). Each edge indicates one action. The workflow starts on the left side of the figure. The new records begin with 'automatic, new'. Changes of state are triggered by automatic (Section 2.2) or manual operations (update, mark as valid, etc. Section 3.1) and result in changes of the properties in the node. Each combination of property values identifies each state. '(*)' indicates a transition for which the training data are collected (Section 2.3).</figDesc><graphic coords="4,85.04,52.87,425.16,296.76" type="bitmap"/></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_1"><head>Figure 2 .</head><label>2</label><figDesc>Figure 2. Screenshot of the training data management page in the SuperCon 2 interface. Each row contains one potential training data example. Each example is composed of a sentence and its extracted entities (highlighted in colour) with potential annotation mistakes that need to be corrected using an external tool: we used label-studio <ref type="bibr" target="#b16">[17]</ref>. The column 'status' indicates whether the example has been sent or not to the external tool.</figDesc><graphic coords="6,60.94,52.85,473.40,190.08" type="bitmap"/></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_2"><head>Figure 3 .</head><label>3</label><figDesc>Figure 3. Screenshot of SuperCon 2 interface showing the database. Each row corresponds to one material-T c pair. On top, there are searches by attribute, sorting and other filtering operations. On the right there are curation controls (mark as valid, update, etc.). Records are grouped by document with alternating light yellow and white.</figDesc><graphic coords="6,60.94,311.02,473.40,269.40" type="bitmap"/></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_3"><head>Figure 5 .</head><label>5</label><figDesc>Figure 5. Sample curation sheet from the curation guidelines. The sheet is composed of the following information: (a) Sample input data: a screenshot of the record from the 'SuperCon 2 interface', (b) Context represented by the related part of the annotated document referring to the record under examination. (c) The Motivation, describing the issue, (d) The Action to be taken, and the expected output.</figDesc><graphic coords="7,94.28,255.74,406.68,223.20" type="bitmap"/></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_4"><head>Figure 4 .</head><label>4</label><figDesc>Figure 4. PDF document viewer showing an annotated document. The table on top is linked through the annotated entities. The user can navigate from the record to the exact point in the PDF, with a pointer (the red bulb light) identifying the context of the entities being examined.</figDesc><graphic coords="7,60.94,52.87,473.40,133.08" type="bitmap"/></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_5"><head>( 1 )</head><label>1</label><figDesc>base(s): using the base dataset and training from scratch (s). (2) (base+curation)(s): using both the base and curation datasets and training from scratch (s). (3) base(s)+(base+curation)(i): Using the base dataset to train from scratch (s), and then continuing the training with the curation dataset (i).</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_6"><head>Figure 6 .</head><label>6</label><figDesc>Figure 6. Top: Processing log, showing the output of each ingestion operation and the outcome with the detailed error that may have occurred. Bottom: Correction log, indicating each record, the number of updates, and the date/time of the last updates. By clicking on the 'record id', it is possible to visualise the latest record values.</figDesc><graphic coords="8,60.94,52.86,473.40,195.00" type="bitmap"/></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0"><head/><label/><figDesc/><graphic coords="2,100.86,391.72,297.60,233.04" type="bitmap"/></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_1"><head>c classification: The temperature is not correctly classified</head><label/><figDesc>The material is incorrectly linked to the T c given that the entities are correctly recognised.</figDesc><table><row><cell>• From table: the entities Material ! T c !</cell></row><row><cell>Pressure are identified in a table. At the moment,</cell></row><row><cell>table extraction is not performed</cell></row><row><cell>• Extraction: The material, temperature, and pressure</cell></row><row><cell>are not extracted (no box) or extracted incorrectly.</cell></row></table><note><p>• Linking: • T as 'superconductors critical temperature' (e.g. Curie temperature, Magnetic temperature. . .).</p></note></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_3"><head>Table 4 .</head><label>4</label><figDesc>Evaluation</figDesc><table><row><cell/><cell>base</cell><cell>base+curation</cell><cell>Δ</cell></row><row><cell>&lt;class&gt;</cell><cell>1646</cell><cell>1732</cell><cell>86</cell></row><row><cell>&lt;material&gt;</cell><cell>6943</cell><cell>7580</cell><cell>637</cell></row><row><cell>&lt;me_method&gt;</cell><cell>1883</cell><cell>1934</cell><cell>51</cell></row><row><cell>&lt;pressure&gt;</cell><cell>274</cell><cell>361</cell><cell>87</cell></row><row><cell>&lt;tc&gt;</cell><cell>3741</cell><cell>4269</cell><cell>528</cell></row><row><cell>&lt;tcValue&gt;</cell><cell>1099</cell><cell>1556</cell><cell>457</cell></row><row><cell>Total</cell><cell>15586</cell><cell>17432</cell><cell>1846</cell></row></table><note><p>scores (P: precision, R: recall, F1: F1-score) aggregated by experience (MS: master student, PD: PhD student, SR: senior researcher). Each person corrected 10 documents.</p></note></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_4"><head>Table 3 .</head><label>3</label><figDesc>Evaluation scores (P: precision, R: recall, F1: F1-score) between the curation using the SuperCon 2 interface (Interface) and the traditional method of reading the PDF document (PDF document).</figDesc><table><row><cell>Method</cell><cell>P (%)</cell><cell>R (%)</cell><cell>F1 (%)</cell><cell># docs</cell></row><row><cell>PDF document</cell><cell>87.83</cell><cell>45.61</cell><cell>52.67</cell><cell>15</cell></row><row><cell>Interface</cell><cell>93.38</cell><cell>92.51</cell><cell>92.02</cell><cell>15</cell></row></table></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_5"><head>Table 5 .</head><label>5</label><figDesc>Evaluation scores (P: precision, R: recall, F1: F1-score) listed by experience (MS: master student, PD: PhD student, SR: senior researcher), and method (PDF document, interface).</figDesc><table><row><cell>Experience</cell><cell>Method</cell><cell>P (%)</cell><cell>R (%)</cell><cell>F1 (%)</cell><cell># docs</cell><cell># pages</cell></row><row><cell>MS</cell><cell>PDF Document</cell><cell>94.58</cell><cell>36.55</cell><cell>48.67</cell><cell>6</cell><cell>46</cell></row><row><cell/><cell>Interface</cell><cell>83.19</cell><cell>95.83</cell><cell>88.25</cell><cell>4</cell><cell>50</cell></row><row><cell>PD</cell><cell>PDF Document</cell><cell>70.00</cell><cell>48.51</cell><cell>50.78</cell><cell>5</cell><cell>49</cell></row><row><cell/><cell>Interface</cell><cell>96.67</cell><cell>82.86</cell><cell>88.11</cell><cell>5</cell><cell>51</cell></row><row><cell>SR</cell><cell>PDF Document</cell><cell>100.00</cell><cell>55.56</cell><cell>61.03</cell><cell>4</cell><cell>51</cell></row><row><cell/><cell>Interface</cell><cell>97.42</cell><cell>98.33</cell><cell>97.78</cell><cell>6</cell><cell>45</cell></row></table></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_6"><head>Table A2 .</head><label>A2</label><figDesc>Evaluation scores obtained for each document and method (I: interface, P: PDF) combination. TP: true positive, FP: false positive, FN: false negative. P: precision, R: recall, F1: F1-score.</figDesc><table><row><cell>Document ID</cell><cell># pages</cell><cell>Method</cell><cell># TP</cell><cell># FP</cell><cell># FN</cell><cell>P</cell><cell>R</cell><cell>F1</cell></row><row><cell>Senior Researcher (SR)</cell><cell/><cell/><cell/><cell/><cell/><cell/><cell/><cell/></row><row><cell>0454e07f64</cell><cell>4</cell><cell>I</cell><cell>6</cell><cell>0</cell><cell>0</cell><cell>100.00</cell><cell>100.00</cell><cell>100.00</cell></row><row><cell>00c32076f4</cell><cell>13</cell><cell>P</cell><cell>8</cell><cell>0</cell><cell>0</cell><cell>100.00</cell><cell>100.00</cell><cell>100.00</cell></row><row><cell>0c7d3163ea</cell><cell>9</cell><cell>I</cell><cell>13</cell><cell>1</cell><cell>0</cell><cell>92.86</cell><cell>100.00</cell><cell>96.30</cell></row><row><cell>0da5febabf</cell><cell>11</cell><cell>P</cell><cell>8</cell><cell>0</cell><cell>1</cell><cell>100.00</cell><cell>88.89</cell><cell>94.12</cell></row><row><cell>0012333581</cell><cell>13</cell><cell>I</cell><cell>11</cell><cell>0</cell><cell>0</cell><cell>100.00</cell><cell>100.00</cell><cell>100.00</cell></row><row><cell>0aa1b3161f</cell><cell>5</cell><cell>I</cell><cell>9</cell><cell>0</cell><cell>1</cell><cell>100.00</cell><cell>90.00</cell><cell>94.74</cell></row><row><cell>0021fd339f</cell><cell>14</cell><cell>P</cell><cell>4</cell><cell>0</cell><cell>8</cell><cell>100.00</cell><cell>33.33</cell><cell>50.00</cell></row><row><cell>039105663f</cell><cell>9</cell><cell>I</cell><cell>11</cell><cell>1</cell><cell>0</cell><cell>91.67</cell><cell>100.00</cell><cell>95.65</cell></row><row><cell>02c4f00127</cell><cell>13</cell><cell>P</cell><cell>0</cell><cell>0</cell><cell>3</cell><cell>100.00</cell
><cell>0.00</cell><cell>0.00</cell></row><row><cell>021c413172</cell><cell>5</cell><cell>I</cell><cell>15</cell><cell>0</cell><cell>0</cell><cell>100.00</cell><cell>100.00</cell><cell>100.00</cell></row><row><cell>PhD Student (PS)</cell><cell/><cell/><cell/><cell/><cell/><cell/><cell/><cell/></row><row><cell>02bf1b3db9</cell><cell>7</cell><cell>I</cell><cell>5</cell><cell>0</cell><cell>2</cell><cell>100.00</cell><cell>71.43</cell><cell>83.33</cell></row><row><cell>00b50fc0a8</cell><cell>11</cell><cell>P</cell><cell>2</cell><cell>0</cell><cell>7</cell><cell>100.00</cell><cell>22.22</cell><cell>36.36</cell></row><row><cell>02cbc58819</cell><cell>4</cell><cell>I</cell><cell>4</cell><cell>0</cell><cell>3</cell><cell>100.00</cell><cell>57.14</cell><cell>72.73</cell></row><row><cell>044939701d</cell><cell>12</cell><cell>P</cell><cell>4</cell><cell>0</cell><cell>2</cell><cell>100.00</cell><cell>66.67</cell><cell>80.00</cell></row><row><cell>08e1cb8f4f</cell><cell>16</cell><cell>I</cell><cell>5</cell><cell>1</cell><cell>1</cell><cell>83.33</cell><cell>85.71</cell><cell>84.51</cell></row><row><cell>0454e07f64</cell><cell>4</cell><cell>P</cell><cell>0</cell><cell>1</cell><cell>5</cell><cell>0.00</cell><cell>16.67</cell><cell>0.00</cell></row><row><cell>00c32076f4</cell><cell>13</cell><cell>I</cell><cell>8</cell><cell>0</cell><cell>0</cell><cell>100.00</cell><cell>100.00</cell><cell>100.00</cell></row><row><cell>0c7d3163ea</cell><cell>9</cell><cell>P</cell><cell>9</cell><cell>0</cell><cell>5</cell><cell>100.00</cell><cell>64.29</cell><cell>78.26</cell></row><row><cell>0da5febabf</cell><cell>11</cell><cell>I</cell><cell>9</cell><cell>0</cell><cell>0</cell><cell>100.00</cell><cell>100.00</cell><cell>100.00</cell></row><row><cell>0012333581</cell><cell>13</cell><cell>P</cell><cell>4</cell><cell>4</cell><cell>3</cell><cell>50.00</cell><cell>72.73</cell><cell>59.26</cell></row><row><cell>Master Student 
(MS)</cell><cell/><cell/><cell/><cell/><cell/><cell/><cell/><cell/></row><row><cell>0aa1b3161f</cell><cell>5</cell><cell>P</cell><cell>1</cell><cell>0</cell><cell>9</cell><cell>100.00</cell><cell>10.00</cell><cell>18.18</cell></row><row><cell>0021fd339f</cell><cell>14</cell><cell>I</cell><cell>12</cell><cell>3</cell><cell>3</cell><cell>80.00</cell><cell>100.00</cell><cell>88.89</cell></row><row><cell>039105663f</cell><cell>9</cell><cell>P</cell><cell>4</cell><cell>1</cell><cell>7</cell><cell>80.00</cell><cell>41.67</cell><cell>54.79</cell></row><row><cell>02c4f00127</cell><cell>13</cell><cell>I</cell><cell>3</cell><cell>1</cell><cell>1</cell><cell>75.00</cell><cell>100.00</cell><cell>85.71</cell></row><row><cell>021c413172</cell><cell>5</cell><cell>P</cell><cell>7</cell><cell>1</cell><cell>7</cell><cell>87.50</cell><cell>53.33</cell><cell>66.27</cell></row><row><cell>02bf1b3db9</cell><cell>7</cell><cell>P</cell><cell>2</cell><cell>0</cell><cell>5</cell><cell>100.00</cell><cell>28.57</cell><cell>44.44</cell></row><row><cell>00b50fc0a8</cell><cell>11</cell><cell>I</cell><cell>7</cell><cell>2</cell><cell>0</cell><cell>77.78</cell><cell>100.00</cell><cell>87.50</cell></row><row><cell>02cbc58819</cell><cell>4</cell><cell>P</cell><cell>5</cell><cell>0</cell><cell>2</cell><cell>100.00</cell><cell>71.43</cell><cell>83.33</cell></row><row><cell>044939701d</cell><cell>12</cell><cell>I</cell><cell>5</cell><cell>0</cell><cell>1</cell><cell>100.00</cell><cell>83.33</cell><cell>90.91</cell></row><row><cell>08e1cb8f4f</cell><cell>16</cell><cell>P</cell><cell>1</cell><cell>0</cell><cell>6</cell><cell>100.00</cell><cell>14.29</cell><cell>25.00</cell></row></table></figure>
+			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" xml:id="foot_0"><p>Sci. Technol. Adv. Mater. Meth. 3 (2023) 2 L. FOPPIANO et al.</p></note>
+			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" xml:id="foot_1"><p>Sci. Technol. Adv. Mater. Meth. 3 (2023) 3 L. FOPPIANO et al.</p></note>
+			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" xml:id="foot_2"><p>Sci. Technol. Adv. Mater. Meth. 3 (2023) 5 L. FOPPIANO et al.</p></note>
+			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" xml:id="foot_3"><p>Sci. Technol. Adv. Mater. Meth. 3 (2023) 6 L. FOPPIANO et al.</p></note>
+			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" xml:id="foot_4"><p>Sci. Technol. Adv. Mater. Meth. 3 (2023) 9 L. FOPPIANO et al.</p></note>
+			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" xml:id="foot_5"><p>Sci. Technol. Adv. Mater. Meth. 3 (2023) 10 L. FOPPIANO et al.</p></note>
+			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" xml:id="foot_6"><p>Sci. Technol. Adv. Mater. Meth. 3 (2023) 12 L. FOPPIANO et al.</p></note>
+		</body>
+		<back>
+
+			<div type="acknowledgement">
+<div><head>Acknowledgements</head><p>Our warmest thanks to <rs type="person">Patrice Lopez</rs>, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions. We thank <rs type="person">Pedro Baptista de Castro</rs> for his support during this work. Special thanks to <rs type="person">Erina Fujita</rs> for useful tips on the manuscript.</p></div>
+			</div>
+			<div type="funding">
+<div><p>Materials Modelling Group, Data-driven <rs type="affiliation">Materials Research Field, Centre for Basic Research on Materials, NIMS, 1-1 Namiki, Tsukuba</rs>, Ibaraki <rs type="grantNumber">305-0044</rs>, Japan</p></div>
+			</div>
+			<div type="funding">
+<div><head>Funding</head><p>This work was partly supported by <rs type="funder">MEXT</rs> Program: Data Creation and <rs type="funder">Utilization-Type Material Research and Development Project (Digital Transformation Initiative Center for Magnetic Materials)</rs> Grant Number [JPMXP1122715503].</p></div>
+			</div>
+			<listOrg type="funding">
+				<org type="funding" xml:id="_thsdDye">
+					<idno type="grant-number">305-0044</idno>
+				</org>
+			</listOrg>
+			<div type="annex">
+<div xmlns="http://www.tei-c.org/ns/1.0"><head>Disclosure statement</head><p>No potential conflict of interest was reported by the author(s).</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head>Author contribution</head><p>LF wrote the manuscript and KT helped with the editing. LF and POS discussed the ML results and experiments. LF implemented the workflow as a standalone service, and TM wrote the front end of the user interface. LF designed the user interface experiment with KT, TT and WS as curators. KT led the materials-science work on the data with CS, TT and WS. KT, TA, YT and MI revised the paper. YT and MI supervised the work of the respective teams.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head>Appendix A. Evaluation</head><p>Table <ref type="table">A1</ref>. Timetable recording the time spent for each of the 15 articles. Each row indicates the time and the event (Start, Finish) from each of the curators: master student (MD), PhD student (PD), and senior researcher (SR). Duration is expressed in minutes. </p></div>			</div>
+			<div type="references">
+
+				<listBibl>
+
+<biblStruct xml:id="b0">
+	<analytic>
+		<title level="a" type="main">Automatic extraction of materials and properties from superconductors scientific literature</title>
+		<author>
+			<persName><forename type="first">L</forename><surname>Foppiano</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">P</forename><forename type="middle">B</forename><surname>Castro</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">P</forename><forename type="middle">O</forename><surname>Suarez</surname></persName>
+		</author>
+		<idno type="DOI">10.1080/27660400.2022.2153633</idno>
+	</analytic>
+	<monogr>
+		<title level="j">Sci Technol Adv Mater</title>
+		<imprint>
+			<biblScope unit="volume">3</biblScope>
+			<biblScope unit="issue">1</biblScope>
+			<biblScope unit="page">2153633</biblScope>
+			<date type="published" when="2023">2023</date>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b1">
+	<analytic>
+		<title level="a" type="main">Materials discovery with machine learning and knowledge discovery</title>
+		<author>
+			<persName><forename type="first">O</forename><forename type="middle">N</forename><surname>Oliveira</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">M</forename><forename type="middle">J</forename><surname>Oliveira</surname></persName>
+		</author>
+		<idno type="DOI">10.3389/fchem.2022.930369</idno>
+	</analytic>
+	<monogr>
+		<title level="j">Front Chem</title>
+		<imprint>
+			<biblScope unit="volume">10</biblScope>
+			<biblScope unit="page">10</biblScope>
+			<date type="published" when="2022">2022</date>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b2">
+	<analytic>
+		<title level="a" type="main">Commentary: the materials project: a materials genome approach to accelerating materials innovation</title>
+		<author>
+			<persName><forename type="first">A</forename><surname>Jain</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">S</forename><forename type="middle">P</forename><surname>Ong</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">G</forename><surname>Hautier</surname></persName>
+		</author>
+		<idno type="DOI">10.1063/1.4812323</idno>
+	</analytic>
+	<monogr>
+		<title level="j">APL Mater</title>
+		<imprint>
+			<biblScope unit="volume">1</biblScope>
+			<biblScope unit="issue">1</biblScope>
+			<biblScope unit="page">11002</biblScope>
+			<date type="published" when="2013">2013</date>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b3">
+	<analytic>
+		<title level="a" type="main">Aflow: an automatic framework for high-throughput materials discovery</title>
+		<author>
+			<persName><forename type="first">S</forename><surname>Curtarolo</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">W</forename><surname>Setyawan</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">G</forename><forename type="middle">L</forename><surname>Hart</surname></persName>
+		</author>
+		<ptr target="https://www.sciencedirect.com/science/article/pii/S0927025612000717"/>
+	</analytic>
+	<monogr>
+		<title level="j">Comput Mater Sci</title>
+		<imprint>
+			<biblScope unit="volume">58</biblScope>
+			<biblScope unit="page" from="218" to="226"/>
+			<date type="published" when="2012">2012</date>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b4">
+	<analytic>
+		<title level="a" type="main">The nomad laboratory: from data sharing to artificial intelligence</title>
+		<author>
+			<persName><forename type="first">C</forename><surname>Draxl</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">M</forename><surname>Scheffler</surname></persName>
+		</author>
+		<idno type="DOI">10.1088/2515-7639/ab13bb</idno>
+	</analytic>
+	<monogr>
+		<title level="j">J Phys Mater</title>
+		<imprint>
+			<biblScope unit="volume">2</biblScope>
+			<biblScope unit="issue">3</biblScope>
+			<biblScope unit="page">36001</biblScope>
+			<date type="published" when="2019">2019</date>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b5">
+	<analytic>
+		<title level="a" type="main">Global publication productivity in materials science research: a scientometric analysis</title>
+		<author>
+			<persName><forename type="first">T</forename><surname>Pratheepan</surname></persName>
+		</author>
+		<ptr target="https://ojs.trp.org.in/index.php/ijiss/article/view/583"/>
+	</analytic>
+	<monogr>
+		<title level="j">Indian J Inf Sources Serv</title>
+		<imprint>
+			<biblScope unit="volume">9</biblScope>
+			<biblScope unit="issue">1</biblScope>
+			<biblScope unit="page" from="111" to="116"/>
+			<date type="published" when="2019-02">2019 Feb</date>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b6">
+	<monogr>
+		<title level="m" type="main">The PAULING FILE project and materials platform for data science: from big data toward materials genome</title>
+		<author>
+			<persName><forename type="first">E</forename><surname>Blokhin</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">P</forename><surname>Villars</surname></persName>
+		</author>
+		<idno type="DOI">10.1007/978-3-319-42913-7_62-1</idno>
+		<imprint>
+			<date type="published" when="2018">2018</date>
+			<publisher>Springer International Publishing</publisher>
+			<biblScope unit="page" from="1" to="26"/>
+			<pubPlace>Cham</pubPlace>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b7">
+	<analytic>
+		<title level="a" type="main">Structuring superconductor data with ontology: reproducing historical datasets as knowledge bases</title>
+		<author>
+			<persName><forename type="first">M</forename><surname>Ishii</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">K</forename><surname>Sakamoto</surname></persName>
+		</author>
+		<idno type="DOI">10.1080/27660400.2023.2223051</idno>
+	</analytic>
+	<monogr>
+		<title level="j">Sci Technol Adv Mater</title>
+		<imprint>
+			<biblScope unit="volume">3</biblScope>
+			<biblScope unit="issue">1</biblScope>
+			<biblScope unit="page">2223051</biblScope>
+			<date type="published" when="2023">2023</date>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b8">
+	<analytic>
+		<title level="a" type="main">Predicting new superconductors and their critical temperatures using machine learning</title>
+		<author>
+			<persName><forename type="first">B</forename><surname>Roter</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">S</forename><surname>Dordevic</surname></persName>
+		</author>
+		<idno type="DOI">10.1016/j.physc.2020.1353689</idno>
+	</analytic>
+	<monogr>
+		<title level="j">Phys C</title>
+		<imprint>
+			<biblScope unit="volume">575</biblScope>
+			<biblScope unit="page">1353689</biblScope>
+			<date type="published" when="2020">2020</date>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b9">
+	<analytic>
+		<title level="a" type="main">Machine learning modeling of superconducting critical temperature</title>
+		<author>
+			<persName><forename type="first">V</forename><surname>Stanev</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">C</forename><surname>Oses</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">A</forename><surname>Kusne</surname></persName>
+		</author>
+		<idno type="DOI">10.1038/s41524-018-0085-8</idno>
+	</analytic>
+	<monogr>
+		<title level="j">Npj Comput Mater</title>
+		<imprint>
+			<biblScope unit="volume">4</biblScope>
+			<biblScope unit="issue">1</biblScope>
+			<biblScope unit="page">4</biblScope>
+			<date type="published" when="2017">2017</date>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b10">
+	<monogr>
+		<title level="m" type="main">Machine-learning approach for discovery of conventional superconductors</title>
+		<author>
+			<persName><forename type="first">H</forename><surname>Tran</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">T</forename><forename type="middle">N</forename><surname>Vu</surname></persName>
+		</author>
+		<idno>arXiv:221103265. 2022</idno>
+		<imprint/>
+	</monogr>
+	<note type="report_type">arXiv preprint</note>
+</biblStruct>
+
+<biblStruct xml:id="b11">
+	<analytic>
+		<title level="a" type="main">Deep learning model for finding new superconductors</title>
+		<author>
+			<persName><forename type="first">T</forename><surname>Konno</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">H</forename><surname>Kurokawa</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">F</forename><surname>Nabeshima</surname></persName>
+		</author>
+		<idno type="DOI">10.1103/PhysRevB.103.014509</idno>
+	</analytic>
+	<monogr>
+		<title level="j">Phys Rev B</title>
+		<imprint>
+			<biblScope unit="volume">103</biblScope>
+			<biblScope unit="issue">1</biblScope>
+			<biblScope unit="page">14509</biblScope>
+			<date type="published" when="2021">2021</date>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b12">
+	<analytic>
+		<title level="a" type="main">The INCEpTION platform: machine-assisted and knowledge-oriented interactive annotation</title>
+		<author>
+			<persName><forename type="first">J</forename><forename type="middle">C</forename><surname>Klie</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">M</forename><surname>Bugert</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">B</forename><surname>Boullosa</surname></persName>
+		</author>
+		<ptr target="https://www.aclweb.org/anthology/C18-2002"/>
+	</analytic>
+	<monogr>
+		<title level="m">Proceedings of the 27th International Conference on Computational Linguistics: System Demonstrations</title>
+		<meeting>the 27th International Conference on Computational Linguistics: System Demonstrations<address><addrLine>Santa Fe, New Mexico</addrLine></address></meeting>
+		<imprint>
+			<date type="published" when="2018">2018</date>
+			<biblScope unit="page" from="5" to="9"/>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b13">
+	<analytic>
+		<title level="a" type="main">Doccano: text annotation tool for human</title>
+		<author>
+			<persName><forename type="first">H</forename><surname>Nakayama</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">T</forename><surname>Kubo</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">J</forename><surname>Kamura</surname></persName>
+		</author>
+		<ptr target="https://github.com/doccano/doccano"/>
+	</analytic>
+	<monogr>
+		<title level="j">Software</title>
+		<imprint>
+			<date type="published" when="2018">2018</date>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b14">
+	<analytic>
+		<title level="a" type="main">Python materials genomics pymatgen: a robust open-source python library for materials analysis</title>
+		<author>
+			<persName><forename type="first">S</forename><forename type="middle">P</forename><surname>Ong</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">W</forename><forename type="middle">D</forename><surname>Richards</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">A</forename><surname>Jain</surname></persName>
+		</author>
+		<idno type="DOI">10.1016/j.commatsci.2012.10.028</idno>
+	</analytic>
+	<monogr>
+		<title level="j">Comput Mater Sci</title>
+		<imprint>
+			<biblScope unit="volume">68</biblScope>
+			<biblScope unit="issue">2</biblScope>
+			<biblScope unit="page" from="314" to="319"/>
+			<date type="published" when="2013">2013</date>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b15">
+	<monogr>
+		<title level="m" type="main">Text-mined dataset of inorganic materials synthesis recipes. Sci Data</title>
+		<author>
+			<persName><forename type="first">O</forename><surname>Kononova</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">H</forename><surname>Huo</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">T</forename><surname>He</surname></persName>
+		</author>
+		<idno type="DOI">10.1038/s41597-019-0224-1</idno>
+		<idno>41597-019-0224-1</idno>
+		<ptr target="https://doi.org/10.1038/s"/>
+		<imprint>
+			<date type="published" when="2019-10">2019 Oct</date>
+			<biblScope unit="volume">6</biblScope>
+			<biblScope unit="page">203</biblScope>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b16">
+	<analytic>
+		<title level="a" type="main">Label studio: data labeling software; 2020-2022</title>
+		<author>
+			<persName><forename type="first">M</forename><surname>Tkachenko</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">M</forename><surname>Malyuk</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">A</forename><surname>Holmanyuk</surname></persName>
+		</author>
+		<ptr target="https://github.com/heartexlabs/label-studio"/>
+	</analytic>
+	<monogr>
+		<title level="m">Open source software</title>
+		<imprint/>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b17">
+	<analytic>
+		<title level="a" type="main">Supermat: construction of a linked annotated dataset from superconductors-related publications</title>
+		<author>
+			<persName><forename type="first">L</forename><surname>Foppiano</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">S</forename><surname>Dieb</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">A</forename><surname>Suzuki</surname></persName>
+		</author>
+		<idno type="DOI">10.1080/27660400.2021.1918396</idno>
+	</analytic>
+	<monogr>
+		<title level="j">Sci Technol Adv Mater: Methods</title>
+		<imprint>
+			<biblScope unit="volume">1</biblScope>
+			<biblScope unit="issue">1</biblScope>
+			<biblScope unit="page" from="34" to="44"/>
+			<date type="published" when="2021">2021</date>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b18">
+	<analytic>
+		<title level="a" type="main">SciBERT: a pretrained language model for scientific text</title>
+		<author>
+			<persName><forename type="first">I</forename><surname>Beltagy</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">K</forename><surname>Lo</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">A</forename><surname>Cohan</surname></persName>
+		</author>
+		<ptr target="https://aclanthology.org/D19-1371"/>
+	</analytic>
+	<monogr>
+		<title level="m">Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing</title>
+		<meeting>the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing<address><addrLine>Hong Kong; China</addrLine></address></meeting>
+		<imprint>
+			<publisher>Association for Computational Linguistics</publisher>
+			<date type="published" when="2019-11">Nov. 2019</date>
+			<biblScope unit="page" from="3615" to="3620"/>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b19">
+	<analytic>
+		<title/>
+		<ptr target="https://github.com/kermitt2/delft"/>
+	</analytic>
+	<monogr>
+		<title level="j">DeLFT contributors. Delft</title>
+		<imprint>
+			<date type="published" when="2018">2018-2023</date>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b20">
+	<analytic>
+		<title level="a" type="main">Overcoming catastrophic forgetting in neural networks</title>
+		<author>
+			<persName><forename type="first">J</forename><surname>Kirkpatrick</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">R</forename><surname>Pascanu</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">N</forename><forename type="middle">C</forename><surname>Rabinowitz</surname></persName>
+		</author>
+		<idno>abs/1612.00796</idno>
+		<ptr target="http://arxiv.org/abs/1612.00796"/>
+	</analytic>
+	<monogr>
+		<title level="j">CoRr</title>
+		<imprint>
+			<date type="published" when="2016">2016</date>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b21">
+	<monogr>
+		<title/>
+		<author>
+			<persName><forename type="first">G</forename><surname>Contributors</surname></persName>
+		</author>
+		<author>
+			<persName><surname>Grobid</surname></persName>
+		</author>
+		<ptr target="https://github.com/kermitt2/grobid"/>
+		<imprint>
+			<date type="published" when="2008">2008 -2023</date>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+				</listBibl>
+			</div>
+		</back>
+	</text>
+</TEI>
\ No newline at end of file