From e8403a0783b4e9a0d37688a3962b86bc948fff2c Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Wed, 8 May 2024 16:27:12 +0900 Subject: [PATCH 1/7] Avoid adding spaces introduced before a punctuation, adding tests --- build.gradle | 3 + .../org/pub2tei/document/XMLUtilities.java | 59 +- .../document/XMLUtilitiesIntegrationTest.java | 96 ++ .../document/document1.segmented.tei.xml | 1413 +++++++++++++++++ .../org/pub2tei/document/document1.tei.xml | 540 +++++++ .../document/document2.segmented.tei.xml | 760 +++++++++ .../org/pub2tei/document/document2.tei.xml | 760 +++++++++ 7 files changed, 3610 insertions(+), 21 deletions(-) create mode 100644 src/test/java/org/pub2tei/document/XMLUtilitiesIntegrationTest.java create mode 100644 src/test/resources/org/pub2tei/document/document1.segmented.tei.xml create mode 100644 src/test/resources/org/pub2tei/document/document1.tei.xml create mode 100644 src/test/resources/org/pub2tei/document/document2.segmented.tei.xml create mode 100644 src/test/resources/org/pub2tei/document/document2.tei.xml diff --git a/build.gradle b/build.gradle index 358e1da..cc726d4 100644 --- a/build.gradle +++ b/build.gradle @@ -106,6 +106,9 @@ dependencies { implementation "io.dropwizard.metrics:metrics-core:4.0.5" implementation "io.dropwizard.metrics:metrics-servlets:4.0.5" + testImplementation "org.xmlunit:xmlunit-matchers:2.10.0" + testImplementation "org.xmlunit:xmlunit-legacy:2.10.0" + implementation 'org.slf4j:slf4j-api:1.7.30' implementation 'ch.qos.logback:logback-classic:1.2.3' diff --git a/src/main/java/org/pub2tei/document/XMLUtilities.java b/src/main/java/org/pub2tei/document/XMLUtilities.java index 9b431a5..845f5e6 100644 --- a/src/main/java/org/pub2tei/document/XMLUtilities.java +++ b/src/main/java/org/pub2tei/document/XMLUtilities.java @@ -1,26 +1,35 @@ package org.pub2tei.document; -import java.io.*; -import java.util.*; -import javax.xml.parsers.*; -import javax.xml.transform.*; -import javax.xml.transform.dom.DOMSource; -import 
javax.xml.transform.stream.StreamResult; -import javax.xml.namespace.NamespaceContext; -import javax.xml.xpath.*; - import net.sf.saxon.om.NameChecker; - -import org.w3c.dom.*; -import org.xml.sax.*; - +import org.apache.commons.collections.CollectionUtils; +import org.apache.commons.io.FileUtils; +import org.apache.commons.lang3.StringUtils; import org.grobid.core.utilities.OffsetPosition; import org.grobid.core.utilities.SentenceUtilities; - -import org.apache.commons.io.FileUtils; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.w3c.dom.*; +import org.xml.sax.ErrorHandler; +import org.xml.sax.InputSource; +import org.xml.sax.SAXParseException; + +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.transform.OutputKeys; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerException; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; +import javax.xml.xpath.XPath; +import javax.xml.xpath.XPathConstants; +import javax.xml.xpath.XPathExpression; +import javax.xml.xpath.XPathFactory; +import java.io.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.regex.Pattern; /** * Some convenient methods for suffering a bit less with XML @@ -222,11 +231,19 @@ public static void segment(org.w3c.dom.Document doc, Node node) { (textualElements.contains(n.getNodeName())) ) { // text content - StringBuffer textBuffer = new StringBuffer(); + StringBuilder textBuffer = new StringBuilder(); NodeList childNodes = n.getChildNodes(); for(int y=0; y 0) { + String firstChar = "" + serialize(doc, item).charAt(0); + //We might need to use TextUtilities.fullPunctuation + if (!Pattern.matches("\\p{Punct}", firstChar)) { + textBuffer.append(" "); + } + } + + textBuffer.append(serialize(doc, item)); } String text = textBuffer.toString(); List 
theSentenceBoundaries = SentenceUtilities.getInstance().runSentenceDetection(text); @@ -238,8 +255,8 @@ public static void segment(org.w3c.dom.Document doc, Node node) { //System.out.println("new chunk: " + sent); String sent = text.substring(sentPos.start, sentPos.end); String newSent = sent; - if (toConcatenate.size() != 0) { - StringBuffer conc = new StringBuffer(); + if (CollectionUtils.isNotEmpty(toConcatenate)) { + StringBuilder conc = new StringBuilder(); for(String concat : toConcatenate) { conc.append(concat); conc.append(" "); diff --git a/src/test/java/org/pub2tei/document/XMLUtilitiesIntegrationTest.java b/src/test/java/org/pub2tei/document/XMLUtilitiesIntegrationTest.java new file mode 100644 index 0000000..91645fd --- /dev/null +++ b/src/test/java/org/pub2tei/document/XMLUtilitiesIntegrationTest.java @@ -0,0 +1,96 @@ +package org.pub2tei.document; + +import org.grobid.core.main.GrobidHomeFinder; +import org.grobid.core.utilities.GrobidProperties; +import org.junit.Before; +import org.junit.Test; +import org.xml.sax.InputSource; +import org.xmlunit.matchers.CompareMatcher; + +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import java.io.InputStream; +import java.io.StringReader; +import java.util.Arrays; + +import static org.hamcrest.MatcherAssert.assertThat; + +public class XMLUtilitiesIntegrationTest { + + @Before + public void setUp() throws Exception { + //This test requires to have grobid deployed somewhere under these directories + GrobidHomeFinder finder = new GrobidHomeFinder(Arrays.asList("../grobid-home", "../../grobid/grobid-home")); // java.util list instead of the internal com.sun.tools.javac.util.List + GrobidProperties.getInstance(finder); + } + + @Test + public void testSegment_chunk_shouldInjectSegmentCorrectly() throws Exception { + String input = "
" + + "
" + + "Acknowledgements" + + "

Our warmest thanks to Patrice Lopez, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions. We thank Pedro Baptista de Castro for his support during this work. Special thanks to Erina Fujita for useful tips on the manuscript.

" + + "
" + + "
"; + + String expected = "
\n" + + "\t
\n" + + "\t\tAcknowledgements\n" + + "\t\t

\n" + + "\t\t\tOur warmest thanks to Patrice Lopez, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions.\n" + + "\t\t\tWe thank Pedro Baptista de Castro for his support during this work.\n" + + "\t\t\tSpecial thanks to Erina Fujita for useful tips on the manuscript.\n" + + "\t\t

\n" + + "\t
\n" + + "
"; + + + DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); + factory.setNamespaceAware(false); + DocumentBuilder builder = factory.newDocumentBuilder(); + + org.w3c.dom.Document document = builder.parse(new InputSource(new StringReader(input))); + + XMLUtilities.segment(document, document.getDocumentElement()); + + assertThat(XMLUtilities.serialize(document, document.getDocumentElement()), CompareMatcher.isIdenticalTo(expected.replace("\t"," "))); + } + + + @Test + public void testSegment_document1_shouldInjectSegmentCorrectly() throws Exception { + DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); + factory.setNamespaceAware(false); + DocumentBuilder builder = factory.newDocumentBuilder(); + + InputStream resourceAsStream = this.getClass().getResourceAsStream("document1.tei.xml"); + org.w3c.dom.Document document = builder.parse(new InputSource(resourceAsStream)); + + InputStream resourceAsStreamSegmented = this.getClass().getResourceAsStream("document1.segmented.tei.xml"); // expected output: the pre-segmented fixture, not the raw input again + org.w3c.dom.Document documentSegmented = builder.parse(new InputSource(resourceAsStreamSegmented)); + + XMLUtilities.segment(document, document.getDocumentElement()); + String documentResult = XMLUtilities.serialize(document, document.getDocumentElement()); + String documentExpected = XMLUtilities.serialize(documentSegmented, documentSegmented.getDocumentElement()); // serialize the expected DOM's own root so the assertion is not a self-comparison + assertThat(documentResult, CompareMatcher.isIdenticalTo(documentExpected)); + } + + @Test + public void testSegment_document2_shouldInjectSegmentCorrectly() throws Exception { + DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); + factory.setNamespaceAware(false); + DocumentBuilder builder = factory.newDocumentBuilder(); + + InputStream resourceAsStream = this.getClass().getResourceAsStream("document2.tei.xml"); + org.w3c.dom.Document document = builder.parse(new InputSource(resourceAsStream)); + + InputStream resourceAsStreamSegmented = 
this.getClass().getResourceAsStream("document2.segmented.tei.xml"); + org.w3c.dom.Document documentSegmented = builder.parse(new InputSource(resourceAsStreamSegmented)); + + XMLUtilities.segment(document, document.getDocumentElement()); + String documentResult = XMLUtilities.serialize(document, document.getDocumentElement()); + String documentExpected = XMLUtilities.serialize(documentSegmented, documentSegmented.getDocumentElement()); // serialize the expected DOM's own root so the assertion is not a self-comparison + assertThat(documentResult, CompareMatcher.isIdenticalTo(documentExpected)); + } + +} \ No newline at end of file diff --git a/src/test/resources/org/pub2tei/document/document1.segmented.tei.xml b/src/test/resources/org/pub2tei/document/document1.segmented.tei.xml new file mode 100644 index 0000000..aade875 --- /dev/null +++ b/src/test/resources/org/pub2tei/document/document1.segmented.tei.xml @@ -0,0 +1,1413 @@ + + + + + + Satellite data reveals a recent increase in shifting cultivation and associated carbon emissions in Laos + + NASA + + + Commercialisation of Rice Farming in the Lower Mekong Basin (Palgrave Macmillan + + + NASA Carbon Monitoring System + + + unknown + + + + + + + + 13 October 2023 + + + + + + + Shijuan + Chen + + shijuan.chen@yale.edu + + Yale School of the Environment + Yale University
+ New Haven + CT + United States of America +
+
+ + Department of Earth and Environment + Boston University +
+ Boston + MA + United States of America +
+
+
+ + + Curtis + E + Woodcock + + + Department of Earth and Environment + Boston University +
+ Boston + MA + United States of America +
+
+
+ + + Thatheva + Saphangthong + + + Department of Agriculture Land Management + Ministry of Agriculture and Forestry +
+ Vientiane + Laos +
+
+
+ + + Pontus + Olofsson + + + Department of Earth and Environment + Boston University +
+ Boston + MA + United States of America +
+
+ + NASA Marshall Space Flight Center +
+ Huntsville + AL + United States of America +
+
+
+ Satellite data reveals a recent increase in shifting cultivation and associated carbon emissions in Laos +
+ + + 13 October 2023 + + + 17112CCE7BFA5F63FB9BFE897A9E1A85 + 10.1088/1748-9326/acffdd +
+
+
+ + + + GROBID - A machine learning software for extracting information from scholarly documents + + + + + + + + shifting cultivation + shifting agriculture + slash and burn + swidden agriculture + forest degradation + carbon emissions + deforestation + + + +
+

+ Although shifting cultivation is the major land use type in Laos, the spatial-temporal patterns and the associated carbon emissions of shifting cultivation in Laos are largely unknown. + This study provides a nationwide analysis of the spatial-temporal patterns of shifting cultivation and estimations of the associated carbon emissions in Laos over the last three decades. + This study found that shifting cultivation has been expanding and intensifying in Laos, especially in the last 5 years. + The newly cultivated land from 2016 to 2020 accounted for 4.5% (±1.2%) of the total land area of Laos. + Furthermore, the length of fallow periods has been continuously declining, indicating that shifting cultivation is becoming increasingly intensive. + Combining biomass derived from Global Ecosystem Dynamics Investigation and shifting cultivation maps and area estimates, we found that the net carbon emissions from shifting cultivation declined in 2001-2015 but increased in 2016-2020. + The largest carbon source is conversion from intact forests to shifting cultivation, which contributed to 89% of the total emissions from 2001 to 2020. + In addition, there were increased emissions from intensified use of fallow lands. + This research provides useful information for policymakers in Laos to understand the changes in shifting cultivation and improve land use management. + This study not only supports Reducing Emissions from Deforestation and Forest Degradation reporting for Laos but also provides a methodology for tracking carbon emissions and removals of shifting cultivation. +

+
+
+
+
+ + +
+ Introduction +

+ Shifting cultivation is an agricultural practice where farmers routinely move from one plot to another for cultivation. + It begins with the practice of 'slash-andburn' , where trees and woody plants are cut down and burnt to prepare an ash-fertilized plot for temporary cultivation. + After short-term cultivation, the plot is abandoned, which allows the vegetation to recover. + Shifting cultivation is the predominant land use and a major cause of forest degradation and deforestation in some tropical countries (Heinimann et al 2017 + , Curtis et al 2018 + , Jiang et al 2022), such as Laos (Chen et al 2023), and the Democratic Republic of Congo (Molinario et al 2015). + Monitoring shifting cultivation is complicated, because it is highly dynamic, and the area affected by each slash-and-burn event is small. + Due to the difficulty of monitoring shifting cultivation, spatially and temporally explicit information on shifting cultivation is scarce. +

+

+ Shifting cultivation has both short-term and long-term effects on carbon emissions (Ziegler et al 2012). + In the short term, the slash-and-burn activities cause immediate release of carbon. + In the long term, encroachment of shifting cultivation into primary forest and intensified use of secondary forest both lead to long-term increases in net carbon emissions and degradation of ecosystems. + Carbon emissions from shifting cultivation have not been well quantified, because of the lack of methodology for monitoring shifting cultivation and tracking the associated carbon dynamics. + In contrast to deforestation (such as urbanization), which does not involve carbon sequestration, shifting cultivation involves both carbon emissions associated with slash-and-burn activities and carbon sequestration during the fallow period. + Due to the complexity of monitoring shifting cultivation and tracking the associated carbon dynamics, estimates of carbon emissions or sequestration from shifting cultivation are usually unavailable in REDD+ (Reducing Emissions from Deforestation and Forest Degradation) reporting. +

+

+ In Laos, officially the Lao People's Democratic Republic (Lao PDR), shifting cultivation is an important agricultural system (Roder 2001 + , Douangsavanh et al 2006 + , Epprecht et al 2018 + , Manivong and Cramb 2020) and the major driver of forest dynamics (Curtis et al 2018 + , Chen et al 2023). + It is estimated that shifting cultivation affected 32.9 ± 1.9% of Laos from 1991 to 2020, and the shifting cultivation activities increased in the most recent 5 years (Chen et al 2023). + Laos' population has been increasing steadily from 4.314 million in 1990 to 7.319 million in 2020 (World Bank 2023), whereas upland rice yields did not distinctly improve between 1990 and 2020. + Shifting cultivation activities are expected to increase due to the increasing demand for rice. + Monitoring shifting cultivation and analyzing its patterns are important to understand the forest cover change in Laos and relevant to achieving Laos' goal of increasing forest cover to 70% (The current forest cover is 62%) (The Government of Lao PDR 2005). + Since there were few spatially and temporally explicit maps and estimates of shifting cultivation before Chen et al (2023), carbon emissions from shifting cultivation have not been accurately estimated in the REDD+ reporting of Laos (Department of Forestry, Ministry of Agriculture and Forestry, Lao PDR 2018). +

+

+ Spatially and temporally explicit information about shifting cultivation in Laos was unavailable until recently (Chen et al 2023), and a comprehensive national-scale analysis of the spatial and temporal patterns of shifting cultivation has not been conducted to date. + A traditional approach for mapping shifting cultivation is to create landscape mosaics based on a land cover map of a single year (Messerli et al 2009 + , Silva et al 2011 + , Hett et al 2012 + , Hurni et al 2013a). + It is impossible to analyze the temporal patterns of shifting cultivation using this traditional approach. + Another approach is to use multi-temporal land cover data to map shifting cultivation (Leisz and Rasmussen 2012 + , Molinario et al 2015 + , Department of Forestry, Ministry of Agriculture and Forestry, Lao PDR 2018 + , Adhikary et al 2019 + , Kurien et al 2019). + In previous studies, the temporal resolution of the land cover maps was not high enough to support the analysis of temporal patterns (Heinimann et al 2013). + Recently, Chen et al (2023) used satellite data to create shifting cultivation products for Laos with sufficient temporal frequency (annual) and spatial resolution (30 m) to support a nationalscale spatial-temporal analysis. + The recently launched GEDI (Global Ecosystem Dynamics Investigation) mission provides new opportunities for estimating biomass at a large scale (Tang et al 2020). +

+

+ This study used the map products and reference data in Chen et al (2023), combined with GEDI, to conduct a national-scale analysis of the spatial and temporal patterns and carbon dynamics of shifting cultivation in Laos. + The goal is to understand the spatial and temporal patterns of shifting cultivation and the associated carbon emissions, in support of decision-making to reduce carbon emissions and promote sustainable livelihoods depending on shifting cultivation. +

+
+
+ Method +
+
+ Mapping shifting cultivation +

+ Shifting cultivation was mapped using Landsat data from 1987 to 2020 on Google Earth Engine (Chen et al 2023). + CCDC-SMA (continuous change detection and classification-spectral mixture analysis) (Zhu and Woodcock 2014, Bullock et al 2020, Chen et al 2021) was used to detect forest disturbances in Laos. + CCDC-SMA fits harmonic models to fractions of endmembers and NDFI (Normalized Difference Fraction Index) (Souza et al 2005) to monitor forest disturbances (figure 1). + Annual maps of Shifting Cultivation from 1991 to 2020 were created by combining time series analysis, object-based image analysis, and post-disturbed land-cover classification. + A total of 1000 sample units under simple random sampling were used as reference data for accuracy assessment and area estimation. + For each sample unit, at least two interpreters interpreted the land change class and the year of each slash-and-burn event by examining high-resolution satellite imagery and Landsat time series (figures 1(a), (c) and 2). + During 1991-2020, shifting cultivation was the main type of forest disturbance in Laos, affecting 32.9 ± 1.9% of Laos (Chen et al 2023). + Shifting cultivation was mapped with a producer's accuracy of 88% and a user's accuracy of 80% (Chen et al 2023). + + Chen et al (2023) describes more details of the monitoring method. + Both the map products and the reference data from Chen et al (2023) were used in this study. +

+
+
+ Spatial-temporal patterns of shifting cultivation +

+ The annual maps of shifting cultivation and the reference sample units interpreted as Shifting Cultivation were used to investigate the patterns of shifting cultivation. + We estimated the area of shifting cultivation at different fallow and disturbance 55 ′ 27 ′′ E. In the time series plot, the blue points are the Landsat observations. + In the Landsat images (Red-green-blue), the yellow squares show the pixel location. + In the high-resolution image, the white point shows the pixel location.). +

+

+ to explore whether the extent of shifting cultivation expanded, the newly and previously cultivated areas of shifting cultivation were estimated using reference sample units and maps in Chen et al (2023) for every 5 year period from 2000 to 2020. + In the reference sample points, whether a pixel is newly or previously cultivated is determined by the year of slash-andburn recorded by the interpreters (e.g. + figures 1(a), (c) and 2). +

+

+ Furthermore, to investigate the change patterns in fallow length and cultivation length (length of cropping period), we visually interpreted Landsat time series, Landsat imagery, and high-resolution images for 196 sample points (figure 3 as an example). + These sample points are the points with at least two cultivation events in the aforementioned reference data with 1000 simple random sample points. + For each point, the year of slash and burn (land clearing), cultivation length, and fallow length are recorded for every event. +

+
+
+ Carbon emission/removal +

+ The GEDI mission provides space-borne LiDAR data to estimate aboveground biomass (Healey et al 2020). + GEDI's L4A Footprint Level Aboveground Biomass Density (AGBD) (version 2.1) 25 m data (Beck et al 2020 + , Dubayah et al 2022) were used to explore the effect of shifting cultivation on biomass. + GEDI data collected in 2020 was used because it was the only year of data with good spatial coverage when the study was conducted. + To overlay the GEDI footprint and Landsat, for each GEDI footprint, we extracted the value of the 30 m pixel in the Landsat-based map that has the largest overlap with the 25 m footprint. + Only lidar observations with good quality (using the 'quality_flag' band and the 'degrade_flag' band) and collected at places with a slope less than 20 • and in the interior of shifting cultivation sites (excluding a two-pixel edge) were used, to eliminate the effect of terrain and possible misregistration at the edges of slash-and-burn events. + The reason why we excluded lidar points with slopes larger than 20 • is that GEDIbased biomass estimates tend to be overestimated at steep terrain. + AGBD was calculated for Active Shifting Cultivation, Inactive Shifting Cultivation, Intact Forest, and Others. + Intact Forest here is defined as forests without significant anthropogenic disturbances. + The relationship between AGBD and years of regrowth since the latest slash-and-burn events was analyzed. + The hypothesis was that AGBD has a positive relationship with years of regrowth since the latest slash-andburn activity. + From this relationship, a country-level growth curve of AGBD can be developed and used to estimate the biomass of fallow lands. +

+

+ Carbon emissions from shifting cultivation were estimated for every 5 year period from 2001 to 2020. + Table 1 shows the activity classes, definitions, and emission factors. + New Shifting Cultivation area was estimated from a sampling-based method The emission factors for activities other than New Shifting Cultivation are spatially explicit and were determined by the map of the latest year of slash and burn and the growth curve. + Figure 8 shows an example of the spatially explicit emission factors for different activities. + Specifically, this was how the carbon emissions and removals of Fallow land -> Fallow land, Fallow land -> Cleared land, and Cleared land -> Fallow land were calculated: The latest year of disturbance of Fallow land was determined using the annual shifting cultivation maps. + Then, the AGBD of fallow lands was calculated using equation (1). + Using AGBD of fallow land in the end year minus AGBD in the start year of each period, the differences in AGBD were obtained. + Multiply the differences in AGBD by the area of different activities and then multiply it by the conversion factor (0.5), and the carbon emissions and removals of each activity were calculated. + The average emission/removal factors were calculated using the emissions and removals divided by the total area of activities in different categories. +

+
+
+ Results +
+
+ Spatial-temporal patterns of shifting cultivation +

+ A large proportion of the land used for shifting cultivation in Laos remains in use. + During our study period, the estimated area of Active Shifting Cultivation (19.1 ± 1.6%) exceeded the area of Inactive Shifting Cultivation (13.7 ± 1.8%). + In the future, there is a possibility of reusing Inactive Shifting Cultivation and further increasing the area of Active Shifting Cultivation, given the increasing demand for crops. + New Shifting Cultivation, defined as shifting cultivation that first occurred in each period, was estimated from 2001 to 2020 by period (figure 4). + The area estimates were aggregated into 5 year periods instead of calculating annual to reduce uncertainties of the area estimates. + From 1991 to 2000, it is difficult to tell whether the shifting cultivation areas were new or old, and thus this analysis started in 2001. + In all 5 year periods, the area of New Shifting Cultivation is higher than 3% of Laos, implying that on average, over 0.6% of Laos' land area is converted from intact forest to shifting cultivation each year. + Our results indicate that the extent of shifting cultivation has been expanding. +

+

+ During 2001-2015, there was a decrease in the area of New Shifting Cultivation. + However, both the area of New Shifting Cultivation and the total area of Shifting Cultivation have increased significantly in 2016-2020. + The proportion of previously and newly cultivated to the total area of shifting cultivation was calculated for every year using the annual maps (figure 5). + Before 2007, the newly cultivated areas were larger than the previously cultivated, and the trend reversed after 2007. + There was a general decreasing trend in the proportion of New Shifting Cultivation, but increases were observed in 2019 and 2020. + We suppose that the general decreasing trend is because intact forests available for cultivation decreased over time and previously cultivated land is easier to clear for future cultivation. +

+

+ Based on the sample interpretation results, most cultivation lengths are either one year or two years. + Although there are variations across the years, we have not seen major changes in average cultivation length (figures S1 and S2). + The mean length of the fallow periods of shifting cultivation in Laos is 6.5 years, which is close to the length of fallow periods reported in the literature (7 years) (Department of Forestry, Ministry of Agriculture and Forestry, Lao PDR 2018). + The fallow length has been continuously declining (figure 6). + The reduction in the length of fallow periods indicates that shifting cultivation has intensified. +

+
+
+ Growth curve of fallow lands +

+ The AGBD was lower in shifting cultivation regions than in the intact forests. + The median AGBD of Intact Forest, Inactive Shifting Cultivation, Active Shifting Cultivation, and Others are 151.9 + Mg ha -1 , 87.9 Mg ha -1 , 39.5 Mg ha -1 , and 22.8 Mg ha -1 , respectively. + The biomass of Inactive Shifting Cultivation only reached about 60% of that of the intact forest. + In the literature (Department of Forestry, Ministry of Agriculture and Forestry, Lao PDR 2018), the regions of Inactive Shifting Cultivation were considered to be 'recovered' , whereas our results show that the AGBD is not recovered even if these regions have been left for fallow for at least seven years. + To investigate the relationship between median AGBD and disturbance history, a logarithmic regression was conducted on years of regrowth since the latest slash-and-burn events and median AGBD of GEDI footprints (figure 7). + The logarithmic model of years of regrowth (x) and AGBD (y) is (R square is 0.93): y = 29.129 + ln (x) + 9.907 +

+

+ (1) +

+

+ AGBD was strongly correlated with years of regrowth. + Equation ( 1) and the maps of years of regrowth were used to calculate the biomass of fallow lands and spatially explicit emission/removal factors (figure 8). in this period (figure 5) and the decrease in carbon sink of fallow lands in this period. + For every period, New Shifting Cultivation is the largest carbon source, contributing to more than 80% of the total emissions. + From 2001 to 2020, New Shifting Cultivation contributed to 89% of the total emissions. + Fallow lands are important carbon sinks and sequestered about 70% of the total emissions during 2006-2015. + However, carbon sequestration of fallow lands also decreased in recent years because of the intensified use of fallow land. + To summarize, the increase in emissions from shifting cultivation encroachment to intact forests (New Shifting Cultivation) and intensified use of secondary forests both led to the recent increase in net emissions from shifting cultivation. +

+
+
+ Carbon emissions from shifting cultivation +
+
+ Discussion +

+ In this study, the spatial-temporal patterns and the carbon dynamics of shifting cultivation in Laos were analyzed. + The results showed that shifting cultivation has been expanding and intensifying. + The area of shifting cultivation has increased significantly over the last 5 years. + The fallow length has been declining continuously, which indicates the intensification of shifting cultivation. + Our finding of a reduction of fallow length is consistent with previous local studies (Rasul and Thapa 2003 + , Saphangthong and Kono 2009 + , van Vliet et al 2012). + We found that AGBD was strongly correlated with years of regrowth since the latest year of slash-and-burn activities, which can be 2019) is understandable since their major focus is forest loss instead of shifting cultivation. + This comparison is not a criticism of the aforementioned studies. + Instead, it highlights the benefits of using shifting cultivation maps and reference samples with better spatial resolution and high temporal frequency for the analysis of spatial-temporal patterns. +

+

+ We compared our area estimates of New Shifting Cultivation with the official forest change statistics from Laos (table S1). + The Laos official forest change maps (https://nfms.maf.gov.la/) are created from the land cover classification maps from the start year and end year for each period (see the periods in table S1). + Since shifting cultivation is the major driver of forest degradation and deforestation in Laos, we expect that there are some consistencies between the areas of New Shifting cultivation and the areas of forest degradation and deforestation. + There are consistencies in the period 2006-2010 and 2011-2015, with the differences between our estimates and the official statistics both less than 1% of Laos. + Our estimates of New Shifting Cultivation are generally higher than the Laos official estimates of deforestation and forest degradation, except for 2006-2010. + This was partly due to the different monitoring approaches. + Without using dense time series, the shifting cultivation events that occurred over five years may be difficult to detect using two classification maps from the start and the end. + In the period 2001-2005 and 2016-2020, our estimates are about 2%-3% higher than the official estimates. + For 2016-2020, the discrepancy is partly because the 2019 and 2020 changes are included in our estimates but not in the official statistics. + Overall, our results and area estimates provide valuable information regarding the forest dynamics of Laos. +

+

+ Furthermore, we compared the shifting cultivation map with the field survey data in the Laos National Forest Monitoring Systemhttps://nfms. maf.gov.la/. + The shifting cultivation map was compared with 39 field points identified as 'Regenerating Vegetation' or 'Upland crop' in 2010 + , 2011 + , 2012 + , or 2019, since these two land cover classes are generally considered to have an association with shifting cultivation practices (Department of Forestry 2020). + The 31 out of 39 (80%) points are correctly mapped as shifting cultivation. +

+

+ As a national-level analysis of spatial-temporal patterns and estimation of carbon dynamics of shifting cultivation in Laos, our research is valuable to sustainable land resource management. + The sustainability of the land is negatively impacted by the recent expansion and intensification of shifting cultivation, indicated by an increase in newly cultivated areas in 2016-2020 and a reduction of fallow length in 1991-2020. + Moreover, our research provides a quantitative analysis of carbon emissions of shifting cultivation, which is crucial for REDD+ reporting in Laos. + Our research indicates that carbon emissions from shifting cultivation can be quantified by combining GEDI data with shifting cultivation maps and area estimates. + The fallow land sequestrated a significant amount of carbon in the past, but this carbon sink declined in recent years. + The recent increase in new shifting cultivation events also led to an increase in net carbon emissions. + This highlights the importance of protecting the primary forest from the encroachment of new shifting cultivation and the restoration of old fallow lands. +

+

+ Our study has several limitations and future research can make improvements by using more sophisticated models and integration with other data. + The first limitation is the usage of GEDI data. + Our research only used GEDI in one year (2020), because GEDI is a new mission and 2020 was the only year with good coverage data when the study was conducted. + Future studies can use GEDI for multiple years as more data will be collected. + In addition, we excluded GEDI points where the slope is larger than 20° to avoid overestimation of biomass in steep terrain. + This would introduce regional bias on the growth curve and emission factors. + Based on our map, 69% of the shifting cultivation area is in places with slopes less than 20° (Chen 2022). + Future research should improve GEDI biomass estimates in steep terrain. + Second, although we compared our map with some field survey data in Laos, the field data information for each location is limited. + Future studies should collect more detailed information on shifting cultivation in field surveys, especially biomass in shifting cultivation landscapes (e.g. + Salinas-Melgoza et al 2017, Borah et al 2018 + , Gogoi et al 2020 ). + Third, the carbon estimation only considered aboveground biomass change and no other carbon pools due to a lack of field survey data on those carbon pools. + Future research can conduct field surveys on belowground biomass and include the belowground carbon pools in carbon emission estimation. + Fourth, future research should investigate the causes of the recent increase in shifting cultivation, which requires field surveys. +

+
+
+ Conclusion +

+ Our research provides a national-level analysis of spatial-temporal patterns and estimation of carbon dynamics of shifting cultivation in Laos. + Our analysis shows that shifting cultivation in Laos has been expanding and intensifying, particularly in the recent 5 years. + The practice of shifting cultivation has become increasingly intensive as the length of the fallow periods has been continuously shortening. + Combining GEDI data with shifting cultivation maps and area estimates, carbon emissions from shifting cultivation can be quantified. + The net carbon emissions from shifting cultivation declined in the past but increased recently. + This study not only supports REDD+ reporting for Laos but also demonstrates a method of tracking carbon dynamics in shifting cultivation landscapes. +

+

+ USGS Landsat Science Team Program for Better Use of the Landsat Temporal Domain: Monitoring Land Cover Type, Condition and Change (Grant Number: G12PC00070). + The authors are grateful to the editors and two anonymous reviewers for their insightful and constructive comments, which greatly helped to improve this paper. +

+
+
+ Figure 1 . + + +
+

+ Figure 1. + An example of active shifting cultivation in previously cultivated land (location: 20°7′13″ N, 101°6′59″ E). + The shifting cultivation events in 2005 and 2018 were categorized as Previous Shifting Cultivation because shifting cultivation first occurred in 1991. + This place is also Active Shifting Cultivation because the latest shifting cultivation event occurred in 2018. + (a) Landsat time series. + (b) CCDC-SMA model fits. + Different colors show different segments and the model breaks in 1991, 2005, and 2018 show slash-and-burn events. + The colored lines show the seasonality of the forest and the drops between lines show slash-and-burn events. + (c) Landsat images and high-resolution images on Google Earth. + In the Landsat images (red-green-blue), the yellow squares show the pixel location. + In the high-resolution image, the white point shows the pixel location. +

+
+
+ +
+
+ Figure 2 .Figure 3 . + + +
+

+ Figure 2. + An example of reference data (location: 20°15′8″ N, 100°39′51″ E). + This shifting cultivation is New Shifting Cultivation. + The time series shows that no shifting cultivation occurred before 2019. + The new shifting cultivation event occurred in 2019 and it can be verified by examining high-resolution images and Landsat images. + (In the time series figure, the blue points are Landsat observations. + In the Landsat images (red-green-blue), the yellow squares show the pixel location. + In the high-resolution image, the white point shows the pixel location). +

+
+
+ +
+
+ Figure 4 . + + +
+

+ Figure 4. Area estimates and uncertainties of New Shifting Cultivation and total (new and previous) shifting cultivation by 5 year intervals. The y-axis shows the area as a proportion of the total area of Laos (230 405 km²). + Any pixel that was newly cultivated at any time within a specified 5 year period would be counted and added to the total height of the corresponding pink bars. +

+
+
+ +
+
+ Figure 5 . + + +
+

+ Figure 5. Annual proportion of slash-and-burn areas in previously and newly cultivated regions. +

+
+
+ +
+
+ Figure 6 . + + +
+

+ Figure 6. + Average fallow length by year calculated from sample interpretation. +

+
+
+ +
+
+ Figure 7 . + + +
+

+ Figure 7. Growth curve of aboveground biomass density. +

+
+
+ +
+
+ +
+
+ Figure 8 . + + +
+

+ Figure 8. Spatially explicit emission (+)/removal (-) factors for different activities in a region (the background image is the high-resolution image): (a) fallow land -> fallow land; (b) fallow land -> cleared land; (c) cleared land -> fallow land; (d) total of (a)-(c). +

+
+
+ +
+
+ Figure 9 . + + +
+

+ Figure 9. Carbon dynamics by period. +

+
+
+ +
+
+ +
+
+ Table 1 . + + +
+

+ Activity classes, definitions, and carbon emission/removal factors for each 5 year period (CF: conversion factor to convert biomass to carbon equivalents, CF = 0.5). +

+
+
+ + + Activity class + Definition + Emission/removal factors + + + Intact forest -> shifting cultivation + No shifting cultivation before. Previous intact + Biomass of forest before + + + (New Shifting Cultivation) + forests began to be used for new shifting + new shifting + + + + cultivation. + cultivation × CF (75.95 + + + + + Mg C ha -1 ) + + + Fallow land -> fallow land + Shifting cultivation occurred before. The start + (Fallow land biomass in the + + + + and end land cover were both fallow lands. + start -fallow land biomass + + + + + in the end) × CF + + + Fallow land -> cleared land + In previously cultivated land, fallow land became + (Fallow land + + + + cleared land. + biomass -cleared land + + + + + biomass) × CF + + + Cleared land -> fallow land + In previously cultivated land, cleared land became + (Cleared land + + + + fallow land. + biomass -fallow land + + + + + biomass) × CF + + + Cleared land -> cleared land + In previously cultivated land, cleared land became + + + + + cleared land. + + +
+ +

+ Zeroand other activity classes in table 1 were estimated from the maps. This is because the sampling-based area estimates of New Shifting Cultivation adjust for errors in mapping and are more accurate than pixel-counting from the maps (Olofsson et al 2013, 2014). + The area estimates of New Shifting Cultivation were calculated by 5-year periods with low uncertainty. + For other activity classes, it is difficult to get area estimates from the reference data while including the dynamics of biomass of fallow land, and thus we used a spatially explicit method. + In table 1, the biomass of the forest before disturbance was the biomass of Intact Forest estimated from GEDI. + The biomass of fallow land was estimated from the growth curve developed from GEDI based on years since disturbance. + Years since disturbance for each pixel was obtained from the annual maps of shifting cultivation. + The cleared land biomass was estimated as the biomass of non-forest by the Department of Forestry (2020) based on field surveys. + The emission factor of New Shifting Cultivation is 75.95 + Mg C ha -1 . + The emission factor of Cleared land -> Cleared land is zero. +

+
+
+
+ Table 2 . + + +
+

+ Area of different land use activities for each period (5 years). +

+
+
+ + + Area (ha) + 2001-2005 + 2006-2010 + 2011-2015 + 2016-2020 + + + Fallow land -> fallow land + 2379 847 + 3809 008 + 5213 561 + 6009 880 + + + Fallow land -> cleared land + 226 240 + 361 992 + 397 236 + 630 467 + + + Cleared land -> fallow land + 441 757 + 768 342 + 748 692 + 696 501 + + + New shifting cultivation + 1198 106 + 806 418 + 714 256 + 1036 823 + +
+
+
+ Table 3 . + + +
+

+ The country-average emission or removal factors for each period (5 years). + The original emission or removal factors except for new shifting cultivation are spatially explicit. + This table shows the country averages of the spatially explicit emission or removal factors. +

+
+
+ + + Average emission/removal factors (Mg C/ha -1 ) + 2001-2005 + 2006-2010 + 2011-2015 + 2016-2020 + + + Fallow land -> fallow land + -8.06 + -7.57 + -5.65 + -1.56 + + + Fallow land -> cleared land + 18.70 + 19.26 + 23.58 + 26.10 + + + Cleared land -> fallow land + -23.14 + -24.28 + -23.67 + -21.36 + + + New shifting cultivation + 75.95 + 75.95 + 75.95 + 75.95 + +
+
+
+ Table 4 . + + +
+

+ Carbon emissions (+) and removals (-) of different activities for each period (5 years). +

+
+
+ + + Carbon emission/removal + + + + + + + (Mg C) + 2001-2005 + 2006-2010 + 2011-2015 + 2016-2020 + + + Fallow land -> fallow + -19 175 009 + -28 833 216 + -29 440 602 + -9348 118 + + + land + + + + + + + Fallow land -> cleared + 4230 290 + 6970 956 + 9366 236 + 16 452 893 + + + land + + + + + + + Cleared land -> fallow + -10 222 046 + -18 657 539 + -17 717 827 + -14 879 752 + + + land + + + + + + + New shifting cultivation + 90 996 151 + 61 247 409 + 54 247 705 + 78 746 669 + + + Period total (net + 65 829 387 + 20 727 610 + 16 455 512 + 70 971 692 + + + emission/removal) + + + + + + + Annual average + 13 165 877 + 4145 522 + 3291 103 + 14 194 339 + +
+
+
+ +
+
Table 2 .Area of difference land use activities for each period (5 years).Area (ha)2001-20052006-20102011-20152016-2020Fallow land -> fallow land2379 8473809 0085213 5616009 880Fallow land -> cleared land226 240361 992397 236630 467Cleared land -> fallow land441 757768 342748 692696 501New shifting cultivation1198 106806 418714 2561036 823
+
Table 3 .The country-average emissions or removal factors for each period (5 years). The original emission or removal factors except for new shifting cultivation are spatially explicit. This table shows the country averages of the spatial explicit emission or removal factors.Average emission/removal factors (Mg C/ha -1 )2001-20052006-20102011-20152016-2020Fallow land -> fallow land-8.06-7.57-5.65-1.56Fallow land -> cleared land18.7019.2623.5826.10Cleared land -> fallow land-23.14-24.28-23.67-21.36New shifting cultivation75.9575.9575.9575.95
+
Table 4 .Carbon emissions (+) and removals (-) of different activities for each period (5 years).Carbon emission/removal(Mg C)2001-20052006-20102011-20152016-2020Fallow land -> fallow-19 175 009-28 833 216-29 440 602-9348 118landFallow land -> cleared4230 2906970 9569366 23616 452 893landCleared land -> fallow-10 222 046-18 657 539-17 717 827-14 879 752landNew shifting cultivation90 996 15161 247 40954 247 70578 746 669Period total (net65 829 38720 727 61016 455 51270 971 692emission/removal)Annual average13 165 8774145 5223291 10314 194 339
+
+
Table 4 .

Evaluation

basebase+curationΔ<class>1646173286<material>69437580637<me_method>1883193451<pressure>27436187<tc>37414269528<tcValue>10991556457Total15586174321846

scores (P: precision, R: recall, F1: F1-score) aggregated by experience (MS: master student, PD: PhD student, SR: senior researcher). Each person corrected 10 documents.

+
Table 3 .

Evaluation scores (P: precision, R: recall, F1: F1-score) between the curation using the SuperCon 2 interface (Interface) and the traditional method of reading the PDF document (PDF document.).

MethodP (%)R (%)F1%)# docsPDF document87.8345.6152.6715Interface93.3892.5192.0215
+
Table 5 .

Evaluation scores (P: precision, R: recall, F1: F1-score) listed by experience (MS: master student, PD: PhD student, SR: senior researcher), and method (PDF document, interface).

ExperienceMethodP (%)R (%)F1%)# docs# pagesMSPDF Document94.5836.5548.67646Interface83.1995.8388.25450PDPDF Document70.0048.5150.78549Interface96.6782.8688.11551SRPDF Document100.0055.5661.03451Interface97.4298.3397.78645
+
Table A2 .

Evaluation scores obtained for each document and method (I: interface, P: PDF) combination. TP: true positive, FP: false positive, FN: false negative. P: precision, R: recall, F1: F1-score.

Document ID# pagesMethod# TP# FP# FNPRF1Senior Researcher (SR)0454e07f644I600100.00100.00100.0000c32076f413P800100.00100.00100.000c7d3163ea9I131092.86100.0096.300da5febabf11P801100.0088.8994.12001233358113I1100100.00100.00100.000aa1b3161f5I901100.0090.0094.740021fd339f14P408100.0033.3350.00039105663f9I111091.67100.0095.6502c4f0012713P003100.000.000.00021c4131725I1500100.00100.00100.00PhD Student (PS)02bf1b3db97I502100.0071.4383.3300b50fc0a811P207100.0022.2236.3602cbc588194I403100.0057.1472.73044939701d12P402100.0066.6780.0008e1cb8f4f16I51183.3385.7184.510454e07f644P0150.0016.670.0000c32076f413I800100.00100.00100.000c7d3163ea9P905100.0064.2978.260da5febabf11I900100.00100.00100.00001233358113P44350.0072.7359.26Master Student (MS)0aa1b3161f5P109100.0010.0018.180021fd339f14I123380.00100.0088.89039105663f9P41780.0041.6754.7902c4f0012713I31175.00100.0085.71021c4131725P71787.5053.3366.2702bf1b3db97P205100.0028.5744.4400b50fc0a811I72077.78100.0087.5002cbc588194P502100.0071.4383.33044939701d12I501100.0083.3390.9108e1cb8f4f16P106100.0014.2925.00
+

Sci. Technol.Adv.Mater.Meth. 3 (2023) 2 L. FOPPIANO et al.

+

Sci. Technol.Adv.Mater.Meth. 3 (2023) 3 L. FOPPIANO et al.

+

Sci. Technol.Adv.Mater.Meth. 3 (2023) 5 L. FOPPIANO et al.

+

Sci. Technol.Adv.Mater.Meth. 3 (2023) 6 L. FOPPIANO et al.

+

Sci. Technol. Adv. Mater. Meth. 3 (2023) 9 L. FOPPIANO et al.

+

Sci. Technol.Adv.Mater.Meth. 3 (2023) 10 L. FOPPIANO et al.

+

Sci. Technol.Adv.Mater.Meth. 3 (2023) 12 L. FOPPIANO et al.

+ + + +
+
Acknowledgements

Our warmest thanks to Patrice Lopez, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions. We thank Pedro Baptista de Castro for his support during this work. Special thanks to Erina Fujita for useful tips on the manuscript.

+
+
+

Materials Modelling Group, Data-driven Materials Research Field, Centre for Basic Research on Materials, NIMS, 1-1 Namiki, Tsukuba, Ibaraki 305-0044, Japan

+
+
+
Funding

This work was partly supported by MEXT Program: Data Creation and Utilization-Type Material Research and Development Project (Digital Transformation Initiative Center for Magnetic Materials) Grant Number [JPMXP1122715503].

+
+ + + 305-0044 + + +
+
Disclosure statement

No potential conflict of interest was reported by the author(s).

+
Author contribution

LF wrote the manuscript and KT helped with the editing. LF and POS discussed the ML results and experiments. LF implemented the workflow as a standalone service, and TM wrote the front end of the user interface. LF designed the user interface experiment with KT, TT and WS as curators. KT led the materials-science work on the data with CS, TT and WS. KT, TA, YT and MI revised the paper. YT and MI supervised the work of the respective teams.

+
Appendix A. Evaluation

Table A1. Timetable recording the time spent for each of the 15 articles. Each row indicates the time and the event (Start, Finish) from each of the curators: master student (MS), PhD student (PD), and senior researcher (SR). Duration is expressed in minutes.

+
+ + + + + + Automatic extraction of materials and properties from superconductors scientific literature + + LFoppiano + + + PBCastro + + + POSuarez + + 10.1080/27660400.2022.2153633 + + + Sci Technol Adv Mater + + 3 + 1 + 2153633 + 2023 + + + + + + + Materials discovery with machine learning and knowledge discovery + + ONOliveira + + + MJOliveira + + 10.3389/fchem.2022.930369 + + + Front Chem + + 10 + 10 + 2022 + + + + + + + Commentary: the materials project: a materials genome approach to accelerating materials innovation + + AJain + + + SPOng + + + GHautier + + 10.1063/1.4812323 + + + APL Mater + + 1 + 1 + 11002 + 2013 + + + + + + + Aflow: an automatic framework for high-throughput materials discovery + + SCurtarolo + + + WSetyawan + + + GLHart + + + + + Comput Mater Sci + + 58 + + 2012 + + + + + + + The nomad laboratory: from data sharing to artificial intelligence + + CDraxl + + + MScheffler + + 10.1088/2515-7639/ab13bb + + + J Phys Mater + + 2 + 3 + 36001 + 2019 + + + + + + + Global publication productivity in materials science research: a scientometric analysis + + TPratheepan + + + + + Indian J Inf Sources Serv + + 9 + 1 + + 2019 Feb + + + + + + + The PAULING FILE project and materials platform for data science: from big data toward materials genome + + EBlokhin + + + PVillars + + 10.1007/978-3-319-42913-7_62-1 + + 2018 + Springer International Publishing + + Cham + + + + + + + Structuring superconductor data with ontology: reproducing historical datasets as knowledge bases + + MIshii + + + KSakamoto + + 10.1080/27660400.2023.2223051 + + + Sci Technol Adv Mater + + 3 + 1 + 2223051 + 2023 + + + + + + + Predicting new superconductors and their critical temperatures using machine learning + + BRoter + + + SDordevic + + 10.1016/j.physc.2020.1353689 + + + Phys C + + 575 + 1353689 + 2020 + + + + + + + Machine learning modeling of superconducting critical temperature + + VStanev + + + COses + + + AKusne + + 10.1038/s41524-018-0085-8 + + + Npj Comput Mater + + 4 + 1 
+ 4 + 2017 + + + + + + + Machine-learning approach for discovery of conventional superconductors + + HTran + + + TNVu + + arXiv:221103265. 2022 + + + arXiv preprint + + + + + Deep learning model for finding new superconductors + + TKonno + + + HKurokawa + + + FNabeshima + + 10.1103/PhysRevB.103.014509 + + + Phys Rev B + + 103 + 1 + 14509 + 2021 + + + + + + + The INCEpTION platform: machine-assisted and knowledge-oriented interactive annotation + + JCKlie + + + MBugert + + + BBoullosa + + + + + Proceedings of the 27th International Conference on Computational Linguistics: System Demonstrations + the 27th International Conference on Computational Linguistics: System Demonstrations
Santa Fe, New Mexico
+ + 2018 + + +
+
+ + + + Doccano: text annotation tool for human + + HNakayama + + + TKubo + + + JKamura + + + + + Software + + 2018 + + + + + + + Python materials genomics pymatgen: a robust open-source python library for materials analysis + + SPOng + + + WDRichards + + + AJain + + 10.1016/j.commatsci.2012.10.028 + + + Comput Mater Sci + + 68 + 2 + + 2013 + + + + + + + Text-mined dataset of inorganic materials synthesis recipes. Sci Data + + OKononova + + + HHuo + + + THe + + 10.1038/s41597-019-0224-1 + 41597-019-0224-1 + + + 2019 Oct + 6 + 203 + + + + + + + Label studio: data labeling software; 2020-2022 + + MTkachenko + + + MMalyuk + + + AHolmanyuk + + + + + Open source software + + + + + + + Supermat: construction of a linked annotated dataset from superconductors-related publications + + LFoppiano + + + SDieb + + + ASuzuki + + 10.1080/27660400.2021.1918396 + + + Sci Technol Adv Mater: Methods + + 1 + 1 + + 2021 + + + + + + + SciBERT: a pretrained language model for scientific text + + IBeltagy + + + KLo + + + ACohan + + + + + Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing + the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing
Hong Kong; China
+ + Association for Computational Linguistics + Nov. 2019 + + +
+
+ + + + + <ptr target="https://github.com/kermitt2/delft"/> + </analytic> + <monogr> + <title level="j">DeLFT contributors. Delft + + 2018-2023 + + + + + + + Overcoming catastrophic forgetting in neural networks + + JKirkpatrick + + + RPascanu + + + NCRabinowitz + + abs/1612.00796 + + + + CoRr + + 2016 + + + + + + + + <author> + <persName><forename type="first">G</forename><surname>Contributors</surname></persName> + </author> + <author> + <persName><surname>Grobid</surname></persName> + </author> + <ptr target="https://github.com/kermitt2/grobid"/> + <imprint> + <date type="published" when="2008">2008 -2023</date> + </imprint> + </monogr> +</biblStruct> + + </listBibl> + </div> + </back> + </text> +</TEI> \ No newline at end of file diff --git a/src/test/resources/org/pub2tei/document/document2.tei.xml b/src/test/resources/org/pub2tei/document/document2.tei.xml new file mode 100644 index 0000000..2ab3daa --- /dev/null +++ b/src/test/resources/org/pub2tei/document/document2.tei.xml @@ -0,0 +1,760 @@ +<?xml version="1.0" encoding="UTF-8"?> +<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd" xmlns:xlink="http://www.w3.org/1999/xlink"> + <teiHeader xml:lang="en"> + <fileDesc> + <titleStmt> + <title level="a" type="main">Science and Technology of Advanced Materials: Methods + + MEXT + + + unknown + + + Utilization-Type Material Research and Development Project (Digital Transformation Initiative Center for Magnetic Materials) + + + + + + + + 14 Dec 2023. + + + + + + LucaFoppiano + 0000-0002-6114-6164 + + Materials Modelling Group + Centre for Basic Research on Materials + Data-driven Materials Research Field + NIMS +
+ Tsukuba + Japan; +
+
+ + Knowledge and Data Engineering + Centre for Computational Sciences + University of Tsukuba +
+ Tsukuba + Japan; +
+
+
+ + TomoyaMato + 0000-0002-0918-6468 + + Materials Modelling Group + Centre for Basic Research on Materials + Data-driven Materials Research Field + NIMS +
+ Tsukuba + Japan; +
+
+
+ + KenseiTerashima + 0000-0003-0375-3043 + + Frontier Superconducting Materials Group + MANA + NIMS +
+ Tsukuba + Japan; +
+
+
+ + PedroOrtiz Suarez + 0000-0003-0343-8852 + + GmbH DFKI + CONTACT Luca Foppiano +
+ Luca Foppiano http://orcid.org/0000-0002-6114-6164 Tomoya Mato http://orcid.org/0000-0002-0918-6468 Kensei Terashima http://orcid.org 3043 Pedro Ortiz Suarez http://orcid.org/0000-0003-0343- 8852 Wei-Sheng Wang http://orcid.org/0009-0001-3572-5736 Toshiyuki Amagasa http://orcid.org/0000-0003-0595- 2230 Yoshihiko Takano http://orcid.org/0000-0002-1541- 6928 Masashi Ishii + 0000-0003-0375 + Berlin + DE +
+
+
+ + TakuTou + + Frontier Superconducting Materials Group + MANA + NIMS +
+ Tsukuba + Japan; +
+
+
+ + ChikakoSakai + + Frontier Superconducting Materials Group + MANA + NIMS +
+ Tsukuba + Japan; +
+
+
+ + Wei-ShengWang + 0009-0001-3572-5736 + + Frontier Superconducting Materials Group + MANA + NIMS +
+ Tsukuba + Japan; +
+
+
+ + ToshiyukiAmagasa + 0000-0003-0595-2230 + + Knowledge and Data Engineering + Centre for Computational Sciences + University of Tsukuba +
+ Tsukuba + Japan; +
+
+
+ + YoshihikoTakano + 0000-0002-1541-6928 + + Frontier Superconducting Materials Group + MANA + NIMS +
+ Tsukuba + Japan; +
+
+
+ + MasashiIshii + ishii.masashi@nims.go.jp + 0000-0003-0357-2832 + + Materials Modelling Group + Centre for Basic Research on Materials + Data-driven Materials Research Field + NIMS +
+ Tsukuba + Japan; +
+
+
+ + Masashi + + Science and Technology of Advanced Materials: Methods +
+ + Print + + 14 Dec 2023. + + + DCB0425EE18794E34CC3A3075E3E3975 + 10.1080/27660400.2023.2286219 + Received 8 September 2023 Revised 9 November 2023 Accepted 16 November 2023 +
+
+ + + + + GROBID - A machine learning software for extracting information from scholarly documents + + + + + + + + Materials informatics + superconductors + machine learning + database + TDM + + + +

We propose a semi-automatic staging area for efficiently building an accurate database of experimental physical properties of superconductors from literature, called SuperCon 2 , to enrich the existing manually-built superconductor database SuperCon. Here we report our curation interface (SuperCon 2 Interface) and a workflow managing the state transitions of each examined record, to validate the dataset of superconductors from PDF documents collected using Grobid-superconductors in a previous work. This curation workflow allows both automatic and manual operations, the former contains 'anomaly detection' that scans new data identifying outliers, and a 'training data collector' mechanism that collects training data examples based on manual corrections. Such a training data collection policy is effective in improving the machine-learning models with a reduced number of examples. For manual operations, the interface (SuperCon 2 interface) is developed to increase efficiency during manual correction by providing a smart interface and an enhanced PDF document viewer. We show that our interface significantly improves the curation quality by boosting precision and recall as compared with the traditional 'manual correction'. Our semi-automatic approach would provide a solution for achieving a reliable database with text-data mining of scientific documents.

+
IMPACT STATEMENT

This work makes a contribution to the realms of materials informatics and superconductors research, achieved through the evolution and update of SuperCon. We provide results from experiments that support the utilisation of computational analysis and machine learning for collecting experimental data from scientific articles.

+
+
+ + + +
Introduction

The emergence of new methodologies using machine learning for materials exploration has given rise to a growing research area called materials informatics (MI) [1,2]. This field leverages the knowledge of the materials data accumulated in the past to efficiently screen candidates of the materials with desired properties. As a matter of course, such an approach requires a larger amount of material-related data for training models. Researchers have been developing large aggregated databases of physical properties generated by first-principles calculations based on Density Functional Theory (DFT), such as Materials Project [3], JARVIS (Joint Automated Repository for Various Integrated Simulations) [4], NOMAD (Novel Materials Discovery) [5], that have served as a strong driving force for the development of materials informatics. Using DFT data for machine learning (ML) in materials science has become popular since, in principle, it allows researchers to simulate and obtain various types of physical properties of the target materials only by knowing the crystal structures of the subjects. Those DFT codes are designed to reproduce/simulate the physical properties that should be observed by experiments in reality. Nonetheless, caution must be exercised while utilising these computed figures for constructing ML models aimed at steering experiments. This caution arises due to the potential lack of validity in their predictions when dealing with specific simplifications of the interactions between atoms and electrons in solids, such as electron-electron Coulomb correlation, spin-orbit coupling, and similar factors.

On the contrary, accumulated datasets of experimental data from scientific publications are still scarce, despite abundant publication availability, and exponential growth in materials science [6]. Currently, only a few limited resources exist, such as the Pauling File [7] and SuperCon [8], necessitating reliance on manual extraction methods. This scarcity can be attributed to inadequate infrastructure and a shortage of expertise in computer science within the materials science field.

The SuperCon database was built manually from 1987 [8] by the National Institute for Materials Science (NIMS) in Japan and it is considered a reliable source of experimental data on superconductors [9][10][11][12]. However, the updates of SuperCon have become increasingly challenging due to the high publication rate. In response to the need for a more efficient approach to sustain productivity, we embarked on the development of an automated system for extracting material and property information from the text contained in relevant scientific publications. This automated process enabled the rapid creation of 'SuperCon 2 Database', a comprehensive database of superconductors containing around 40,000 entries, within an operational duration of just a few days [1]. Matching the level of quality seen in SuperCon while simultaneously automating the extraction of organised data can be achieved with a properly designed curation process. We use the term curation to describe the overall process of reviewing and validating database records, while correction refers to the specific action of altering the values of one or more properties within an individual record. At the moment of writing this article, we are not aware of any other curation tool focusing on structured databases of extracted information. There are several tools for data annotation, such as Inception [13], and Doccano [14] which concentrate on text labelling and classification.

In this work, we designed and developed a workflow with a user interface, 'SuperCon 2 Interface', crafted to produce structured data of superior quality and efficiency to the one obtained by the 'traditional' manual approach consisting of reading documents and noting records, usually on an Excel file. We developed this framework around the specific use case of SuperCon; however, our goal is for it to be adaptable to alternative data frameworks.

Our contributions can be summarised as follows:

• We developed a workflow and a user interface that allow the curation of a machine-collected database. We demonstrate that using it for data correction resulted in higher quality than the 'traditional' (manual) approach. In the subsequent sections, Section 2 describes the curation workflow and Section 3 the user interface built on top of it. Finally, we discuss our evaluation experiments and results in Section 4.

+
Curation workflow

The curation of the SuperCon 2 Database acts as a workflow where user actions result in database records state transitions (Figure 1). Allowed manual actions include a) mark as valid (validation) when a record is considered correct or corrected by someone else. When a record is not valid, users can: b) mark as invalid when considered 'potentially' invalid (or the curator is not confident), c) perform manual correction to update it according to the information from the original PDF document, and d) remove the record when it was not supposed to be extracted.

Besides manual operations from users, this workflow also supports automatic actions: 'anomaly detection' for pre-screening records (Section 2.2) and the 'training data collector' for accumulating training data for improving ML models (Section 2.3).

Although only the most recent version of a record can be viewed on this system, the correction history is recorded (Section 3.3).

+
Workflow control

The workflow state is determined by the 'curation status' (Section 2.1.1), the user action, and the error type (Section 2.1.2).

+
Curation status

The curation status (Figure 1) is defined by type of action, manual or automatic, and status, which can assume the following values:

• new: default status when a new record is created.

• curated: the record has been amended manually.

• validated: the record was manually marked as valid.

• invalid: the record is wrong or inappropriate for the situation (e.g. T m or T curie extracted as superconducting critical temperature).

• obsolete: the record has been updated and the updated values are stored in a new record (internal status 1 ). • removed: the record has been removed by a curator (internal status).

+
Error types

We first introduced error type in [1] and extended their scope in this work to consider data curation and anomaly detection. Users are required to select one Error Type at every record update or removal. This information is stored in the 'original' record and can be different at every record modification. The error type values can be summarised as follows: • Composition resolution: The exact composition cannot be resolved (e.g. the stoichiometric values cannot be resolved).

• Value resolution: The extracted formula contains variables that cannot be resolved, even after having read the paper. This includes when data is from tables • Anomaly detection: The data has been modified by anomaly detection, which facilitates their retrieval from the interface. • Curation amends: The curator is updating the data which does not present issues due to the automatic system.

+
Anomaly detection

Anomaly detection is the process of identifying unusual events or patterns in data. In our context, this means identifying data that are greatly different from the expected values. This post-process was introduced in a limited scope to draw attention to certain cases during the curation.

The anomaly detection uses a rule-based approach and marks any record that matches the following conditions

• the extracted T c is greater than room temperature (273 K), negative, or contains invalid characters and cannot be parsed (e.g. '41]') • the chemical formula cannot be processed by an ensemble composition parser that combines Pymatgen [15], and text2chem [16] • the extracted applied pressure cannot be parsed or falls outside the range 0-250 GPa.

Records identified as anomalies have status 'invalid' and error type 'anomaly detection' for easy identification. Since this process may find false positives, its output requires validation from curators. For example, in certain contexts, T c values above room temperature or applied pressure up to 500 GPa may be valid in researchers' hypotheses, calculations, or simulated predictions.

We ran the anomaly detection on the full SuperCon 2 Database (40324 records [1]). The anomaly detection identified 1506 records with invalid T c , 5021 records with an incomplete chemical formula, 304 records with invalid applied pressure, and 1440 materials linked to multiple T c values. Further analysis and cross-references with contrasting information may be added in future.

+
Automatic training data collector

The curation process is a valuable endeavour demanding significant knowledge and human effort. To maximise the use of this time for collecting as much information as possible, we integrated an automatic procedure in the curation process that, for every correction, accumulates the related data examples that can be used to improve the underlying ML models.

+
Training data collection

In the event of a correction (update, removal) in a database record, this process retrieves the corresponding raw data: the text passage, the recognised entities (spans), and the layout tokens information. This information is sufficient to be exported as training examples, which can be examined and corrected, and fed back to the ML model.

+
Training data management

We designed a specific page of the interface (Section 3) to manage the collected data (Figure 2) in which each row corresponds to a training example composed of the decorated text showing the identified entities, the document identifier, and the status. The users can examine the data, delete it, send it to the annotation tool to be corrected, and then export it. We integrated our interface with Label-studio [17] for the correction of the collected training examples. Label-studio is an open-source, Python-based, and modern interface supporting many different TDM tasks (NER, topic modelling, image recognition, etc.).

+
Curation interface

The workflow is operated through the user interface, which offers several key features to facilitate the data curation process (Figure 1). It provides a comprehensive view of materials and their related properties as a table which includes search, filtering, and sorting functionality (Figure 3). The detailed schema, including examples, is reported in our previous work [1].

During the curation process, it is often necessary to switch back and forth between the database record and the related context in the paper (the related paragraph or sentence). Our interface provides a viewer for individual documents, which visualises in the same window a table with the extracted records and the original PDF document decorated with annotations that identify the extracted materials and properties (Figure 4).

+
Manual curation approach

In this section, we discuss our strategy concerning manual curation, which is still indispensable for developing high-quality structures.

We selected curators from domain experts in the field, to certify sufficient data quality. Nevertheless, as confirmed from our experiment in Section 4.3, the experience of each individual may have an impact on the final result. We followed two principles to guarantee robustness in the curation process. First, we built solid curation documentation as a form of example-driven guidelines with an iterative approach we first introduced in [18]. Then, we used a double-round validation approach, in which the data was initially corrected by one person, and validated in a second round, by a different individual.

+
Curation guidelines

The guidelines consist mainly of two parts: the general principles and the correction rules with examples of solutions. The guidelines are designed to provide general information applied to corrections and very basic explanations containing illustrations for a faster understanding (e.g. the meaning of the colours of the annotations).

Differently from our previous work [18], these guidelines are divided into examples for different scenarios based on the error types mentioned in Section 2.1.2. Each example described the initial record, its context, the expected corrected record and a brief explanation, as illustrated in Figure 5.

+
Curation and processing logs

The Supercon 2 interface gives access to information regarding the ingestion (processing log) and the . Each row contains one potential training data example. Each example is composed of a sentence and its extracted entities (highlighted in colour) with potential annotation mistakes that need to be corrected using an external tool: we used label-studio [17]. The column 'status' indicate whether the example has been sent or not to the external tool. curation process (curation log). The processing log is filled up when the new data is ingested, it was built to have minimal functions able to explain why certain documents haven't been processed (Figure 6 top). For example, sometimes documents fail because they don't contain any text (image PDF documents) or they are too big (more than 100 pages).

The curation log provides a view of what, when and how a record has been corrected (Figure 6 bottom).

+
Results and evaluation

In this section, we illustrate the experiments we have run to evaluate our work. The evaluation is composed of three sets of results. The anomaly detection rejection rate (Section 4.1) indicates how many anomalies were rejected by curators after validation. Then, we demonstrate that the training data automatically selected contributed to improving the ML model with a small set of examples (Section 4.2). Finally, we evaluated the quality of the data extraction using the interface (and the semi-automatic TDM process) against the classical method of reading the PDF articles and noting the experimental information in an Excel file. In Section 4.3 we find out that using the interface improves the quality of the curated data by reducing missing experimental data.

+
Anomaly detection rejection rate

We evaluated the anomaly detection by observing the 'rejection rate' which consists of the number of detected anomalies that were rejected by human validation. Running the anomaly detection on a database subset with 667 records, it found 17 anomalies in T c , 1 anomaly in applied pressure, and 16 anomalies in the chemical formulas. Curators examined each reported record and rejected 4 (23%) anomalies in T c , 6 anomalies (37%) in chemical formulas and 0 anomalies in applied pressure. This indicates an appropriate low rate of false positives although a study with a larger dataset might be necessary.

+
Training data generation

We selected around 400 records in the Supercon 2 Database that were marked as invalid by the anomaly detection process and we corrected them following the curation guidelines (Section 3.2). Then, we examined the corresponding training data corrected by the interface (Section 2.3) and obtained a set of 352 training data examples for our ML models. We call the obtained dataset curation to be distinguished from the original SuperMat dataset which is referred to as base.

We prepared our experiment using SciBERT [19] that we fine-tuned for our downstream task as in [1]. We trained five models that we evaluated using a fixed holdout dataset from SuperMat averaging the results to smooth out the fluctuations. We use the DeLFT (Deep Learning For Text) [20] library for training, evaluating, and managing the models for prediction. A model can be trained with two different strategies:

(1) 'from scratch': when the model is initialised randomly. We denote this strategy with an (s). (2) 'incremental': when the initial model weights are taken from an already existing model. We denote this strategy with an (i).

The latter can be seen as a way to 'continue' the training from a specific checkpoint. We thus define three different training protocols: We merge 'curation' with the base dataset because the curation dataset is very small compared to 'base', and we want to avoid catastrophic forgetting [21] or overfitting. The trained models are then tested using a fixed holdout dataset that we designed in our previous work [1] and the evaluation scores are shown in Table 1.

This experiment demonstrates that with only 352 examples (2% of the SuperMat dataset) comprising 1846 additional entities (11% of the entities from the SuperMat dataset) (Table 2), we obtain an improvement of F1-score from 76.67% 2 to values between Table 1. F1-score from the evaluation of the fine-tuned SciBERT models. The training is performed with three different approaches. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. s indicate 'training from scratch', while i indicate 'incremental training'. The evaluation is performed using the same holdout dataset from SuperMat [18]. The results are averaged over five runs or train and evaluation. 77.44% (+0.77) and 77.48% (+0.81) for (base+curation)(s) and base(s)+(base+curation)(i), respectively. This experiment gives interesting insight relative to the positive impact on the way we select the training data. However, there are some limitations: the curation dataset is small compared to the base dataset. This issue could be verified by correcting all the available training data, repeating this experiment, and studying the interpolation between the size of the two datasets and the obtained evaluation scores. A second limitation is that the hyperparameters we chose for our model, in particular, the learning rate and batch size could be still better tuned to obtain better results with the second and third training protocols.

+
Data quality

We conducted an experiment to evaluate the effectiveness and accuracy of data curation using two methods: a) the user interface (interface), and b) the 'traditional' manual approach consisting of reading PDF documents and populating an Excel file (PDF documents).

We selected a dataset of 15 papers, which we assigned to three curators - a senior researcher (SR), a PhD student (PS), and a master's student (MS). Each curator received 10 papers: half to be corrected with the interface and half with the PDF Document method. Overall, each pair of curators had five papers in common which they had to process using opposite methods. For instance, if curator A receives paper 1 to be corrected with the interface, curator B, who receives the same paper 1, will correct it with the PDF document method. After curation, a fourth individual manually reviewed the curated content. The raw data is available in Tables A1 and A2.

We evaluated the curation considering a double perspective: time and correctness. Time was calculated as the accumulated minutes required using each method. Correctness was assessed using standard measures such as precision, recall, and the F1-score. Precision measures the accuracy of the extracted information, while recall assesses the ability to capture all expected information. The F1-score is the harmonic mean of precision and recall.

+
Discussion

Overall, both methods required the same accumulated time: 185 minutes using the interface and 184 minutes using the PDF Document method. When the experiment was carried out, not all the curators were familiar with the interface method. Although they had access to the user documentation, they had to get acquainted with the user interface, thus the accumulated 185 minutes included such activities.

We examined the quality of the extracted data and we observed an improvement of + 5.55% in precision and a substantial + 46.69% in recall when using the interface as compared with the PDF Document method (Table 3). The F1-score improved by 39.35%.

The disparity in experience significantly influenced the accuracy of curation, particularly in terms of high-level skills. Senior researchers consistently achieved an average F1-Score approximately 13% higher than other curators (see Table 4). Furthermore, we observed a modest improvement between master's students and PhD students. These findings indicate also that for large-scale projects, employing master's students instead of PhD students may be a more cost-effective choice. Thus, it may be sufficient to use only a few senior researchers for the second round of validation (Section 3.1).

Finally, the collected data suggest that all three curators had overall more corrected results by using the interface as illustrated in Table 5.

The results of this experiment confirmed that our curation interface and workflow significantly improved the quality of the extracted data, with an astonishing improvement in recall, thus preventing curators from overlooking important information.

+
Code availability

This work is available at https://github.com/lfoppiano/supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table 2. Data support, the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected.

+
Conclusions

We built a semi-automatic staging area, called SuperCon 2 , to validate efficiently new experimental records automatically collected from superconductor research articles (SuperCon 2 Database [1]) before they are ingested into the existing, manually-built database of superconductors, SuperCon [8]. The system provides a curation workflow and a user interface (SuperCon 2 Interface) tailored to efficiently support domain experts in data correction and validation with fast context switching and an enhanced PDF viewer. Under the hood, the workflow ran 'anomaly detection' to automatically identify outliers and a 'training data collector' based on human corrections, to efficiently accumulate training data to be fed back to the ML model. Compared with the traditional manual approach of reading PDF documents and extracting information in an Excel file, SuperCon 2 significantly improves the curation quality by approximately + 6% and + 47% for precision and recall, respectively. In future, this work can be expanded to support other materials science domains such as magnetic materials, spintronic and thermoelectric research, and to extend the evaluation to a larger [22] dataset.

+
Notes

1. 'internal status' indicates that their records should be hidden in the interface. 2. In our previous work [1] we reported 77.03% F1-score. There is a slight decrease in absolute scores between DeLFT 0.2.8 and DeLFT 0.3.0. One cause may be the use of different hyperparameters in version 0.3.0 such as batch size and learning rate. However, the most probable cause could be the impact of using the Huggingface tokenizers library which is suffering from quality issues (https://github.com/kermitt2/delft/issues/150).

Figure 1 .Figure 1. Schema of the curation workflow. Each node has two properties: type and status (Section 2.1.1). Each edge indicates one action. The workflow starts on the left side of the figure. The new records begin with 'automatic, new'. Changes of state are triggered by automatic (Section 2.2) or manual operations (update, mark as valid, etc. Section 3.1) and results in changes of the properties in the node. Each combination of property values identifies each state. '(*)' indicates a transition for which the training data are collected (Section 2.3).
+
Figure 2 .Figure 2. Screenshot of the training data management page in the SuperCon 2 interface. Each row contains one potential training data example. Each example is composed of a sentence and its extracted entities (highlighted in colour) with potential annotation mistakes that need to be corrected using an external tool: we used label-studio[17]. The column 'status' indicate whether the example has been sent or not to the external tool.
+
Figure 3 .Figure 3. Screenshot of SuperCon 2 interface showing the database. Each row corresponds to one material-T c pair. On top, there are searches by attribute, sorting and other filtering operations. On the right there are curation controls (mark as valid, update, etc.). Records are grouped by document with alternating light yellow and white.
+
Figure 5 .Figure 5. Sample curation sheet from the curation guidelines. The sheet is composed of the following information: (a) Sample input data: a screenshot of the record from the 'SuperCon 2 interface', (b) Context represented by the related part of the annotated document referring to the record in exams. (c) The Motivation, describing the issue, (d) The Action to be taken, and the expected output.
+
Figure 4 .Figure 4. PDF document viewer showing an annotated document. The table on top is linked through the annotated entities. The user can navigate from the record to the exact point in the PDF, with a pointer (the red bulb light) identifying the context of the entities being examined.
+
( 1 )base(s): using the base dataset and training from scratch (s). (2) (base+curation)(s): using both the base and curation datasets and training from scratch (s). (3) base(s)+(base+curation)(i): Using the base dataset to train from scratch (s), and then continuing the training with the curation dataset (i).
+
Figure 6 .Figure 6. Top: Processing log, showing the output of each ingestion operation and the outcome with the detailed error that may have occurred. Bottom: Correction log, indicating each record, the number of updates, and the date/time of the last updates. By clicking on the 'record id', it is possible to visualise the latest record values.
+
+
c classification: The temperature is not correctly classified
+
Table 4 .Evaluationbasebase+curationΔ<class>1646173286<material>69437580637<me_method>1883193451<pressure>27436187<tc>37414269528<tcValue>10991556457Total15586174321846

scores (P: precision, R: recall, F1: F1-score) aggregated by experience (MS: master student, PD: PhD student, SR: senior researcher). Each person corrected 10 documents.

+
Table 3 .Evaluation scores (P: precision, R: recall, F1: F1-score) between the curation using the SuperCon 2 interface (Interface) and the traditional method of reading the PDF document (PDF document.).MethodP (%)R (%)F1%)# docsPDF document87.8345.6152.6715Interface93.3892.5192.0215
+
Table 5 .Evaluation scores (P: precision, R: recall, F1: F1-score) listed by experience (MS: master student, PD: PhD student, SR: senior researcher), and method (PDF document, interface).ExperienceMethodP (%)R (%)F1%)# docs# pagesMSPDF Document94.5836.5548.67646Interface83.1995.8388.25450PDPDF Document70.0048.5150.78549Interface96.6782.8688.11551SRPDF Document100.0055.5661.03451Interface97.4298.3397.78645
+
Table A2 .Evaluation scores obtained for each document and method (I: interface, P: PDF) combination. TP: true positive, FP: false positive, FN: false negative. P: precision, R: recall, F1: F1-score.Document ID# pagesMethod# TP# FP# FNPRF1Senior Researcher (SR)0454e07f644I600100.00100.00100.0000c32076f413P800100.00100.00100.000c7d3163ea9I131092.86100.0096.300da5febabf11P801100.0088.8994.12001233358113I1100100.00100.00100.000aa1b3161f5I901100.0090.0094.740021fd339f14P408100.0033.3350.00039105663f9I111091.67100.0095.6502c4f0012713P003100.000.000.00021c4131725I1500100.00100.00100.00PhD Student (PS)02bf1b3db97I502100.0071.4383.3300b50fc0a811P207100.0022.2236.3602cbc588194I403100.0057.1472.73044939701d12P402100.0066.6780.0008e1cb8f4f16I51183.3385.7184.510454e07f644P0150.0016.670.0000c32076f413I800100.00100.00100.000c7d3163ea9P905100.0064.2978.260da5febabf11I900100.00100.00100.00001233358113P44350.0072.7359.26Master Student (MS)0aa1b3161f5P109100.0010.0018.180021fd339f14I123380.00100.0088.89039105663f9P41780.0041.6754.7902c4f0012713I31175.00100.0085.71021c4131725P71787.5053.3366.2702bf1b3db97P205100.0028.5744.4400b50fc0a811I72077.78100.0087.5002cbc588194P502100.0071.4383.33044939701d12I501100.0083.3390.9108e1cb8f4f16P106100.0014.2925.00
+

Sci. Technol. Adv. Mater. Meth. 3 (2023) 2 L. FOPPIANO et al.

+

Sci. Technol. Adv. Mater. Meth. 3 (2023) 3 L. FOPPIANO et al.

+

Sci. Technol. Adv. Mater. Meth. 3 (2023) 5 L. FOPPIANO et al.

+

Sci. Technol. Adv. Mater. Meth. 3 (2023) 6 L. FOPPIANO et al.

+

Sci. Technol. Adv. Mater. Meth. 3 (2023) 9 L. FOPPIANO et al.

+

Sci. Technol. Adv. Mater. Meth. 3 (2023) 10 L. FOPPIANO et al.

+

Sci. Technol. Adv. Mater. Meth. 3 (2023) 12 L. FOPPIANO et al.

+ + + +
+
Acknowledgements

Our warmest thanks to Patrice Lopez, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions. We thank Pedro Baptista de Castro for his support during this work. Special thanks to Erina Fujita for useful tips on the manuscript.

+
+
+

Materials Modelling Group, Data-driven Materials Research Field, Centre for Basic Research on Materials, NIMS, 1-1 Namiki, Tsukuba, Ibaraki 305-0044, Japan

+
+
+
Funding

This work was partly supported by MEXT Program: Data Creation and Utilization-Type Material Research and Development Project (Digital Transformation Initiative Center for Magnetic Materials) Grant Number [JPMXP1122715503].

+
+ + + 305-0044 + + +
+
Disclosure statement

No potential conflict of interest was reported by the author(s).

+
Author contribution

LF wrote the manuscript and KT helped with the editing. LF and POS discussed the ML results and experiments. LF implemented the workflow as a standalone service, and TM wrote the front end of the user interface. LF designed the user interface experiment with KT, TT and WS as curators. KT led the materials-science work on the data with CS, TT and WS. KT, TA, YT and MI revised the paper. YT and MI supervised the work of the respective teams.

+
Appendix A. Evaluation

Table A1. Timetable recording the time spent for each of the 15 articles. Each row indicates the time and the event (Start, Finish) from each of the curators: master student (MD), PhD student (PD), and senior researcher (SR). Duration is expressed in minutes.

+
+ + + + + + Automatic extraction of materials and properties from superconductors scientific literature + + LFoppiano + + + PBCastro + + + POSuarez + + 10.1080/27660400.2022.2153633 + + + Sci Technol Adv Mater + + 3 + 1 + 2153633 + 2023 + + + + + + + Materials discovery with machine learning and knowledge discovery + + ONOliveira + + + MJOliveira + + 10.3389/fchem.2022.930369 + + + Front Chem + + 10 + 10 + 2022 + + + + + + + Commentary: the materials project: a materials genome approach to accelerating materials innovation + + AJain + + + SPOng + + + GHautier + + 10.1063/1.4812323 + + + APL Mater + + 1 + 1 + 11002 + 2013 + + + + + + + Aflow: an automatic framework for high-throughput materials discovery + + SCurtarolo + + + WSetyawan + + + GLHart + + + + + Comput Mater Sci + + 58 + + 2012 + + + + + + + The nomad laboratory: from data sharing to artificial intelligence + + CDraxl + + + MScheffler + + 10.1088/2515-7639/ab13bb + + + J Phys Mater + + 2 + 3 + 36001 + 2019 + + + + + + + Global publication productivity in materials science research: a scientometric analysis + + TPratheepan + + + + + Indian J Inf Sources Serv + + 9 + 1 + + 2019 Feb + + + + + + + The PAULING FILE project and materials platform for data science: from big data toward materials genome + + EBlokhin + + + PVillars + + 10.1007/978-3-319-42913-7_62-1 + + 2018 + Springer International Publishing + + Cham + + + + + + + Structuring superconductor data with ontology: reproducing historical datasets as knowledge bases + + MIshii + + + KSakamoto + + 10.1080/27660400.2023.2223051 + + + Sci Technol Adv Mater + + 3 + 1 + 2223051 + 2023 + + + + + + + Predicting new superconductors and their critical temperatures using machine learning + + BRoter + + + SDordevic + + 10.1016/j.physc.2020.1353689 + + + Phys C + + 575 + 1353689 + 2020 + + + + + + + Machine learning modeling of superconducting critical temperature + + VStanev + + + COses + + + AKusne + + 10.1038/s41524-018-0085-8 + + + Npj Comput Mater + + 4 + 1 
+ 4 + 2017 + + + + + + + Machine-learning approach for discovery of conventional superconductors + + HTran + + + TNVu + + arXiv:221103265. 2022 + + + arXiv preprint + + + + + Deep learning model for finding new superconductors + + TKonno + + + HKurokawa + + + FNabeshima + + 10.1103/PhysRevB.103.014509 + + + Phys Rev B + + 103 + 1 + 14509 + 2021 + + + + + + + The INCEpTION platform: machine-assisted and knowledge-oriented interactive annotation + + JCKlie + + + MBugert + + + BBoullosa + + + + + Proceedings of the 27th International Conference on Computational Linguistics: System Demonstrations + the 27th International Conference on Computational Linguistics: System Demonstrations
Santa Fe, New Mexico
+ + 2018 + + +
+
+ + + + Doccano: text annotation tool for human + + HNakayama + + + TKubo + + + JKamura + + + + + Software + + 2018 + + + + + + + Python materials genomics pymatgen: a robust open-source python library for materials analysis + + SPOng + + + WDRichards + + + AJain + + 10.1016/j.commatsci.2012.10.028 + + + Comput Mater Sci + + 68 + 2 + + 2013 + + + + + + + Text-mined dataset of inorganic materials synthesis recipes. Sci Data + + OKononova + + + HHuo + + + THe + + 10.1038/s41597-019-0224-1 + 41597-019-0224-1 + + + 2019 Oct + 6 + 203 + + + + + + + Label studio: data labeling software; 2020-2022 + + MTkachenko + + + MMalyuk + + + AHolmanyuk + + + + + Open source software + + + + + + + Supermat: construction of a linked annotated dataset from superconductors-related publications + + LFoppiano + + + SDieb + + + ASuzuki + + 10.1080/27660400.2021.1918396 + + + Sci Technol Adv Mater: Methods + + 1 + 1 + + 2021 + + + + + + + SciBERT: a pretrained language model for scientific text + + IBeltagy + + + KLo + + + ACohan + + + + + Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing + the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing
Hong Kong; China
+ + Association for Computational Linguistics + Nov. 2019 + + +
+
+ + + + + <ptr target="https://github.com/kermitt2/delft"/> + </analytic> + <monogr> + <title level="j">DeLFT contributors. Delft + + 2018-2023 + + + + + + + Overcoming catastrophic forgetting in neural networks + + JKirkpatrick + + + RPascanu + + + NCRabinowitz + + abs/1612.00796 + + + + CoRr + + 2016 + + + + + + + + <author> + <persName><forename type="first">G</forename><surname>Contributors</surname></persName> + </author> + <author> + <persName><surname>Grobid</surname></persName> + </author> + <ptr target="https://github.com/kermitt2/grobid"/> + <imprint> + <date type="published" when="2008">2008 -2023</date> + </imprint> + </monogr> +</biblStruct> + + </listBibl> + </div> + </back> + </text> +</TEI> \ No newline at end of file From 88bf63bd4a99a3843b03802502ac57bd096cf954 Mon Sep 17 00:00:00 2001 From: Luca Foppiano <Foppiano.Luca@nims.go.jp> Date: Wed, 8 May 2024 16:35:23 +0900 Subject: [PATCH 2/7] add build --- .github/workflows/ci-build-manual.yml | 47 +++++++++++++++++++++++++++ .github/workflows/ci-build.yml | 27 +++++++++++++++ Dockerfile | 2 +- 3 files changed, 75 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/ci-build-manual.yml create mode 100644 .github/workflows/ci-build.yml diff --git a/.github/workflows/ci-build-manual.yml b/.github/workflows/ci-build-manual.yml new file mode 100644 index 0000000..cf8f2c5 --- /dev/null +++ b/.github/workflows/ci-build-manual.yml @@ -0,0 +1,47 @@ +name: Build and push a development version on docker + +on: + workflow_dispatch: + + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - name: Set up JDK 17 + uses: actions/setup-java@v4 + with: + java-version: '17.0.10+7' + distribution: 'temurin' + cache: 'gradle' + - name: Build with Gradle + run: ./gradlew build -x test + + docker-build: + needs: [ build ] + runs-on: ubuntu-latest + + steps: + - name: Create more disk space + run: | + sudo rm -rf /usr/share/dotnet + sudo rm -rf /opt/ghc + sudo rm -rf 
"/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + sudo rm -rf /opt/hostedtoolcache + - uses: actions/checkout@v4 + - name: Build and push + id: docker_build + uses: mr-smithers-excellent/docker-build-push@v6 + with: + dockerfile: Dockerfile + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + image: lfoppiano/Pub2TEI + registry: docker.io + pushImage: true + tags: latest-develop + - name: Image digest + run: echo ${{ steps.docker_build.outputs.digest }} \ No newline at end of file diff --git a/.github/workflows/ci-build.yml b/.github/workflows/ci-build.yml new file mode 100644 index 0000000..1912505 --- /dev/null +++ b/.github/workflows/ci-build.yml @@ -0,0 +1,27 @@ +name: Build unstable + +on: [push] + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - name: Checkout grobid home + uses: actions/checkout@v4 + with: + repository: kermitt2/grobid + path: ./grobid + - name: Checkout Pub2TEI + uses: actions/checkout@v4 + with: + path: ./grobid/Pub2TEI + - name: Set up JDK 17 + uses: actions/setup-java@v4 + with: + java-version: '17.0.10+7' + distribution: 'temurin' + cache: 'gradle' + - name: Build and run integration tests + working-directory: ./grobid/Pub2TEI + run: ./gradlew test diff --git a/Dockerfile b/Dockerfile index 8b45163..fcc911a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,7 @@ ## Docker Pub2TEI image using Grobid deep learning models and/or CRF models for transformation enhancements # this is the full GROBID image using NVIDIA Container Toolkit to automatically recognize possible GPU drivers on the host machine -FROM grobid/grobid:0.8.0 +FROM lfoppiano/grobid:0.8.0-full-slim # Add Tini ENV TINI_VERSION v0.19.0 From 911669fb43268467c3777a34f7a5aa55317630e9 Mon Sep 17 00:00:00 2001 From: Luca Foppiano <Foppiano.Luca@nims.go.jp> Date: Wed, 8 May 2024 16:49:42 +0900 Subject: [PATCH 3/7] more verbosity in case of failures --- .github/workflows/ci-build.yml | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/.github/workflows/ci-build.yml b/.github/workflows/ci-build.yml index 1912505..db1e867 100644 --- a/.github/workflows/ci-build.yml +++ b/.github/workflows/ci-build.yml @@ -24,4 +24,4 @@ jobs: cache: 'gradle' - name: Build and run integration tests working-directory: ./grobid/Pub2TEI - run: ./gradlew test + run: ./gradlew test --stacktrace --info From 7d29e8244c68803a0c27966a8b21c0c4f824cff9 Mon Sep 17 00:00:00 2001 From: Luca Foppiano <Foppiano.Luca@nims.go.jp> Date: Wed, 8 May 2024 16:56:57 +0900 Subject: [PATCH 4/7] add setting file --- settings.gradle | 1 + 1 file changed, 1 insertion(+) create mode 100644 settings.gradle diff --git a/settings.gradle b/settings.gradle new file mode 100644 index 0000000..ef26ca0 --- /dev/null +++ b/settings.gradle @@ -0,0 +1 @@ +rootProject.name = "Pub2TEI" From a9a206156415c07427e22b32372d1562b8577a04 Mon Sep 17 00:00:00 2001 From: Luca Foppiano <Foppiano.Luca@nims.go.jp> Date: Wed, 8 May 2024 17:08:30 +0900 Subject: [PATCH 5/7] Fix java version --- .github/workflows/ci-build.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci-build.yml b/.github/workflows/ci-build.yml index db1e867..3bd4340 100644 --- a/.github/workflows/ci-build.yml +++ b/.github/workflows/ci-build.yml @@ -16,10 +16,10 @@ jobs: uses: actions/checkout@v4 with: path: ./grobid/Pub2TEI - - name: Set up JDK 17 + - name: Set up JDK 11 uses: actions/setup-java@v4 with: - java-version: '17.0.10+7' + java-version: '11' distribution: 'temurin' cache: 'gradle' - name: Build and run integration tests From 42c470e3b20a51bac30c3eefec1ff72e07acc47b Mon Sep 17 00:00:00 2001 From: Luca Foppiano <Foppiano.Luca@nims.go.jp> Date: Wed, 8 May 2024 17:13:01 +0900 Subject: [PATCH 6/7] Fix java version --- .github/workflows/ci-build-manual.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci-build-manual.yml b/.github/workflows/ci-build-manual.yml index cf8f2c5..21a5254 100644 
--- a/.github/workflows/ci-build-manual.yml +++ b/.github/workflows/ci-build-manual.yml @@ -10,10 +10,10 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Set up JDK 17 + - name: Set up JDK 11 uses: actions/setup-java@v4 with: - java-version: '17.0.10+7' + java-version: '11' distribution: 'temurin' cache: 'gradle' - name: Build with Gradle From b85dd4fede04de43dbb59b7af3a4225b17f32ddb Mon Sep 17 00:00:00 2001 From: Luca Foppiano <luca@foppiano.org> Date: Fri, 6 Sep 2024 17:25:40 +0200 Subject: [PATCH 7/7] Avoid calling serialize twice, avoid serializing empty nodes --- src/main/java/org/pub2tei/document/XMLUtilities.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/pub2tei/document/XMLUtilities.java b/src/main/java/org/pub2tei/document/XMLUtilities.java index 845f5e6..253c9ec 100644 --- a/src/main/java/org/pub2tei/document/XMLUtilities.java +++ b/src/main/java/org/pub2tei/document/XMLUtilities.java @@ -235,15 +235,16 @@ public static void segment(org.w3c.dom.Document doc, Node node) { NodeList childNodes = n.getChildNodes(); for(int y=0; y<childNodes.getLength(); y++) { Node item = childNodes.item(y); - if (y > 0) { - String firstChar = "" + serialize(doc, item).charAt(0); + String serializedString = serialize(doc, item); + if (y > 0 && StringUtils.isNotEmpty(serializedString)) { + String firstChar = "" + serializedString.charAt(0); //We might need to use TextUtilities.fullPunctuation if (!Pattern.matches("\\p{Punct}", firstChar)) { textBuffer.append(" "); } } - textBuffer.append(serialize(doc, item)); + textBuffer.append(serializedString); } String text = textBuffer.toString(); List<OffsetPosition> theSentenceBoundaries = SentenceUtilities.getInstance().runSentenceDetection(text);