diff --git a/build.gradle b/build.gradle
index fc8e3f2..8978717 100644
--- a/build.gradle
+++ b/build.gradle
@@ -123,6 +123,9 @@ dependencies {
implementation "org.apache.lucene:lucene-analyzers-common:4.5.1"
implementation group: 'org.jruby', name: 'jruby-complete', version: '9.2.13.0'
+ testImplementation "org.xmlunit:xmlunit-matchers:2.10.0"
+ testImplementation "org.xmlunit:xmlunit-legacy:2.10.0"
+
implementation 'org.slf4j:slf4j-api:1.7.30'
implementation 'ch.qos.logback:logback-classic:1.2.3'
implementation "com.rockymadden.stringmetric:stringmetric-core_2.10:0.27.3"
diff --git a/src/main/java/org/pub2tei/document/XMLUtilities.java b/src/main/java/org/pub2tei/document/XMLUtilities.java
index 9b431a5..253c9ec 100644
--- a/src/main/java/org/pub2tei/document/XMLUtilities.java
+++ b/src/main/java/org/pub2tei/document/XMLUtilities.java
@@ -1,26 +1,35 @@
package org.pub2tei.document;
-import java.io.*;
-import java.util.*;
-import javax.xml.parsers.*;
-import javax.xml.transform.*;
-import javax.xml.transform.dom.DOMSource;
-import javax.xml.transform.stream.StreamResult;
-import javax.xml.namespace.NamespaceContext;
-import javax.xml.xpath.*;
-
import net.sf.saxon.om.NameChecker;
-
-import org.w3c.dom.*;
-import org.xml.sax.*;
-
+import org.apache.commons.collections.CollectionUtils;
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.lang3.StringUtils;
import org.grobid.core.utilities.OffsetPosition;
import org.grobid.core.utilities.SentenceUtilities;
-
-import org.apache.commons.io.FileUtils;
-
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import org.w3c.dom.*;
+import org.xml.sax.ErrorHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXParseException;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerException;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+import javax.xml.xpath.XPath;
+import javax.xml.xpath.XPathConstants;
+import javax.xml.xpath.XPathExpression;
+import javax.xml.xpath.XPathFactory;
+import java.io.*;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.regex.Pattern;
/**
* Some convenient methods for suffering a bit less with XML
@@ -222,11 +231,20 @@ public static void segment(org.w3c.dom.Document doc, Node node) {
(textualElements.contains(n.getNodeName())) ) {
// text content
- StringBuffer textBuffer = new StringBuffer();
+ StringBuilder textBuffer = new StringBuilder();
NodeList childNodes = n.getChildNodes();
for(int y=0; y Our warmest thanks to Patrice Lopez, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions. We thank Pedro Baptista de Castro for his support during this work. Special thanks to Erina Fujita for useful tips on the manuscript. \n" +
+ "\t\t\t
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This research was funded by the
+
+ Although shifting cultivation is the major land use type in Laos, the spatial-temporal patterns and the associated carbon emissions of shifting cultivation in Laos are largely unknown. This study provides a nationwide analysis of the spatial-temporal patterns of shifting cultivation and estimations of the associated carbon emissions in Laos over the last three decades. This study found that shifting cultivation has been expanding and intensifying in Laos, especially in the last 5 years. The newly cultivated land from 2016 to 2020 accounted for 4.5% (±1.2%) of the total land area of Laos. Furthermore, the length of fallow periods has been continuously declining, indicating that shifting cultivation is becoming increasingly intensive. Combining biomass derived from Global Ecosystem Dynamics Investigation and shifting cultivation maps and area estimates, we found that the net carbon emissions from shifting cultivation declined in 2001-2015 but increased in 2016-2020. The largest carbon source is conversion from intact forests to shifting cultivation, which contributed to 89% of the total emissions from 2001 to 2020. In addition, there were increased emissions from intensified use of fallow lands. This research provides useful information for policymakers in Laos to understand the changes in shifting cultivation and improve land use management. This study not only supports Reducing Emissions from Deforestation and Forest Degradation reporting for Laos but also provides a methodology for tracking carbon emissions and removals of shifting cultivation. Shifting cultivation is an agricultural practice where farmers routinely move from one plot to another for cultivation. It begins with the practice of 'slash-andburn' , where trees and woody plants are cut down and burnt to prepare an ash-fertilized plot for temporary cultivation. After short-term cultivation, the plot is abandoned, which allows the vegetation to recover. 
Shifting cultivation is the predominant land use and a major cause of forest degradation and deforestation in some tropical countries (Heinimann et al 2017, Curtis et al 2018, Jiang et al 2022), such as Laos (Chen et al 2023), and the Democratic Republic of Congo (Molinario et al 2015). Monitoring shifting cultivation is complicated, because it is highly dynamic, and the area affected by each slash-and-burn event is small. Due to the difficulty of monitoring shifting cultivation, spatially and temporally explicit information on shifting cultivation is scarce. Shifting cultivation has both short-term and long-term effects on carbon emissions (Ziegler et al 2012). In the short term, the slash-and-burn activities cause immediate release of carbon. In the long term, encroachment of shifting cultivation into primary forest and intensified use of secondary forest both lead to long-term increases in net carbon emissions and degradation of ecosystems. Carbon emissions from shifting cultivation have not been well quantified, because of the lack of methodology for monitoring shifting cultivation and tracking the associated carbon dynamics. In contrast to deforestation (such as urbanization), which does not involve carbon sequestration, shifting cultivation involves both carbon emissions associated with slash-and-burn activities and carbon sequestration during the fallow period. Due to the complexity of monitoring shifting cultivation and tracking the associated carbon dynamics, estimates of carbon emissions or sequestration from shifting cultivation are usually unavailable in REDD+ (Reducing Emissions from Deforestation and Forest Degradation) reporting. In Laos, officially the Lao People's Democratic Republic (Lao PDR), shifting cultivation is an important agricultural system (Roder 2001, Douangsavanh et al 2006, Epprecht et al 2018, Manivong and Cramb 2020) and the major driver of forest dynamics (Curtis et al 2018, Chen et al 2023). 
It is estimated that shifting cultivation affected 32.9 ± 1.9% of Laos from 1991 to 2020, and the shifting cultivation activities increased in the most recent 5 years (Chen et al 2023). Laos' population has been increasing steadily from 4.314 million in 1990 to 7.319 million in 2020 (World Bank 2023), whereas upland rice yields did not distinctly improve between 1990 and 2020. Shifting cultivation activities are expected to increase due to the increasing demand for rice. Monitoring shifting cultivation and analyzing its patterns are important to understand the forest cover change in Laos and relevant to achieving Laos' goal of increasing forest cover to 70% (The current forest cover is 62%) (The Government of Lao PDR 2005). Since there were few spatially and temporally explicit maps and estimates of shifting cultivation before Chen et al (2023), carbon emissions from shifting cultivation have not been accurately estimated in the REDD+ reporting of Laos (Department of Forestry, Ministry of Agriculture and Forestry, Lao PDR 2018). Spatially and temporally explicit information about shifting cultivation in Laos was unavailable until recently (Chen et al 2023), and a comprehensive national-scale analysis of the spatial and temporal patterns of shifting cultivation has not been conducted to date. A traditional approach for mapping shifting cultivation is to create landscape mosaics based on a land cover map of a single year (Messerli et al 2009, Silva et al 2011, Hett et al 2012, Hurni et al 2013a). It is impossible to analyze the temporal patterns of shifting cultivation using this traditional approach. Another approach is to use multi-temporal land cover data to map shifting cultivation (Leisz and Rasmussen 2012, Molinario et al 2015, Department of Forestry, Ministry of Agriculture and Forestry, Lao PDR 2018, Adhikary et al 2019, Kurien et al 2019). 
In previous studies, the temporal resolution of the land cover maps was not high enough to support the analysis of temporal patterns (Heinimann et al 2013). Recently, Chen et al (2023) used satellite data to create shifting cultivation products for Laos with sufficient temporal frequency (annual) and spatial resolution (30 m) to support a nationalscale spatial-temporal analysis. The recently launched GEDI (Global Ecosystem Dynamics Investigation) mission provides new opportunities for estimating biomass at a large scale (Tang et al 2020). This study used the map products and reference data in Chen et al (2023), combined with GEDI, to conduct a national-scale analysis of the spatial and temporal patterns and carbon dynamics of shifting cultivation in Laos. The goal is to understand the spatial and temporal patterns of shifting cultivation and the associated carbon emissions, in support of decision-making to reduce carbon emissions and promote sustainable livelihoods depending on shifting cultivation. Shifting cultivation was mapped using Landsat data from 1987 to 2020 on Google Earth Engine (Chen et al 2023). CCDC-SMA (continuous change detection and classification-spectral mixture analysis) (Zhu and Woodcock 2014, Bullock et al 2020, Chen et al 2021) was used to detect forest disturbances in Laos. CCDC-SMA fits harmonic models to fractions of endmembers and NDFI (Normalized Difference Fraction Index) (Souza et al 2005) to monitor forest disturbances (figure 1). Annual maps of Shifting Cultivation from 1991 to 2020 were created by combining time series analysis, object-based image analysis, and post-disturbed land-cover classification. A total of 1000 sample units under simple random sampling were used as reference data for accuracy assessment and area estimation. 
For each sample unit, at least two interpreters interpreted the land change class and the year of each slash-and-burn event by examining high-resolution satellite imagery and Landsat time series (figures 1(a), (c) and 2). During 1991-2020, shifting cultivation was the main type of forest disturbance in Laos, affecting 32.9 ± 1.9% of Laos (Chen et al 2023). Shifting cultivation was mapped with a producer's accuracy of 88% and a user's accuracy of 80% (Chen et al 2023). Chen et al (2023) describes more details of the monitoring method. Both the map products and the reference data from Chen et al (2023) were used in this study. The annual maps of shifting cultivation and the reference sample units interpreted as Shifting Cultivation were used to investigate the patterns of shifting cultivation. We estimated the area of shifting cultivation at different fallow and disturbance 55 ′ 27 ′′ E. In the time series plot, the blue points are the Landsat observations. In the Landsat images (Red-green-blue), the yellow squares show the pixel location. In the high-resolution image, the white point shows the pixel location.). to explore whether the extent of shifting cultivation expanded, the newly and previously cultivated areas of shifting cultivation were estimated using reference sample units and maps in Chen et al (2023) for every 5 year period from 2000 to 2020. In the reference sample points, whether a pixel is newly or previously cultivated is determined by the year of slash-andburn recorded by the interpreters (e.g. figures 1(a), (c) and 2). Furthermore, to investigate the change patterns in fallow length and cultivation length (length of cropping period), we visually interpreted Landsat time series, Landsat imagery, and high-resolution images for 196 sample points (figure 3 as an example). These sample points are the points with at least two cultivation events in the aforementioned reference data with 1000 simple random sample points. 
For each point, the year of slash and burn (land clearing), cultivation length, and fallow length are recorded for every event. The GEDI mission provides space-borne LiDAR data to estimate aboveground biomass (Healey et al 2020). GEDI's L4A Footprint Level Aboveground Biomass Density (AGBD) (version 2.1) 25 m data (Beck et al 2020, Dubayah et al 2022) were used to explore the effect of shifting cultivation on biomass. GEDI data collected in 2020 was used because it was the only year of data with good spatial coverage when the study was conducted. To overlay the GEDI footprint and Landsat, for each GEDI footprint, we extracted the value of the 30 m pixel in the Landsat-based map that has the largest overlap with the 25 m footprint. Only lidar observations with good quality (using the 'quality_flag' band and the 'degrade_flag' band) and collected at places with a slope less than 20 • and in the interior of shifting cultivation sites (excluding a two-pixel edge) were used, to eliminate the effect of terrain and possible misregistration at the edges of slash-and-burn events. The reason why we excluded lidar points with slopes larger than 20 • is that GEDIbased biomass estimates tend to be overestimated at steep terrain. AGBD was calculated for Active Shifting Cultivation, Inactive Shifting Cultivation, Intact Forest, and Others. Intact Forest here is defined as forests without significant anthropogenic disturbances. The relationship between AGBD and years of regrowth since the latest slash-and-burn events was analyzed. The hypothesis was that AGBD has a positive relationship with years of regrowth since the latest slash-andburn activity. From this relationship, a country-level growth curve of AGBD can be developed and used to estimate the biomass of fallow lands. Carbon emissions from shifting cultivation were estimated for every 5 year period from 2001 to 2020. Table 1 shows the activity classes, definitions, and emission factors. 
New Shifting Cultivation area was estimated from a sampling-based method The emission factors for activities other than New Shifting Cultivation are spatially explicit and were determined by the map of the latest year of slash and burn and the growth curve. Figure 8 shows an example of the spatially explicit emission factors for different activities. Specifically, this was how the carbon emissions and removals of Fallow land -> Fallow land, Fallow land -> Cleared land, and Cleared land -> Fallow land were calculated: The latest year of disturbance of Fallow land was determined using the annual shifting cultivation maps. Then, the AGBD of fallow lands was calculated using equation (1). Using AGBD of fallow land in the end year minus AGBD in the start year of each period, the differences in AGBD were obtained. Multiply the differences in AGBD by the area of different activities and then multiply it by the conversion factor (0.5), and the carbon emissions and removals of each activity were calculated. The average emission/removal factors were calculated using the emissions and removals divided by the total area of activities in different categories. A large proportion of the land used for shifting cultivation in Laos remains in use. During our study period, the estimated area of Active Shifting Cultivation (19.1 ± 1.6%) exceeded the area of Inactive Shifting Cultivation (13.7 ± 1.8%). In the future, there is a possibility of reusing Inactive Shifting Cultivation and further increasing the area of Active Shifting Cultivation, given the increasing demand for crops. New Shifting Cultivation, defined as shifting cultivation that first occurred in each period, was estimated from 2001 to 2020 by period (figure 4). The area estimates were aggregated into 5 year periods instead of calculating annual to reduce uncertainties of the area estimates. 
From 1991 to 2000, it is difficult to tell whether the shifting cultivation areas were new or old, and thus this analysis started in 2001. In all 5 year periods, the area of New Shifting Cultivation is higher than 3% of Laos, implying that on average, over 0.6% of Laos' land area is converted from intact forest to shifting cultivation each year. Our results indicate that the extent of shifting cultivation has been expanding. During 2001-2015, there was a decrease in the area of New Shifting Cultivation. However, both the area of New Shifting Cultivation and the total area of Shifting Cultivation have increased significantly in 2016-2020. The proportion of previously and newly cultivated to the total area of shifting cultivation was calculated for every year using the annual maps (figure 5). Before 2007, the newly cultivated areas were larger than the previously cultivated, and the trend reversed after 2007. There was a general decreasing trend in the proportion of New Shifting Cultivation, but increases were observed in 2019 and 2020. We suppose that the general decreasing trend is because intact forests available for cultivation decreased over time and previously cultivated land is easier to clear for future cultivation. Based on the sample interpretation results, most cultivation lengths are either one year or two years. Although there are variations across the years, we have not seen major changes in average cultivation length (figures S1 and S2). The mean length of the fallow periods of shifting cultivation in Laos is 6.5 years, which is close to the length of fallow periods reported in the literature (7 years) (Department of Forestry, Ministry of Agriculture and Forestry, Lao PDR 2018). The fallow length has been continuously declining (figure 6). The reduction in the length of fallow periods indicates that shifting cultivation has intensified. The AGBD was lower in shifting cultivation regions than in the intact forests. 
The median AGBD of Intact Forest, Inactive Shifting Cultivation, Active Shifting Cultivation, and Others are 151.9 Mg ha -1 , 87.9 Mg ha -1 , 39.5 Mg ha -1 , and 22.8 Mg ha -1 , respectively. The biomass of Inactive Shifting Cultivation only reached about 60% of that of the intact forest. In the literature (Department of Forestry, Ministry of Agriculture and Forestry, Lao PDR 2018), the regions of Inactive Shifting Cultivation were considered to be 'recovered' , whereas our results show that the AGBD is not recovered even if these regions have been left for fallow for at least seven years. To investigate the relationship between median AGBD and disturbance history, a logarithmic regression was conducted on years of regrowth since the latest slash-and-burn events and median AGBD of GEDI footprints (figure 7). The logarithmic model of years of regrowth (x) and AGBD (y) is (R square is 0.93): y = 29.129 ln (x) + 9.907 (1) AGBD was strongly correlated with years of regrowth. Equation ( 1) and the maps of years of regrowth were used to calculate the biomass of fallow lands and spatially explicit emission/removal factors (figure 8). in this period (figure 5) and the decrease in carbon sink of fallow lands in this period. For every period, New Shifting Cultivation is the largest carbon source, contributing to more than 80% of the total emissions. From 2001 to 2020, New Shifting Cultivation contributed to 89% of the total emissions. Fallow lands are important carbon sinks and sequestered about 70% of the total emissions during 2006-2015. However, carbon sequestration of fallow lands also decreased in recent years because of the intensified use of fallow land. To summarize, the increase in emissions from shifting cultivation encroachment to intact forests (New Shifting Cultivation) and intensified use of secondary forests both led to the recent increase in net emissions from shifting cultivation. 
In this study, the spatial-temporal patterns and the carbon dynamics of shifting cultivation in Laos were analyzed. The results showed that shifting cultivation has been expanding and intensifying. The area of shifting cultivation has increased significantly over the last 5 years. The fallow length has been declining continuously, which indicates the intensification of shifting cultivation. Our finding of a reduction of fallow length is consistent with previous local studies (Rasul and Thapa 2003, Saphangthong and Kono 2009, van Vliet et al 2012). We found that AGBD was strongly correlated with years of regrowth since the latest year of slash-and-burn activities, which can be 2019) is understandable since their major focus is forest loss instead of shifting cultivation. This comparison is not a criticism of the aforementioned studies. Instead, it highlights the benefits of using shifting cultivation maps and reference samples with better spatial resolution and high temporal frequency for the analysis of spatial-temporal patterns. We compared our area estimates of New Shifting Cultivation with the official forest change statistics from Laos (table S1). The Laos official forest change maps (https://nfms.maf.gov.la/) are created from the land cover classification maps from the start year and end year for each period (see the periods in table S1). Since shifting cultivation is the major driver of forest degradation and deforestation in Laos, we expect that there are some consistencies between the areas of New Shifting cultivation and the areas of forest degradation and deforestation. There are consistencies in the period 2006-2010 and 2011-2015, with the differences between our estimates and the official statistics both less than 1% of Laos. Our estimates of New Shifting Cultivation are generally higher than the Laos official estimates of deforestation and forest degradation, except for 2006-2010. This was partly due to the different monitoring approaches. 
Without using dense time series, the shifting cultivation events that occurred over five years may be difficult to detect using two classification maps from the start and the end. In the period 2001-2005 and 2016-2020, our estimates are about 2%-3% higher than the official estimates. For 2016-2020, the discrepancy is partly because the 2019 and 2020 changes are included in our estimates but not in the official statistics. Overall, our results and area estimates provide valuable information regarding the forest dynamics of Laos. Furthermore, we compared the shifting cultivation map with the field survey data in the Laos National Forest Monitoring System https://nfms. maf.gov.la/. The shifting cultivation map was compared with 39 field points identified as 'Regenerating Vegetation' or 'Upland crop' in 2010, 2011, 2012, or 2019, since these two land cover classes are generally considered to have an association with shifting cultivation practices (Department of Forestry 2020). The 31 out of 39 (80%) points are correctly mapped as shifting cultivation. As a national-level analysis of spatial-temporal patterns and estimation of carbon dynamics of shifting cultivation in Laos, our research is valuable to sustainable land resource management. The sustainability of the land is negatively impacted by the recent expansion and intensification of shifting cultivation, indicated by an increase in newly cultivated areas in 2016-2020 and a reduction of fallow length in 1991-2020. Moreover, our research provides a quantitative analysis of carbon emissions of shifting cultivation, which is crucial for REDD+ reporting in Laos. Our research indicates that carbon emissions from shifting cultivation can be quantified by combining GEDI data with shifting cultivation maps and area estimates. The fallow land sequestrated a significant amount of carbon in the past, but this carbon sink declined in recent years. 
The recent increase in new shifting cultivation events also led to an increase in net carbon emissions. This highlights the importance of protecting the primary forest from the encroachment of new shifting cultivation and the restoration of old fallow lands. Our study has several limitations and future research can make improvements by using more sophisticated models and integration with other data. The first limitation is the usage of GEDI data. Our research only used GEDI in one year (2020), because GEDI is a new mission and 2020 was the only year with good coverage data when the study was conducted. Future studies can use GEDI for multiple years as more data will be collected. In addition, we excluded GEDI points where the slope is larger than 20 • to avoid overestimation of biomass in steep terrain. This would introduce regional bias on the growth curve and emission factors. Based on our map, 69% of the shifting cultivation area is in places with slopes less than 20 • (Chen 2022). Future research should improve GEDI biomass estimates in steep terrain. Second, although we compared our map with some field survey data in Laos, the field data information for each location is limited. Future studies should collect more detailed information on shifting cultivation in field surveys, especially biomass in shifting cultivation landscapes (e.g. Salinas-Melgoza et al 2017, Borah et al 2018, Gogoi et al 2020 ). Third, the carbon estimation only considered aboveground biomass change and no other carbon pools due to a lack of field survey data on those carbon pools. Future research can conduct field surveys on belowground biomass and include the belowground carbon pools in carbon emission estimation. Fourth, future research should investigate the causes of the recent increase in shifting cultivation, which requires field surveys. Our research provides a national-level analysis of spatial-temporal patterns and estimation of carbon dynamics of shifting cultivation in Laos. 
Our analysis shows that shifting cultivation in Laos has been expanding and intensifying, particularly in the recent 5 years. The practice of shifting cultivation has become increasingly intensive as the length of the fallow periods has been continuously shortening. Combining GEDI data with shifting cultivation maps and area estimates, carbon emissions from shifting cultivation can be quantified. The net carbon emissions from shifting cultivation declined in the past but increased recently. This study not only supports REDD+ reporting for Laos but also demonstrates a method of tracking carbon dynamics in shifting cultivation landscapes. USGS Landsat Science Team Program for Better Use of the Landsat Temporal Domain: Monitoring Land Cover Type, Condition and Change (Grant Number: G12PC00070). The authors are grateful to the editors and two anonymous reviewers for their insightful and constructive comments, which greatly helped to improve this paper. ZeroOur warmest thanks to Patrice Lopez, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions.\n" +
+ "\t\t\tWe thank Pedro Baptista de Castro for his support during this work.\n" +
+ "\t\t\tSpecial thanks to Erina Fujita for useful tips on the manuscript.\n" +
+ "\t\tAlthough shifting cultivation is the major land use type in Laos, the spatial-temporal patterns and the associated carbon emissions of shifting cultivation in Laos are largely unknown.
+ This study provides a nationwide analysis of the spatial-temporal patterns of shifting cultivation and estimations of the associated carbon emissions in Laos over the last three decades.
+ This study found that shifting cultivation has been expanding and intensifying in Laos, especially in the last 5 years.
+ The newly cultivated land from 2016 to 2020 accounted for 4.5% (±1.2%) of the total land area of Laos.
+ Furthermore, the length of fallow periods has been continuously declining, indicating that shifting cultivation is becoming increasingly intensive.
+ Combining biomass derived from Global Ecosystem Dynamics Investigation and shifting cultivation maps and area estimates, we found that the net carbon emissions from shifting cultivation declined in 2001-2015 but increased in 2016-2020.
+ The largest carbon source is conversion from intact forests to shifting cultivation, which contributed to 89% of the total emissions from 2001 to 2020.
+ In addition, there were increased emissions from intensified use of fallow lands.
+ This research provides useful information for policymakers in Laos to understand the changes in shifting cultivation and improve land use management.
+ This study not only supports Reducing Emissions from Deforestation and Forest Degradation reporting for Laos but also provides a methodology for tracking carbon emissions and removals of shifting cultivation.
+ Shifting cultivation is an agricultural practice where farmers routinely move from one plot to another for cultivation.
+ It begins with the practice of 'slash-andburn' , where trees and woody plants are cut down and burnt to prepare an ash-fertilized plot for temporary cultivation.
+ After short-term cultivation, the plot is abandoned, which allows the vegetation to recover.
+ Shifting cultivation is the predominant land use and a major cause of forest degradation and deforestation in some tropical countries (Heinimann et al 2017
+ , Curtis et al 2018
+ , Jiang et al 2022), such as Laos (Chen et al 2023), and the Democratic Republic of Congo (Molinario et al 2015).
+ Monitoring shifting cultivation is complicated, because it is highly dynamic, and the area affected by each slash-and-burn event is small.
+ Due to the difficulty of monitoring shifting cultivation, spatially and temporally explicit information on shifting cultivation is scarce.
+ Shifting cultivation has both short-term and long-term effects on carbon emissions (Ziegler et al 2012).
+ In the short term, the slash-and-burn activities cause immediate release of carbon.
+ In the long term, encroachment of shifting cultivation into primary forest and intensified use of secondary forest both lead to long-term increases in net carbon emissions and degradation of ecosystems.
+ Carbon emissions from shifting cultivation have not been well quantified, because of the lack of methodology for monitoring shifting cultivation and tracking the associated carbon dynamics.
+ In contrast to deforestation (such as urbanization), which does not involve carbon sequestration, shifting cultivation involves both carbon emissions associated with slash-and-burn activities and carbon sequestration during the fallow period.
+ Due to the complexity of monitoring shifting cultivation and tracking the associated carbon dynamics, estimates of carbon emissions or sequestration from shifting cultivation are usually unavailable in REDD+ (Reducing Emissions from Deforestation and Forest Degradation) reporting.
+ In Laos, officially the Lao People's Democratic Republic (Lao PDR), shifting cultivation is an important agricultural system (Roder 2001
+ , Douangsavanh et al 2006
+ , Epprecht et al 2018
+ , Manivong and Cramb 2020) and the major driver of forest dynamics (Curtis et al 2018
+ , Chen et al 2023).
+ It is estimated that shifting cultivation affected 32.9 ± 1.9% of Laos from 1991 to 2020, and the shifting cultivation activities increased in the most recent 5 years (Chen et al 2023).
+ Laos' population has been increasing steadily from 4.314 million in 1990 to 7.319 million in 2020 (World Bank 2023), whereas upland rice yields did not distinctly improve between 1990 and 2020.
+ Shifting cultivation activities are expected to increase due to the increasing demand for rice.
+ Monitoring shifting cultivation and analyzing its patterns are important to understand the forest cover change in Laos and relevant to achieving Laos' goal of increasing forest cover to 70% (The current forest cover is 62%) (The Government of Lao PDR 2005).
+ Since there were few spatially and temporally explicit maps and estimates of shifting cultivation before Chen et al (2023), carbon emissions from shifting cultivation have not been accurately estimated in the REDD+ reporting of Laos (Department of Forestry, Ministry of Agriculture and Forestry, Lao PDR 2018).
+ Spatially and temporally explicit information about shifting cultivation in Laos was unavailable until recently (Chen et al 2023), and a comprehensive national-scale analysis of the spatial and temporal patterns of shifting cultivation has not been conducted to date.
+ A traditional approach for mapping shifting cultivation is to create landscape mosaics based on a land cover map of a single year (Messerli et al 2009
+ , Silva et al 2011
+ , Hett et al 2012
+ , Hurni et al 2013a).
+ It is impossible to analyze the temporal patterns of shifting cultivation using this traditional approach.
+ Another approach is to use multi-temporal land cover data to map shifting cultivation (Leisz and Rasmussen 2012
+ , Molinario et al 2015
+ , Department of Forestry, Ministry of Agriculture and Forestry, Lao PDR 2018
+ , Adhikary et al 2019
+ , Kurien et al 2019).
+ In previous studies, the temporal resolution of the land cover maps was not high enough to support the analysis of temporal patterns (Heinimann et al 2013).
+ Recently, Chen et al (2023) used satellite data to create shifting cultivation products for Laos with sufficient temporal frequency (annual) and spatial resolution (30 m) to support a national-scale spatial-temporal analysis.
+ The recently launched GEDI (Global Ecosystem Dynamics Investigation) mission provides new opportunities for estimating biomass at a large scale (Tang et al 2020).
+ This study used the map products and reference data in Chen et al (2023), combined with GEDI, to conduct a national-scale analysis of the spatial and temporal patterns and carbon dynamics of shifting cultivation in Laos.
+ The goal is to understand the spatial and temporal patterns of shifting cultivation and the associated carbon emissions, in support of decision-making to reduce carbon emissions and promote sustainable livelihoods depending on shifting cultivation.
+ Shifting cultivation was mapped using Landsat data from 1987 to 2020 on Google Earth Engine (Chen et al 2023).
+ CCDC-SMA (continuous change detection and classification-spectral mixture analysis) (Zhu and Woodcock 2014, Bullock et al 2020, Chen et al 2021) was used to detect forest disturbances in Laos.
+ CCDC-SMA fits harmonic models to fractions of endmembers and NDFI (Normalized Difference Fraction Index) (Souza et al 2005) to monitor forest disturbances (figure 1).
+ Annual maps of Shifting Cultivation from 1991 to 2020 were created by combining time series analysis, object-based image analysis, and post-disturbed land-cover classification.
+ A total of 1000 sample units under simple random sampling were used as reference data for accuracy assessment and area estimation.
+ For each sample unit, at least two interpreters interpreted the land change class and the year of each slash-and-burn event by examining high-resolution satellite imagery and Landsat time series (figures 1(a), (c) and 2).
+ During 1991-2020, shifting cultivation was the main type of forest disturbance in Laos, affecting 32.9 ± 1.9% of Laos (Chen et al 2023).
+ Shifting cultivation was mapped with a producer's accuracy of 88% and a user's accuracy of 80% (Chen et al 2023).
+
+ Chen et al (2023) describe the monitoring method in more detail.
+ Both the map products and the reference data from Chen et al (2023) were used in this study.
+ The annual maps of shifting cultivation and the reference sample units interpreted as Shifting Cultivation were used to investigate the patterns of shifting cultivation.
+ We estimated the area of shifting cultivation at different fallow and disturbance 55 ′ 27 ′′ E. In the time series plot, the blue points are the Landsat observations.
+ In the Landsat images (Red-green-blue), the yellow squares show the pixel location.
+ In the high-resolution image, the white point shows the pixel location.).
+ To explore whether the extent of shifting cultivation expanded, the newly and previously cultivated areas of shifting cultivation were estimated using reference sample units and maps in Chen et al (2023) for every 5 year period from 2000 to 2020.
+ In the reference sample points, whether a pixel is newly or previously cultivated is determined by the year of slash-and-burn recorded by the interpreters (e.g. figures 1(a), (c) and 2).
+ Furthermore, to investigate the change patterns in fallow length and cultivation length (length of cropping period), we visually interpreted Landsat time series, Landsat imagery, and high-resolution images for 196 sample points (figure 3 as an example).
+ These sample points are the points with at least two cultivation events in the aforementioned reference data with 1000 simple random sample points.
+ For each point, the year of slash and burn (land clearing), cultivation length, and fallow length are recorded for every event.
+ The GEDI mission provides space-borne LiDAR data to estimate aboveground biomass (Healey et al 2020).
+ GEDI's L4A Footprint Level Aboveground Biomass Density (AGBD) (version 2.1) 25 m data (Beck et al 2020
+ , Dubayah et al 2022) were used to explore the effect of shifting cultivation on biomass.
+ GEDI data collected in 2020 was used because it was the only year of data with good spatial coverage when the study was conducted.
+ To overlay the GEDI footprint and Landsat, for each GEDI footprint, we extracted the value of the 30 m pixel in the Landsat-based map that has the largest overlap with the 25 m footprint.
+ Only lidar observations with good quality (using the 'quality_flag' band and the 'degrade_flag' band) and collected at places with a slope less than 20° and in the interior of shifting cultivation sites (excluding a two-pixel edge) were used, to eliminate the effect of terrain and possible misregistration at the edges of slash-and-burn events.
+ The reason why we excluded lidar points with slopes larger than 20° is that GEDI-based biomass estimates tend to be overestimated at steep terrain.
+ AGBD was calculated for Active Shifting Cultivation, Inactive Shifting Cultivation, Intact Forest, and Others.
+ Intact Forest here is defined as forests without significant anthropogenic disturbances.
+ The relationship between AGBD and years of regrowth since the latest slash-and-burn events was analyzed.
+ The hypothesis was that AGBD has a positive relationship with years of regrowth since the latest slash-and-burn activity.
+ From this relationship, a country-level growth curve of AGBD can be developed and used to estimate the biomass of fallow lands.
+ Carbon emissions from shifting cultivation were estimated for every 5 year period from 2001 to 2020.
+ Table 1 shows the activity classes, definitions, and emission factors.
+ New Shifting Cultivation area was estimated from a sampling-based method. The emission factors for activities other than New Shifting Cultivation are spatially explicit and were determined by the map of the latest year of slash and burn and the growth curve.
+ Figure 8 shows an example of the spatially explicit emission factors for different activities.
+ Specifically, this was how the carbon emissions and removals of Fallow land -> Fallow land, Fallow land -> Cleared land, and Cleared land -> Fallow land were calculated: The latest year of disturbance of Fallow land was determined using the annual shifting cultivation maps.
+ Then, the AGBD of fallow lands was calculated using equation (1).
+ The differences in AGBD were obtained by subtracting the AGBD of fallow land in the start year of each period from the AGBD in the end year.
+ The differences in AGBD were then multiplied by the area of the different activities and by the conversion factor (0.5) to calculate the carbon emissions and removals of each activity.
+ The average emission/removal factors were calculated using the emissions and removals divided by the total area of activities in different categories.
+ A large proportion of the land used for shifting cultivation in Laos remains in use.
+ During our study period, the estimated area of Active Shifting Cultivation (19.1 ± 1.6%) exceeded the area of Inactive Shifting Cultivation (13.7 ± 1.8%).
+ In the future, there is a possibility of reusing Inactive Shifting Cultivation and further increasing the area of Active Shifting Cultivation, given the increasing demand for crops.
+ New Shifting Cultivation, defined as shifting cultivation that first occurred in each period, was estimated from 2001 to 2020 by period (figure 4).
+ The area estimates were aggregated into 5 year periods, instead of being calculated annually, to reduce the uncertainties of the area estimates.
+ From 1991 to 2000, it is difficult to tell whether the shifting cultivation areas were new or old, and thus this analysis started in 2001.
+ In all 5 year periods, the area of New Shifting Cultivation is higher than 3% of Laos, implying that on average, over 0.6% of Laos' land area is converted from intact forest to shifting cultivation each year.
+ Our results indicate that the extent of shifting cultivation has been expanding.
+ During 2001-2015, there was a decrease in the area of New Shifting Cultivation.
+ However, both the area of New Shifting Cultivation and the total area of Shifting Cultivation have increased significantly in 2016-2020.
+ The proportion of previously and newly cultivated to the total area of shifting cultivation was calculated for every year using the annual maps (figure 5).
+ Before 2007, the newly cultivated areas were larger than the previously cultivated, and the trend reversed after 2007.
+ There was a general decreasing trend in the proportion of New Shifting Cultivation, but increases were observed in 2019 and 2020.
+ We suppose that the general decreasing trend is because intact forests available for cultivation decreased over time and previously cultivated land is easier to clear for future cultivation.
+ Based on the sample interpretation results, most cultivation lengths are either one year or two years.
+ Although there are variations across the years, we have not seen major changes in average cultivation length (figures S1 and S2).
+ The mean length of the fallow periods of shifting cultivation in Laos is 6.5 years, which is close to the length of fallow periods reported in the literature (7 years) (Department of Forestry, Ministry of Agriculture and Forestry, Lao PDR 2018).
+ The fallow length has been continuously declining (figure 6).
+ The reduction in the length of fallow periods indicates that shifting cultivation has intensified.
+ The AGBD was lower in shifting cultivation regions than in the intact forests.
+ The median AGBD of Intact Forest, Inactive Shifting Cultivation, Active Shifting Cultivation, and Others are 151.9 Mg ha$^{-1}$, 87.9 Mg ha$^{-1}$, 39.5 Mg ha$^{-1}$, and 22.8 Mg ha$^{-1}$, respectively.
+ The biomass of Inactive Shifting Cultivation only reached about 60% of that of the intact forest.
+ In the literature (Department of Forestry, Ministry of Agriculture and Forestry, Lao PDR 2018), the regions of Inactive Shifting Cultivation were considered to be 'recovered', whereas our results show that the AGBD is not recovered even if these regions have been left for fallow for at least seven years.
+ To investigate the relationship between median AGBD and disturbance history, a logarithmic regression was conducted on years of regrowth since the latest slash-and-burn events and median AGBD of GEDI footprints (figure 7).
+ The logarithmic model of years of regrowth ($x$) and AGBD ($y$) is ($R^2 = 0.93$): $$y = 29.129 \ln(x) + 9.907 \qquad (1)$$
+ AGBD was strongly correlated with years of regrowth.
+ Equation (1) and the maps of years of regrowth were used to calculate the biomass of fallow lands and spatially explicit emission/removal factors (figure 8). in this period (figure 5) and the decrease in carbon sink of fallow lands in this period.
+ For every period, New Shifting Cultivation is the largest carbon source, contributing to more than 80% of the total emissions.
+ From 2001 to 2020, New Shifting Cultivation contributed to 89% of the total emissions.
+ Fallow lands are important carbon sinks and sequestered about 70% of the total emissions during 2006-2015.
+ However, carbon sequestration of fallow lands also decreased in recent years because of the intensified use of fallow land.
+ To summarize, the increase in emissions from shifting cultivation encroachment to intact forests (New Shifting Cultivation) and intensified use of secondary forests both led to the recent increase in net emissions from shifting cultivation.
+ In this study, the spatial-temporal patterns and the carbon dynamics of shifting cultivation in Laos were analyzed.
+ The results showed that shifting cultivation has been expanding and intensifying.
+ The area of shifting cultivation has increased significantly over the last 5 years.
+ The fallow length has been declining continuously, which indicates the intensification of shifting cultivation.
+ Our finding of a reduction of fallow length is consistent with previous local studies (Rasul and Thapa 2003
+ , Saphangthong and Kono 2009
+ , van Vliet et al 2012).
+ We found that AGBD was strongly correlated with years of regrowth since the latest year of slash-and-burn activities, which can be 2019) is understandable since their major focus is forest loss instead of shifting cultivation.
+ This comparison is not a criticism of the aforementioned studies.
+ Instead, it highlights the benefits of using shifting cultivation maps and reference samples with better spatial resolution and high temporal frequency for the analysis of spatial-temporal patterns.
+ We compared our area estimates of New Shifting Cultivation with the official forest change statistics from Laos (table S1).
+ The Laos official forest change maps (https://nfms.maf.gov.la/) are created from the land cover classification maps from the start year and end year for each period (see the periods in table S1).
+ Since shifting cultivation is the major driver of forest degradation and deforestation in Laos, we expect that there are some consistencies between the areas of New Shifting cultivation and the areas of forest degradation and deforestation.
+ There are consistencies in the period 2006-2010 and 2011-2015, with the differences between our estimates and the official statistics both less than 1% of Laos.
+ Our estimates of New Shifting Cultivation are generally higher than the Laos official estimates of deforestation and forest degradation, except for 2006-2010.
+ This was partly due to the different monitoring approaches.
+ Without using dense time series, the shifting cultivation events that occurred over five years may be difficult to detect using two classification maps from the start and the end.
+ In the period 2001-2005 and 2016-2020, our estimates are about 2%-3% higher than the official estimates.
+ For 2016-2020, the discrepancy is partly because the 2019 and 2020 changes are included in our estimates but not in the official statistics.
+ Overall, our results and area estimates provide valuable information regarding the forest dynamics of Laos.
+ Furthermore, we compared the shifting cultivation map with the field survey data in the Laos National Forest Monitoring System (https://nfms.maf.gov.la/).
+ The shifting cultivation map was compared with 39 field points identified as 'Regenerating Vegetation' or 'Upland crop' in 2010, 2011, 2012, or 2019, since these two land cover classes are generally considered to have an association with shifting cultivation practices (Department of Forestry 2020).
+ Of the 39 points, 31 (80%) are correctly mapped as shifting cultivation.
+ As a national-level analysis of spatial-temporal patterns and estimation of carbon dynamics of shifting cultivation in Laos, our research is valuable to sustainable land resource management.
+ The sustainability of the land is negatively impacted by the recent expansion and intensification of shifting cultivation, indicated by an increase in newly cultivated areas in 2016-2020 and a reduction of fallow length in 1991-2020.
+ Moreover, our research provides a quantitative analysis of carbon emissions of shifting cultivation, which is crucial for REDD+ reporting in Laos.
+ Our research indicates that carbon emissions from shifting cultivation can be quantified by combining GEDI data with shifting cultivation maps and area estimates.
+ The fallow land sequestered a significant amount of carbon in the past, but this carbon sink declined in recent years.
+ The recent increase in new shifting cultivation events also led to an increase in net carbon emissions.
+ This highlights the importance of protecting the primary forest from the encroachment of new shifting cultivation and the restoration of old fallow lands.
+ Our study has several limitations and future research can make improvements by using more sophisticated models and integration with other data.
+ The first limitation is the usage of GEDI data.
+ Our research only used GEDI in one year (2020), because GEDI is a new mission and 2020 was the only year with good coverage data when the study was conducted.
+ Future studies can use GEDI for multiple years as more data will be collected.
+ In addition, we excluded GEDI points where the slope is larger than 20° to avoid overestimation of biomass in steep terrain.
+ This would introduce regional bias on the growth curve and emission factors.
+ Based on our map, 69% of the shifting cultivation area is in places with slopes less than 20° (Chen 2022).
+ Future research should improve GEDI biomass estimates in steep terrain.
+ Second, although we compared our map with some field survey data in Laos, the field data information for each location is limited.
+ Future studies should collect more detailed information on shifting cultivation in field surveys, especially biomass in shifting cultivation landscapes (e.g. Salinas-Melgoza et al 2017, Borah et al 2018, Gogoi et al 2020).
+ Third, the carbon estimation only considered aboveground biomass change and no other carbon pools due to a lack of field survey data on those carbon pools.
+ Future research can conduct field surveys on belowground biomass and include the belowground carbon pools in carbon emission estimation.
+ Fourth, future research should investigate the causes of the recent increase in shifting cultivation, which requires field surveys.
+ Our research provides a national-level analysis of spatial-temporal patterns and estimation of carbon dynamics of shifting cultivation in Laos.
+ Our analysis shows that shifting cultivation in Laos has been expanding and intensifying, particularly in the recent 5 years.
+ The practice of shifting cultivation has become increasingly intensive as the length of the fallow periods has been continuously shortening.
+ Combining GEDI data with shifting cultivation maps and area estimates, carbon emissions from shifting cultivation can be quantified.
+ The net carbon emissions from shifting cultivation declined in the past but increased recently.
+ This study not only supports REDD+ reporting for Laos but also demonstrates a method of tracking carbon dynamics in shifting cultivation landscapes.
+ USGS Landsat Science Team Program for Better Use of the Landsat Temporal Domain: Monitoring Land Cover Type, Condition and Change (Grant Number: G12PC00070).
+ The authors are grateful to the editors and two anonymous reviewers for their insightful and constructive comments, which greatly helped to improve this paper.
+ Carbon emissions from shifting cultivation were estimated by period from 2001 to 2020 (tables 2-4, and figure 9).
+ The net carbon emissions of shifting cultivation declined during 2001-2015, but significantly increased during 2016-2020.
+ The decline in net carbon emissions during 2001-2015 is mostly because the area of new shifting cultivation decreased in this period.
+ The increase during 2016-2020 is mostly due to the increase in new shifting cultivation activities
+ Activity classes, definitions, and carbon emission/removal factors for each 5 year period (CF: conversion factor to convert biomass to carbon equivalents, CF = 0.5).
+
+
+ Zero and other activity classes in table 1 were estimated from the maps. This is because the sampling-based area estimates of New Shifting Cultivation adjusted for errors in mapping and are more accurate than pixel-counting from the maps (Olofsson et al 2013, 2014).
+ The area estimates of New Shifting Cultivation were calculated by 5-year periods with low uncertainty.
+ For other activity classes, it is difficult to get area estimates from the reference data while including the dynamics of biomass of fallow land, and thus we used a spatially explicit method.
+ In table 1, the biomass of the forest before disturbance was the biomass of Intact Forest estimated from GEDI.
+ The biomass of fallow land was estimated from the growth curve developed from GEDI based on years since disturbance.
+ Years since disturbance for each pixel was obtained from the annual maps of shifting cultivation.
+ The cleared land biomass was estimated as the biomass of non-forest by the Department of Forestry (2020) based on field surveys.
+ The emission factor of New Shifting Cultivation is 75.95 Mg C ha$^{-1}$.
+ The emission factor of Cleared land -> Cleared land is zero.
+ Area of different land use activities for each period (5 years).
+
+
+ The country-average emissions or removal factors for each period (5 years).
+ The original emission or removal factors except for new shifting cultivation are spatially explicit.
+ This table shows the country averages of the spatially explicit emission or removal factors.
+
+
+ Carbon emissions (+) and removals (-) of different activities for each period (5 years).
+
+
+ Saphangthong T and Kono Y 2009 Continuity and discontinuity in land use changes: a case study in Northern Lao villages J. Southeast Asian Stud.
+ 47 263-86 Silva J M N, Carreiras J M B, Rosa I and Pereira J M C 2011 Greenhouse gas emissions from shifting cultivation in the tropics, including uncertainty and sensitivity analysis J. Geophys.
+ Res.
+ Atmos.
+ 116 1-21 Souza C M, Roberts D A and Cochrane M A 2005 Combining spectral and spatial information to map canopy damage from selective logging and forest fires Remote Sens. Environ.
+ 98 329-43 Tang X, Hutyra L R, Arévalo P, Baccini A, Woodcock C E and Olofsson P 2020 Spatiotemporal tracking of carbon emissions and uptake using time series analysis of Landsat data: a spatially explicit carbon bookkeeping model Sci.
+ Total Environ.
+ 720 137409 The Government of Lao PDR 2005 Forestry Strategy to the year 2020 of the Lao PDR (available at: https://faolex.fao.org/ docs/pdf/lao144178.pdf)
+ van Vliet N et al 2012 Trends, drivers and impacts of changes in swidden cultivation in tropical forest-agriculture frontiers: a global assessment Glob.
+ Environ.
+ Change 22 418-29 World Bank 2023 World Bank open data (available at: https:// data.worldbank.org/) Zhu Z and Woodcock C E 2014 Continuous change detection and classification of land cover using all available Landsat data Remote Sens. Environ.
+ 144 152-71 Ziegler A D et al 2012 Carbon outcomes of major land-cover transitions in SE Asia: great uncertainties and REDD+ policy implications Glob.
+ Change Biol.
+ 18 3087-99
+
+
+
+
Google Earth Engine applications to visualize the datasets: https://github.com/shijuanchen/shift_cult Map products visualization: https://sites.google.com/view/shijuanchen/research/shift_cult
+
+ The data that support the findings of this study are openly available at the following URL/DOI: https://doi.org/10.5281/zenodo.7782782.
+
This research was funded by the
Google Earth Engine applications to visualize the datasets: https://github.com/shijuanchen/shift_cult Map products visualization: https://sites.google.com/view/shijuanchen/research/shift_cult
The data that support the findings of this study are openly available at the following URL/DOI: https://doi.org/10.5281/zenodo.7782782.
We propose a semi-automatic staging area for efficiently building an accurate database of experimental physical properties of superconductors from literature, called SuperCon 2, to enrich the existing manually-built superconductor database SuperCon. Here we report our curation interface (SuperCon 2 Interface) and a workflow managing the state transitions of each examined record, to validate the dataset of superconductors from PDF documents collected using Grobid-superconductors in a previous work. This curation workflow allows both automatic and manual operations, the former contains 'anomaly detection' that scans new data identifying outliers, and a 'training data collector' mechanism that collects training data examples based on manual corrections. Such training data collection policy is effective in improving the machine-learning models with a reduced number of examples. For manual operations, the interface (SuperCon 2 interface) is developed to increase efficiency during manual correction by providing a smart interface and an enhanced PDF document viewer. We show that our interface significantly improves the curation quality by boosting precision and recall as compared with the traditional 'manual correction'. Our semi-automatic approach would provide a solution for achieving a reliable database with text-data mining of scientific documents.
This work makes a contribution to the realms of materials informatics and superconductors research, achieved through the evolution and update of SuperCon. We provide results from experiments that support the utilisation of computational analysis and machine learning for collecting experimental data from scientific articles.
The emergence of new methodologies using machine learning for materials exploration has given rise to a growing research area called materials informatics (MI) [1,2]. This field leverages the knowledge of the materials data accumulated in the past to efficiently screen candidates of the materials with desired properties. As a matter of course, such an approach requires a larger amount of material-related data for training models. Researchers have been developing large aggregated databases of physical properties generated by first-principles calculations based on Density Functional Theory (DFT), such as Materials Project [3], JARVIS (Joint Automated Repository for Various Integrated Simulations) [4], NOMAD (Novel Materials Discovery) [5], that played a role of a strong driving force for the development of materials informatics. Using DFT data for machine learning (ML) in materials science has become popular since, in principle, it allows researchers to simulate and obtain various types of physical properties of the target materials only by knowing the crystal structures of the subjects. Those DFT codes are designed to reproduce/simulate the physical properties that should be observed by experiments in reality. Nonetheless, caution must be exercised while utilising these computed figures for constructing ML models aimed at steering experiments. This caution arises due to the potential lack of validity in their predictions when dealing with specific simplifications of the interactions between atoms and electrons in solids, such as electron-electron Coulomb correlation, spin-orbit coupling, and similar factors.
On the contrary, accumulated datasets of experimental data from scientific publications are still scarce, despite abundant publication availability, and exponential growth in materials science [6]. Currently, only a few limited resources exist, such as the Pauling File [7] and SuperCon [8], necessitating reliance on manual extraction methods. This scarcity can be attributed to inadequate infrastructure and a shortage of expertise in computer science within the materials science field.
The SuperCon database was built manually from 1987 [8] by the National Institute for Materials Science (NIMS) in Japan and it is considered a reliable source of experimental data on superconductors [9][10][11][12]. However, the updates of SuperCon have become increasingly challenging due to the high publication rate. In response to the need for a more efficient approach to sustain productivity, we embarked on the development of an automated system for extracting material and property information from the text contained in relevant scientific publications. This automated process enabled the rapid creation of 'SuperCon 2 Database', a comprehensive database of superconductors containing around 40,000 entries, within an operational duration of just a few days [1]. Matching the level of quality seen in SuperCon while simultaneously automating the extraction of organised data can be achieved with a properly designed curation process. We use the term curation to describe the overall process of reviewing and validating database records, while correction refers to the specific action of altering the values of one or more properties within an individual record. At the moment of writing this article, we are not aware of any other curation tool focusing on structured databases of extracted information. There are several tools for data annotation, such as Inception [13] and Doccano [14], which concentrate on text labelling and classification.
In this work, we designed and developed a workflow with a user interface, 'SuperCon 2 Interface', crafted to produce structured data of superior quality and efficiency to the one obtained by the 'traditional' manual approach consisting of reading documents and noting records, usually on an Excel file. We developed this framework around the specific use case of SuperCon; however, our goal is for it to be adaptable to alternative data frameworks.
Our contributions can be summarised as follows:
• We developed a workflow and a user interface that allow the curation of a machine-collected database. We demonstrate that using it for data correction resulted in higher quality than the 'traditional' (manual) approach. In the subsequent sections, Section 2 describes the curation workflow and Section 3 the user interface on top of it. Finally, we discuss our evaluation experiments and results in Section 4.
The curation of the SuperCon 2 Database acts as a workflow where user actions result in database records state transitions (Figure 1).Allowed manual actions include a) mark as valid (validation) when a record is considered correct or corrected by someone else.When a record is not valid, users can: b) mark as invalid when considered 'potentially' invalid (or the curator is not confident), c) perform manual correction to update it according to the information from the original PDF document, and d) remove the record when it was not supposed to be extracted.
Besides manual operations from users, this workflow also supports automatic actions: 'anomaly detection' for pre-screening records (Section 2.2) and the 'training data collector' for accumulating training data for improving ML models (Section 2.3).
Although only the most recent version of a record can be viewed on this system, the correction history is recorded (Section 3.3).
The workflow state is determined by the 'curation status' (Section 2.1.1), the user action, and the error type (Section 2.1.2).
The curation status (Figure 1) is defined by type of action, manual or automatic, and status, which can assume the following values:
• new: default status when a new record is created.
• curated: the record has been amended manually.
• validated: the record was manually marked as valid.
• invalid: the record is wrong or inappropriate for the situation (e.g.T m or T curie extracted as superconducting critical temperature).
• obsolete: the record has been updated and the updated values are stored in a new record (internal status 1 ).• removed: the record has been removed by a curator (internal status).
We first introduced error type in [1] and extended their scope in this work to consider data curation and anomaly detection.Users are required to select one Error Type at every record update or removal.This information is stored in the 'original' record and can be different at every record modification.The error type values can be summarised as follows: • Composition resolution: The exact composition cannot be resolved (e.g. the stoichiometric values cannot be resolved).
• Value resolution: The extracted formula contains variables that cannot be resolved, even after having read the paper.This includes when data is from tables • Anomaly detection: The data has been modified by anomaly detection, which facilitates their retrieval from the interface.• Curation amends: The curator is updating the data which does not present issues due to the automatic system.
Anomaly detection is the process of identifying unusual events or patterns in data.In our context, this means identifying data that are greatly different from the expected values.This post-process was introduced in a limited scope to draw attention to certain cases during the curation.
The anomaly detection uses a rule-based approach and marks any record that matches the following conditions
• the extracted T c is greater than room temperature (273 K), negative, or contains invalid characters and cannot be parsed (e.g.'41]') • the chemical formula cannot be processed by an ensemble composition parser that combines Pymatgen [15], and text2chem [16] • the extracted applied pressure cannot be parsed or falls outside the range 0-250 GPa.
Records identified as anomalies have status 'invalid' and error type 'anomaly detection' for easy identification.Since this process may find false positives, its output requires validation from curators.For example, in certain contexts, T c values above room temperature or applied pressure up to 500 GPa may be valid in researchers' hypotheses, calculations, or simulated predictions.
We ran the anomaly detection on the full SuperCon 2 Database (40324 records [1]).The anomaly detection identified 1506 records with invalid T c , 5021 records with an incomplete chemical formula, 304 records with invalid applied pressure, and 1440 materials linked to multiple T c values.Further analysis and cross-references with contrasting information may be added in future.
The curation process is a valuable endeavour demanding significant knowledge and human effort. To maximise the use of this time for collecting as much information as possible, we integrated an automatic procedure in the curation process that, for every correction, accumulates the related data examples that can be used to improve the underlying ML models.
In the event of a correction (update, removal) in a database record, this process retrieves the corresponding raw data: the text passage, the recognised entities (spans), and the layout tokens information. This information is sufficient to be exported as training examples, which can be examined and corrected, and fed back to the ML model.
We designed a specific page of the interface (Section 3) to manage the collected data (Figure 2) in which each row corresponds to a training example composed by the decorated text showing the identified entities, the document identifier, and the status.The users can examine the data, delete it, send it to the annotation tool to be corrected, and then export them.We integrated our interface with Labelstudio [17] for the correction of the collected training examples.Label-studio is an open-source, python-based, and modern interface supporting many different TDM tasks (NER, topic modelling, image recognition, etc.).
The workflow is operated through the user interface, which offers several key features to facilitate the data curation process (Figure 1).It provides a comprehensive view of materials and their related properties as a table which includes search, filtering, and sorting functionality (Figure 3).The detailed schema, including examples, is reported in our previous work [1].
During the curation process, it is often necessary to switch back and forth between the database record and the related context in the paper (the related paragraph or sentence).Our interface provides a viewer for individual documents, which visualises in the same window a table with the extracted records and the original PDF document decorated with annotations that identify the extracted materials and properties (Figure 4).
In this section, we discuss our strategy concerning manual curation, which is still indispensable for developing high-quality structures.
We selected curators from domain experts in the field, to certify sufficient data quality.Nevertheless, as confirmed from our experiment in Section 4.3, the experience of each individual may have an impact on the final result.We followed two principles to guarantee robustness in the curation process.First, we built solid curation documentation as a form of example-driven guidelines with an iterative approach we first introduced in [18].Then, we used a double-round validation approach, in which the data was initially corrected by one person, and validated in a second round, by a different individual.
The guidelines consist mainly of two parts: the general principles and the correction rules with examples of solutions.The guidelines are designed to provide general information applied to corrections and very basic explanations containing illustrations for a faster understanding (e.g. the meaning of the colours of the annotations).
Differently from our previous work [18], these guidelines are divided into examples for different scenarios based on the error types mentioned in Section 2.1.2.Each example described the initial record, its context, the expected corrected record and a brief explanation, as illustrated in Figure 5.
The Supercon 2 interface gives access to information regarding the ingestion (processing log) and the .Each row contains one potential training data example.Each example is composed of a sentence and its extracted entities (highlighted in colour) with potential annotation mistakes that need to be corrected using an external tool: we used label-studio [17].The column 'status' indicate whether the example has been sent or not to the external tool.curation process (curation log).The processing log is filled up when the new data is ingested, it was built to have minimal functions able to explain why certain documents haven't been processed (Figure 6 top).For example, sometimes documents fail because they don't contain any text (image PDF documents) or they are too big (more than 100 pages).
The curation log provides a view of what, when and how a record has been corrected (Figure 6 bottom).
In this section, we illustrate the experiments we have run to evaluate our work.The evaluation is composed of three sets of results.The anomaly detection rejection rate (Section 4.1) indicates how many anomalies were rejected by curators after validation.Then, we demonstrate that the training data automatically selected contributed to improving the ML model with a small set of examples (Section 4.2) Finally, we evaluated the quality of the data extraction using the interface (and the semi-automatic TDM process) against the classical method of reading the PDF articles and noting the experimental information in an Excel file.In Section 4.3 we find out that using the interface improves the quality of the curated data by reducing missing experimental data.
We evaluated the anomaly detection by observing the 'rejection rate' which consists of the number of detected anomalies that were rejected by human validation.Running the anomaly detection on a database subset with 667 records, it found 17 anomalies in T c , 1 anomaly in applied pressure, and 16 anomalies in the chemical formulas.Curators examined each reported record and rejected 4 (23%) anomalies in T c , 6 anomalies (37%) in chemical formulas and 0 anomalies in applied pressure.This indicates an appropriate low rate of false positives although a study with a larger dataset might be necessary.
We selected around 400 records in the Supercon 2 Database that were marked as invalid by the anomaly detection process and we corrected them following the curation guidelines (Section 3.2).Then, we examined the corresponding training data corrected by the interface (Section 2.3) and obtained a set of 352 training data examples for our ML models.We call the obtained dataset curation to be distinguished from the original SuperMat dataset which is referred to as base.
We prepared our experiment using SciBERT [19] that we fine-tuned for our downstream task as in [1].We trained five models that we evaluated using a fixed holdout dataset from SuperMat averaging the results to smooth out the fluctuations.We use the DeLFT (Deep Learning For Text) [20] library for training, evaluating, and managing the models for prediction.A model can be trained with two different strategies:
(1) 'from scratch': when the model is initialised randomly.We denote this strategy with an (s).(2) 'incremental': when the initial model weights are taken from an already existing model.We denote this strategy with an (i).
The latter can be seen as a way to 'continue' the training from a specific checkpoint.We thus define three different training protocols: We merge 'curation' with the base dataset because the curation dataset is very small compared to 'base', and we want to avoid catastrophic forgetting [21] or overfitting.The trained models are then tested using a fixed holdout dataset that we designed in our previous work [1] and the evaluation scores are shown in Table 1.
This experiment demonstrates that with only 352 examples (2% of the SuperMat dataset) comprising 1846 additional entities (11% of the entities from the SuperMat dataset) (Table 2), we obtain an improvement of F1-score from 76.67% 2 to values between Table 1.F1-score from the evaluation of the fine-tuned SciBERT models.The training is performed with three different approaches.The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected.s indicate 'training from scratch', while i indicate 'incremental training'.The evaluation is performed using the same holdout dataset from SuperMat [18].The results are averaged over five runs or train and evaluation.77.44% (+0.77) and 77.48% (+0.81) for (base+curation)(s) and base(s)+(base+curation)(i), respectively.This experiment gives interesting insight relative to the positive impact on the way we select the training data.However, there are some limitations: the curation dataset is small compared to the base dataset.This issue could be verified by correcting all the available training data, repeating this experiment, and studying the interpolation between the size of the two datasets and the obtained evaluation scores.A second limitation is that the hyperparameters we chose for our model, in particular, the learning rate and batch size could be still better tuned to obtain better results with the second and third training protocols.
We conducted an experiment to evaluate the effectiveness and accuracy of data curation using two methods: a) the user interface (interface), and b) the 'traditional' manual approach consisting of reading PDF documents and populating an Excel file (PDF documents).
We selected a dataset of 15 papers, which we assigned to three curators - a senior researcher (SD), a PhD student (PS), and a master's student (MS). Each curator received 10 papers: half to be corrected with the interface and half with the PDF Document method. Overall, each pair of curators had five papers in common which they had to process using opposite methods. For instance, if curator A receives paper 1 to be corrected with the interface, curator B, who receives the same paper 1, will correct it with the PDF document method. After curation, a fourth individual manually reviewed the curated content. The raw data is available in Tables A1 and A2.
We evaluated the curation considering a double perspective: time and correctness. Time was calculated as the accumulated minutes required using each method. Correctness was assessed using standard measures such as precision, recall, and the F1-score. Precision measures the accuracy of the extracted information, while recall assesses the ability to capture all expected information. The F1-score is the harmonic mean of precision and recall.
Overall, both methods required the same accumulated time: 185 minutes using the interface and 184 minutes using the PDF Document method.When the experiment was carried out, not all the curators were familiar with the interface method.Although they had access to the user documentation, they had to get acquainted with the user interface, thus the accumulated 185 minutes included such activities.
We examined the quality of the extracted data and we observed an improvement of + 5.55% in precision and a substantial + 46.69% in recall when using the interface as compared with the PDF Document method (Table 3).The F1-score improved by 39.35%.
The disparity in experience significantly influenced the accuracy of curation, particularly in terms of high-level skills. Senior researchers consistently achieved an average F1-Score approximately 13% higher than other curators (see Table 4). Furthermore, we observed a modest improvement between master's students and PhD students. These findings also indicate that for large-scale projects, employing master's students instead of PhD students may be a more cost-effective choice, using only a few senior researchers for the second round of validation (Section 3.1).
Finally, the collected data suggest that all three curators had overall more correct results by using the interface as illustrated in Table 5.
The results of this experiment confirmed that our curation interface and workflow significantly improved the quality of the extracted data, with an astonishing improvement in recall, thus preventing curators from overlooking important information.
This work is available at https://github.com/lfoppiano/supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table 2. Data support, the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected.
We built a semi-automatic staging area, called SuperCon 2 , to validate efficiently new experimental records automatically collected from superconductor research articles (SuperCon 2 Database [1]) before they are ingested into the existing, manually-built database of superconductors, SuperCon [8]. The system provides a curation workflow and a user interface (SuperCon 2 Interface) tailored to efficiently support domain experts in data correction and validation with fast context switching and an enhanced PDF viewer. Under the hood, the workflow ran 'anomaly detection' to automatically identify outliers and a 'training data collector' based on human corrections, to efficiently accumulate training data to be fed back to the ML model. Compared with the traditional manual approach of reading PDF documents and extracting information in an Excel file, SuperCon 2 significantly improves the curation quality by approximately 6% and + 47% for precision and recall, respectively. In future, this work can be expanded to support other materials science domains such as magnetic materials, spintronics and thermoelectric research, and the evaluation can be expanded to a larger dataset [22].
1. 'internal status' indicates that their records should be hidden in the interface. 2. In our previous work [1] we reported 77.03% F1-score. There is a slight decrease in absolute scores between DeLFT 0.2.8 and DeLFT 0.3.0. One cause may be the use of different hyperparameters in version 0.3.0 such as batch size and learning rate. However, the most probable cause could be the impact of using the Huggingface tokenizers library which is suffering from quality issues (https://github.com/kermitt2/delft/issues/150).
The material is incorrectly linked to the T c given that the entities are correctly recognised.
• Linking: • T as 'superconductors critical temperature' (e.g.Curie temperature, Magnetic temperature. ..).
Evaluation
scores (P: precision, R: recall, F1: F1-score) aggregated by experience (MS: master student, PD: PhD student, SR: senior researcher).Each person corrected 10 documents.
Evaluation scores (P: precision, R: recall, F1: F1-score) between the curation using the SuperCon 2 interface (Interface) and the traditional method of reading the PDF document (PDF document.).
Evaluation scores (P: precision, R: recall, F1: F1-score) listed by experience (MS: master student, PD: PhD student, SR: senior researcher), and method (PDF document, interface).
Evaluation scores obtained for each document and method (I: interface, P: PDF) combination.TP: true positive, FP: false positive, FN: false negative.P: precision, R: recall, F1: F1-score.
Sci. Technol.Adv.Mater.Meth. 3 (2023) 2 L. FOPPIANO et al.
Sci. Technol.Adv.Mater.Meth. 3 (2023) 3 L. FOPPIANO et al.
Sci. Technol.Adv.Mater.Meth. 3 (2023) 5 L. FOPPIANO et al.
Sci. Technol.Adv.Mater.Meth. 3 (2023) 6 L. FOPPIANO et al.
Sci. Technol.Adv.Mater.Meth. 3 (2023) 9 L. FOPPIANO et al.
Sci. Technol.Adv.Mater.Meth. 3 (2023) 10 L. FOPPIANO et al.
Sci. Technol.Adv.Mater.Meth. 3 (2023) 12 L. FOPPIANO et al.
Our warmest thanks to
Materials Modelling Group, Data-driven
This work was partly supported by
No potential conflict of interest was reported by the author(s).
LF wrote the manuscript and KT helped with the editing.LF and POS discussed the ML results and experiments.LF implemented the workflow as a standalone service, and TM wrote the front end of the user interface.LF designed the user interface experiment with KT, TT and WS as curators.KT led the materials-science work on the data with CS, TT and WS.KT, TA, YT and MI revised the paper.YT and MI supervised the work of the respective teams.
Table A1.Timetable recording the time spent for each of the 15 articles.Each row indicates the time and the event (Start, Finish) from each of the curators: master student (MD), PhD student (PD), and senior researcher (SR).Duration is expressed in minutes.
We propose a semi-automatic staging area for efficiently building an accurate database of experimental physical properties of superconductors from literature, called SuperCon 2 , to enrich the existing manually-built superconductor database SuperCon. Here we report our curation interface (SuperCon 2 Interface) and a workflow managing the state transitions of each examined record, to validate the dataset of superconductors from PDF documents collected using Grobid-superconductors in a previous work. This curation workflow allows both automatic and manual operations, the former contains 'anomaly detection' that scans new data identifying outliers, and a 'training data collector' mechanism that collects training data examples based on manual corrections. Such training data collection policy is effective in improving the machine-learning models with a reduced number of examples. For manual operations, the interface (SuperCon 2 interface) is developed to increase efficiency during manual correction by providing a smart interface and an enhanced PDF document viewer. We show that our interface significantly improves the curation quality by boosting precision and recall as compared with the traditional 'manual correction'. Our semi-automatic approach would provide a solution for achieving a reliable database with text-data mining of scientific documents.
This work makes a contribution to the realms of materials informatics and superconductors research, achieved through the evolution and update of SuperCon. We provide results from experiments that support the utilisation of computational analysis and machine learning for collecting experimental data from scientific articles.
The emergence of new methodologies using machine learning for materials exploration has given rise to a growing research area called materials informatics (MI) [1,2]. This field leverages the knowledge of the materials data accumulated in the past to efficiently screen candidates of the materials with desired properties. As a matter of course, such an approach requires a larger amount of material-related data for training models. Researchers have been developing large aggregated databases of physical properties generated by first-principles calculations based on Density Functional Theory (DFT), such as Materials Project [3], JARVIS (Joint Automated Repository for Various Integrated Simulations) [4], NOMAD (Novel Materials Discovery) [5], that have played the role of a strong driving force for the development of materials informatics. Using DFT data for machine learning (ML) in materials science has become popular since, in principle, it allows researchers to simulate and obtain various types of physical properties of the target materials only by knowing the crystal structures of the subjects. Those DFT codes are designed to reproduce/simulate the physical properties that should be observed by experiments in reality. Nonetheless, caution must be exercised while utilising these computed figures for constructing ML models aimed at steering experiments. This caution arises due to the potential lack of validity in their predictions when dealing with specific simplifications of the interactions between atoms and electrons in solids, such as electron-electron Coulomb correlation, spin-orbit coupling, and similar factors.
On the contrary, accumulated datasets of experimental data from scientific publications are still scarce, despite abundant publication availability, and exponential growth in materials science [6]. Currently, only a few limited resources exist, such as the Pauling File [7] and SuperCon [8], necessitating reliance on manual extraction methods. This scarcity can be attributed to inadequate infrastructure and a shortage of expertise in computer science within the materials science field.
The SuperCon database was built manually from 1987 [8] by the National Institute for Materials Science (NIMS) in Japan and it is considered a reliable source of experimental data on superconductors [9][10][11][12]. However, the updates of SuperCon have become increasingly challenging due to the high publication rate. In response to the need for a more efficient approach to sustain productivity, we embarked on the development of an automated system for extracting material and property information from the text contained in relevant scientific publications. This automated process enabled the rapid creation of 'SuperCon 2 Database', a comprehensive database of superconductors containing around 40,000 entries, within an operational duration of just a few days [1]. Matching the level of quality seen in SuperCon while simultaneously automating the extraction of organised data can be achieved with a properly designed curation process. We use the term curation to describe the overall process of reviewing and validating database records, while correction refers to the specific action of altering the values of one or more properties within an individual record. At the moment of writing this article, we are not aware of any other curation tool focusing on structured databases of extracted information. There are several tools for data annotation, such as Inception [13], and Doccano [14] which concentrate on text labelling and classification.
In this work, we designed and developed a workflow with a user interface, 'SuperCon 2 Interface', crafted to produce structured data with higher quality and efficiency than that obtained by the 'traditional' manual approach consisting of reading documents and noting records, usually in an Excel file. We developed this framework around the specific use case of SuperCon; however, our goal is for it to be adaptable to alternative data frameworks.
Our contributions can be summarised as follows:
• We developed a workflow and a user interface that allow the curation of a machine-collected database. We demonstrate that using it for data correction resulted in higher quality than the 'traditional' (manual) approach. In the subsequent sections, Section 2 describes the curation workflow and Section 3 the user interface built on top of it. Finally, we discuss our evaluation experiments and results in Section 4.
The curation of the SuperCon 2 Database acts as a workflow where user actions result in database records state transitions (Figure 1). Allowed manual actions include a) mark as valid (validation) when a record is considered correct or corrected by someone else. When a record is not valid, users can: b) mark as invalid when considered 'potentially' invalid (or the curator is not confident), c) perform manual correction to update it according to the information from the original PDF document, and d) remove the record when it was not supposed to be extracted.
Besides manual operations from users, this workflow supports also automatic actions: 'anomaly detection' for pre-screening records (Section 2.2) and the 'training data collector' for accumulating training data for improving ML models (Section 2.3).
Although only the most recent version of a record can be viewed on this system, the correction history is recorded (Section 3.3).
The workflow state is determined by the 'curation status' (Section 2.1.1), the user action, and the error type (Section 2.1.2).
The curation status (Figure 1) is defined by type of action, manual or automatic, and status, which can assume the following values:
• new: default status when a new record is created.
• curated: the record has been amended manually.
• validated: the record was manually marked as valid.
• invalid: the record is wrong or inappropriate for the situation (e.g. T m or T curie extracted as superconducting critical temperature).
• obsolete: the record has been updated and the updated values are stored in a new record (internal status 1 ). • removed: the record has been removed by a curator (internal status).
We first introduced error type in [1] and extended their scope in this work to consider data curation and anomaly detection. Users are required to select one Error Type at every record update or removal. This information is stored in the 'original' record and can be different at every record modification. The error type values can be summarised as follows: • Composition resolution: The exact composition cannot be resolved (e.g. the stoichiometric values cannot be resolved).
• Value resolution: The extracted formula contains variables that cannot be resolved, even after having read the paper. This includes when data is from tables • Anomaly detection: The data has been modified by anomaly detection, which facilitates their retrieval from the interface. • Curation amends: The curator is updating the data which does not present issues due to the automatic system.
Anomaly detection is the process of identifying unusual events or patterns in data. In our context, this means identifying data that are greatly different from the expected values. This post-process was introduced in a limited scope to draw attention to certain cases during the curation.
The anomaly detection uses a rule-based approach and marks any record that matches one of the following conditions:
• the extracted T c is greater than room temperature (273 K), negative, or contains invalid characters and cannot be parsed (e.g. '41]') • the chemical formula cannot be processed by an ensemble composition parser that combines Pymatgen [15], and text2chem [16] • the extracted applied pressure cannot be parsed or falls outside the range 0-250 GPa.
Records identified as anomalies have status 'invalid' and error type 'anomaly detection' for easy identification. Since this process may find false positives, its output requires validation from curators. For example, in certain contexts, T c values above room temperature or applied pressure up to 500 GPa may be valid in researchers' hypotheses, calculations, or simulated predictions.
We ran the anomaly detection on the full SuperCon 2 Database (40324 records [1]). The anomaly detection identified 1506 records with invalid T c , 5021 records with an incomplete chemical formula, 304 records with invalid applied pressure, and 1440 materials linked to multiple T c values. Further analysis and cross-references with contrasting information may be added in future.
The curation process is a valuable endeavour demanding significant knowledge and human effort. To maximise the use of this time for collecting as much information as possible, we integrated an automatic procedure in the curation process that, for every correction, accumulates the related data examples that can be used to improve the underlying ML models.
In the event of a correction (update, removal) in a database record, this process retrieves the corresponding raw data: the text passage, the recognised entities (spans), and the layout tokens information. This information is sufficient to be exported as training examples, which can be examined and corrected, and fed back to the ML model.
We designed a specific page of the interface (Section 3) to manage the collected data (Figure 2) in which each row corresponds to a training example composed of the decorated text showing the identified entities, the document identifier, and the status. The users can examine the data, delete it, send it to the annotation tool to be corrected, and then export it. We integrated our interface with Label-studio [17] for the correction of the collected training examples. Label-studio is an open-source, python-based, and modern interface supporting many different TDM tasks (NER, topic modelling, image recognition, etc.).
The workflow is operated through the user interface, which offers several key features to facilitate the data curation process (Figure 1). It provides a comprehensive view of materials and their related properties as a table which includes search, filtering, and sorting functionality (Figure 3). The detailed schema, including examples, is reported in our previous work [1].
During the curation process, it is often necessary to switch back and forth between the database record and the related context in the paper (the related paragraph or sentence). Our interface provides a viewer for individual documents, which visualises in the same window a table with the extracted records and the original PDF document decorated with annotations that identify the extracted materials and properties (Figure 4).
In this section, we discuss our strategy concerning manual curation, which is still indispensable for developing high-quality structures.
We selected curators from domain experts in the field, to certify sufficient data quality. Nevertheless, as confirmed from our experiment in Section 4.3, the experience of each individual may have an impact on the final result. We followed two principles to guarantee robustness in the curation process. First, we built solid curation documentation as a form of example-driven guidelines with an iterative approach we first introduced in [18]. Then, we used a double-round validation approach, in which the data was initially corrected by one person, and validated in a second round, by a different individual.
The guidelines consist mainly of two parts: the general principles and the correction rules with examples of solutions. The guidelines are designed to provide general information applied to corrections and very basic explanations containing illustrations for a faster understanding (e.g. the meaning of the colours of the annotations).
Differently from our previous work [18], these guidelines are divided into examples for different scenarios based on the error types mentioned in Section 2.1.2. Each example described the initial record, its context, the expected corrected record and a brief explanation, as illustrated in Figure 5.
The Supercon 2 interface gives access to information regarding the ingestion (processing log) and the curation process (curation log). The processing log is filled up when the new data is ingested; it was built to have minimal functions able to explain why certain documents haven't been processed (Figure 6 top). For example, sometimes documents fail because they don't contain any text (image PDF documents) or they are too big (more than 100 pages). Figure caption: Each row contains one potential training data example. Each example is composed of a sentence and its extracted entities (highlighted in colour) with potential annotation mistakes that need to be corrected using an external tool: we used label-studio [17]. The column 'status' indicates whether the example has been sent or not to the external tool.
The curation log provides a view of what, when and how a record has been corrected (Figure 6 bottom).
In this section, we illustrate the experiments we have run to evaluate our work. The evaluation is composed of three sets of results. The anomaly detection rejection rate (Section 4.1) indicates how many anomalies were rejected by curators after validation. Then, we demonstrate that the training data automatically selected contributed to improving the ML model with a small set of examples (Section 4.2). Finally, we evaluated the quality of the data extraction using the interface (and the semi-automatic TDM process) against the classical method of reading the PDF articles and noting the experimental information in an Excel file. In Section 4.3 we find out that using the interface improves the quality of the curated data by reducing missing experimental data.
We evaluated the anomaly detection by observing the 'rejection rate' which consists of the number of detected anomalies that were rejected by human validation. Running the anomaly detection on a database subset with 667 records, it found 17 anomalies in $T_c$, 1 anomaly in applied pressure, and 16 anomalies in the chemical formulas. Curators examined each reported record and rejected 4 anomalies (23%) in $T_c$, 6 anomalies (37%) in chemical formulas and 0 anomalies in applied pressure. This indicates an appropriately low rate of false positives although a study with a larger dataset might be necessary.
We selected around 400 records in the Supercon 2 Database that were marked as invalid by the anomaly detection process and we corrected them following the curation guidelines (Section 3.2). Then, we examined the corresponding training data corrected by the interface (Section 2.3) and obtained a set of 352 training data examples for our ML models. We call the obtained dataset curation to be distinguished from the original SuperMat dataset which is referred to as base.
We prepared our experiment using SciBERT [19] that we fine-tuned for our downstream task as in [1]. We trained five models that we evaluated using a fixed holdout dataset from SuperMat averaging the results to smooth out the fluctuations. We use the DeLFT (Deep Learning For Text) [20] library for training, evaluating, and managing the models for prediction. A model can be trained with two different strategies:
(1) 'from scratch': when the model is initialised randomly. We denote this strategy with an (s). (2) 'incremental': when the initial model weights are taken from an already existing model. We denote this strategy with an (i).
The latter can be seen as a way to 'continue' the training from a specific checkpoint. We thus define three different training protocols: (1) base(s), (2) (base+curation)(s), and (3) base(s)+(base+curation)(i). We merge 'curation' with the base dataset because the curation dataset is very small compared to 'base', and we want to avoid catastrophic forgetting [21] or overfitting. The trained models are then tested using a fixed holdout dataset that we designed in our previous work [1] and the evaluation scores are shown in Table 1.
This experiment demonstrates that with only 352 examples (2% of the SuperMat dataset) comprising 1846 additional entities (11% of the entities from the SuperMat dataset) (Table 2), we obtain an improvement of F1-score from 76.67% (see note 2) to values between 77.44% (+0.77) and 77.48% (+0.81) for (base+curation)(s) and base(s)+(base+curation)(i), respectively. This experiment gives interesting insight relative to the positive impact on the way we select the training data. However, there are some limitations: the curation dataset is small compared to the base dataset. This issue could be verified by correcting all the available training data, repeating this experiment, and studying the interpolation between the size of the two datasets and the obtained evaluation scores. A second limitation is that the hyperparameters we chose for our model, in particular, the learning rate and batch size could be still better tuned to obtain better results with the second and third training protocols. Table 1. F1-score from the evaluation of the fine-tuned SciBERT models. The training is performed with three different approaches. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. s indicates 'training from scratch', while i indicates 'incremental training'. The evaluation is performed using the same holdout dataset from SuperMat [18]. The results are averaged over five runs of training and evaluation.
We conducted an experiment to evaluate the effectiveness and accuracy of data curation using two methods: a) the user interface (interface), and b) the 'traditional' manual approach consisting of reading PDF documents and populating an Excel file (PDF documents).
We selected a dataset of 15 papers, which we assigned to three curators – a senior researcher (SR), a PhD student (PD), and a master's student (MS). Each curator received 10 papers: half to be corrected with the interface and half with the PDF Document method. Overall, each pair of curators had five papers in common which they had to process using opposite methods. For instance, if curator A receives paper 1 to be corrected with the interface, curator B, who receives the same paper 1, will correct it with the PDF document method. After curation, a fourth individual manually reviewed the curated content. The raw data is available in Tables A1 and A2.
We evaluated the curation considering a double perspective: time and correctness. Time was calculated as the accumulated minutes required using each method. Correctness was assessed using standard measures such as precision, recall, and the F1-score. Precision measures the accuracy of the extracted information, while recall assesses the ability to capture all expected information. The F1-score is the harmonic mean of precision and recall.
Overall, both methods required the same accumulated time: 185 minutes using the interface and 184 minutes using the PDF Document method. When the experiment was carried out, not all the curators were familiar with the interface method. Although they had access to the user documentation, they had to get acquainted with the user interface, thus the accumulated 185 minutes included such activities.
We examined the quality of the extracted data and we observed an improvement of + 5.55% in precision and a substantial + 46.69% in recall when using the interface as compared with the PDF Document method (Table 3). The F1-score improved by 39.35%.
The disparity in experience significantly influenced the accuracy of curation, particularly in terms of high-level skills. Senior researchers consistently achieved an average F1-Score approximately 13% higher than other curators (see Table 4). Furthermore, we observed a modest improvement between master's students and PhD students. These findings indicate also that for large-scale projects, employing master's students instead of PhD students may be a more cost-effective choice. Thus, it would suffice to use only a few senior researchers for the second round of validation (Section 3.1).
Finally, the collected data suggest that all three curators had overall more corrected results by using the interface as illustrated in Table 5.
The results of this experiment confirmed that our curation interface and workflow significantly improved the quality of the extracted data, with an astonishing improvement in recall, thus preventing curators from overlooking important information.
This work is available at https://github.com/lfoppiano/supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table 2. Data support, the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected.
We built a semi-automatic staging area, called SuperCon 2 , to validate efficiently new experimental records automatically collected from superconductor research articles (SuperCon 2 Database [1]) before they are ingested into the existing, manually-built database of superconductors, SuperCon [8]. The system provides a curation workflow and a user interface (SuperCon 2 Interface) tailored to efficiently support domain experts in data correction and validation with fast context switching and an enhanced PDF viewer. Under the hood, the workflow ran 'anomaly detection' to automatically identify outliers and a 'training data collector' based on human corrections, to efficiently accumulate training data to be fed back to the ML model. Compared with the traditional manual approach of reading PDF documents and extracting information in an Excel file, SuperCon 2 significantly improves the curation quality by approximately +6% and +47% for precision and recall, respectively. In future, this work can be expanded to support other materials science domains such as magnetic materials, spintronic and thermoelectric research and expanding the evaluation to a larger dataset [22].
1. 'internal status' indicates that their records should be hidden in the interface. 2. In our previous work [1] we reported 77.03% F1-score. There is a slight decrease in absolute scores between DeLFT 0.2.8 and DeLFT 0.3.0. One cause may be the use of different hyperparameters in version 0.3.0 such as batch size and learning rate. However, the most probable cause could be the impact of using the Huggingface tokenizers library which is suffering from quality issues (https://github.com/kermitt2/delft/issues/150).
• Linking: • T as 'superconductors critical temperature' (e.g. Curie temperature, Magnetic temperature. . .).
scores (P: precision, R: recall, F1: F1-score) aggregated by experience (MS: master student, PD: PhD student, SR: senior researcher). Each person corrected 10 documents.
Sci. Technol. Adv. Mater. Meth. 3 (2023) 2 L. FOPPIANO et al.
Sci. Technol. Adv. Mater. Meth. 3 (2023) 3 L. FOPPIANO et al.
Sci. Technol. Adv. Mater. Meth. 3 (2023) 5 L. FOPPIANO et al.
Sci. Technol. Adv. Mater. Meth. 3 (2023) 6 L. FOPPIANO et al.
Sci. Technol. Adv. Mater. Meth. 3 (2023) 9 L. FOPPIANO et al.
Sci. Technol. Adv. Mater. Meth. 3 (2023) 10 L. FOPPIANO et al.
Sci. Technol. Adv. Mater. Meth. 3 (2023) 12 L. FOPPIANO et al.
Our warmest thanks to
Materials Modelling Group, Data-driven
This work was partly supported by
No potential conflict of interest was reported by the author(s).
LF wrote the manuscript and KT helped with the editing. LF and POS discussed the ML results and experiments. LF implemented the workflow as a standalone service, and TM wrote the front end of the user interface. LF designed the user interface experiment with KT, TT and WS as curators. KT led the materials-science work on the data with CS, TT and WS. KT, TA, YT and MI revised the paper. YT and MI supervised the work of the respective teams.
Table A1. Timetable recording the time spent for each of the 15 articles. Each row indicates the time and the event (Start, Finish) from each of the curators: master student (MS), PhD student (PD), and senior researcher (SR). Duration is expressed in minutes.