diff --git a/report_thesis/src/references.bib b/report_thesis/src/references.bib
index 2c9a1300..3cfb5689 100644
--- a/report_thesis/src/references.bib
+++ b/report_thesis/src/references.bib
@@ -113,6 +113,17 @@ @article{yang_laser-induced_2022
 	keywords = {Convolutional neural network, Laser-induced breakdown spectroscopy, {MarSCoDe}, Multi-distance spectra, Sampling distance}
 }
 
+@online{marsnasagov_msl,
+	title = {Mars Science Laboratory {\textbar} Missions},
+	url = {https://mars.nasa.gov/mars-exploration/missions/mars-science-laboratory},
+	abstract = {{NASA}'s real-time portal for Mars exploration, featuring the latest news, images, and discoveries from the Red Planet.},
+	titleaddon = {{NASA} Mars Exploration},
+	author = {mars.nasa.gov},
+	urldate = {2024-03-12},
+	langid = {english},
+	note = {Last~Accessed: 2024-03-12}
+}
+
 @online{marsnasagov_vikings,
 	title = {Viking 1 \& 2 {\textbar} Missions},
 	url = {https://mars.nasa.gov/mars-exploration/missions/viking-1-2},
@@ -120,7 +131,8 @@ @online{marsnasagov_vikings
 	titleaddon = {{NASA} Mars Exploration},
 	author = {mars.nasa.gov},
 	urldate = {2024-01-23},
-	langid = {english}
+	langid = {english},
+	note = {Last~Accessed: 2024-01-23}
 }
 
 @online{marsnasagov_observer,
@@ -130,17 +142,8 @@ @online{marsnasagov_observer
 	titleaddon = {{NASA} Mars Exploration},
 	author = {mars.nasa.gov},
 	urldate = {2024-01-23},
-	langid = {english}
-}
-
-@misc{marsnasagov_chemcam,
-	title = {ChemCam},
-	url = {https://mars.nasa.gov/msl/spacecraft/instruments/chemcam/},
-	journal = {NASA},
-	publisher = {NASA},
-	author = {Lanza, Nina},
-	year = {2022},
-	month = {May}
+	langid = {english},
+	note = {Last~Accessed: 2024-01-23}
 }
 
 @online{marsnasagov_spirit_opportunity,
@@ -151,6 +154,7 @@ @online{marsnasagov_spirit_opportunity
 	author = {mars.nasa.gov},
 	urldate = {2024-01-23},
-	langid = {english}
+	langid = {english},
+	note = {Last~Accessed: 2024-01-23}
 }
 
 @article{jeonEffectsFeatureEngineering2024,
@@ -334,7 +338,8 @@ @online{bankAutoencoders2021
 	url = {http://arxiv.org/abs/2003.05991},
 	urldate = {2024-02-26},
 	abstract = {An autoencoder is a specific type of a neural network, which is mainly designed to encode the input into a compressed and meaningful representation, and then decode it back such that the reconstructed input is similar as possible to the original one. This chapter surveys the different types of autoencoders that are mainly used today. It also describes various applications and use-cases of autoencoders.},
-	pubstate = {preprint}
+	pubstate = {preprint},
+	note = {Last~Accessed: 2024-02-26}
 }
 
 @article{caruana_no_1997,
diff --git a/report_thesis/src/sections/baseline.tex b/report_thesis/src/sections/baseline.tex
index 9c78b065..bc89201e 100644
--- a/report_thesis/src/sections/baseline.tex
+++ b/report_thesis/src/sections/baseline.tex
@@ -1,5 +1,5 @@
 \section{Baseline}
-In \citet{p9_paper}, we presented our efforts to replicate the MOC model by \citet{cleggRecalibrationMarsScience2017}.
+In~\citet{p9_paper}, we presented our efforts to replicate the \gls{moc} model by~\citet{cleggRecalibrationMarsScience2017}.
 This effort was motivated by our desire to better understand the model and its performance, and to experiment with its components in order to determine how it could be improved.
 However, as discussed in that paper, there were some differences between our replica and the original model.
 These differences were caused by missing information in the original paper, and so rather than introducing our own assumptions, we designed experiments to determine the best way to replicate the model.
@@ -21,15 +21,15 @@ \section{Baseline}
-Then we would do a random split on the remaining dataset, such that the final train/test split would be a $80\%/20\%$ split.
+Then we would perform a random split on the remaining dataset, such that the final train/test split would be $80\%/20\%$.
 With these changes, we have created a more accurate replica of the \gls{moc} model, which we will use as our baseline for the rest of this paper.
 
-As an additional measure, we have presented these changes to one of the original authors of \citet{cleggRecalibrationMarsScience2017}, who confirmed that they were reasonable and in line with the original model's implementation.
+As an additional measure, we have presented these changes to one of the original authors of~\citet{cleggRecalibrationMarsScience2017}, who confirmed that they were reasonable and in line with the original model's implementation.
 
-Table \ref{tab:replica_results_rmses} shows the \gls{rmse}s of the original models and our replicas after these changes.
+Table~\ref{tab:replica_results_rmses} shows the \gls{rmse}s of the original models and our replicas after these changes.
 Figure~\ref{fig:rmse_histograms} illustrates the distribution of these \gls{rmse}s as a grouped histogram.
 The results show that the \gls{rmse}s of our replicas exhibit similar tendencies to the original models.
 However, in some cases, our replicas have a lower \gls{rmse} than the original models, and in others, they have a higher \gls{rmse}.
 These differences are due to a number of factors.
 
-Firstly, the original models were trained with datasets from 1600mm and 3000mm standoff distances\cite{cleggRecalibrationMarsScience2017}, while we only had access to the 1600mm dataset for our replicas.
+Firstly, the original models were trained with datasets from 1600mm and 3000mm standoff distances~\cite{cleggRecalibrationMarsScience2017}, while we only had access to the 1600mm dataset for our replicas.
 Additionally, we automated the outlier removal for the PLS1-SM phase, unlike the original manual process.
-As mentioned, the original authors manually curated their training and test sets, ensuring a broad elemental range, while we implemented an automatic process for our replicas due to lack of domain expertise.
+As mentioned, the original authors manually curated their training and test sets, ensuring a broad elemental range, while we implemented an automatic process for our replicas due to a lack of domain expertise.
 Differences might also stem from varied implementation specifics, such as programming languages and libraries used.
diff --git a/report_thesis/src/sections/introduction.tex b/report_thesis/src/sections/introduction.tex
index b728a907..fbbef299 100644
--- a/report_thesis/src/sections/introduction.tex
+++ b/report_thesis/src/sections/introduction.tex
@@ -1,30 +1,30 @@
 \section{Introduction}\label{sec:introduction}
 The NASA Viking missions in the 1970s were the first to successfully land on Mars, aiming to determine if life existed on the planet.
 One experiment suggested the presence of life, but the results were ambiguous and inconclusive, and NASA was unable to repeat the experiment.
-Nevertheless, these missions were deemed a monumental success and advanced our knowledge of the Martian envi-\\ronment.\cite{marsnasagov_vikings}
+Nevertheless, these missions were deemed a monumental success and advanced our knowledge of the Martian environment~\cite{marsnasagov_vikings}.
 Leveraging the knowledge gained from the Viking missions, NASA launched the \gls{mer} mission in 2003 to investigate whether Mars ever had the conditions to support life as we know it.
 The mission landed two rovers, Spirit and Opportunity, on Mars in January 2004, and they quickly discovered clear evidence that water once flowed on Mars.
-However, since water alone is not enough to support life, the next objective was to search for organic material as well.\cite{marsnasagov_observer, marsnasagov_spirit_opportunity}
+However, since water alone is not enough to support life, the next objective was to search for organic material as well~\cite{marsnasagov_observer, marsnasagov_spirit_opportunity}.
 The Curiosity rover landed on Mars in August 2012 inside Gale Crater as part of the \gls{msl} mission with this very purpose.
-Thanks to its sophisticated equipment, Curiosity was able to find evidence of past habitable environments on Mars based on chemical and mineral findings early in its mission.\cite{marsnasagov_chemcam}
+Thanks to its sophisticated equipment, Curiosity was able to find evidence of past habitable environments on Mars based on chemical and mineral findings early in its mission~\cite{marsnasagov_msl}.
 
-One of the instruments aboard the rover is the \gls{chemcam} instrument, which is a remote-sensing laser instrument used to gather \gls{libs} data from geological samples on Mars.
+One of the instruments aboard the rover is \gls{chemcam}, a remote-sensing laser instrument used to gather \gls{libs} data from geological samples on Mars.
-\gls{libs} is a technique that enables rapid analysis by using a laser to ablate and remove surface contaminants to expose the underlying material and generate a plasma plume from the now-exposed sample material\cite{wiensChemcam2012}.
+\gls{libs} is a technique that enables rapid analysis by using a laser to ablate and remove surface contaminants to expose the underlying material and generate a plasma plume from the now-exposed sample material~\cite{wiensChemcam2012}.
 This plasma plume emits light that is captured through three distinct spectrometers to collect a series of spectral readings.
-These spectra consist of emission lines that can be associated with the concentration of a specific element, and their intensity reflects the concentration of that element in the sample.
+These spectra consist of emission lines that can be associated with specific elements, and the intensity of these lines reflects the concentration of those elements in the sample.
-Consequently, a spectra serves as a complex, multi-dimensional fingerprint of the elemental composition of the examined geological samples.\cite{cleggRecalibrationMarsScience2017}
+Consequently, a spectrum serves as a complex, multi-dimensional fingerprint of the elemental composition of the examined geological samples~\cite{cleggRecalibrationMarsScience2017}.
 
 Analyzing \gls{libs} data is computationally challenging due to high multicollinearity within spectral data, which diminishes the effectiveness of traditional linear analysis.
 The multicollinearity, which stems from correlations among spectral channels and elemental emission characteristics, complicates data interpretation.
-Additionally, \textit{matrix effects} in \gls{libs} spectra arise when various physical interactions cause emission line intensities to change without a corresponding shift in the element's actual concentration
+Additionally, \textit{matrix effects} in \gls{libs} spectra arise when various physical interactions cause emission line intensities to change without a corresponding shift in the element's actual concentration.
-This phenomenon introduces variability that complicates the straightforward interpretation of spectral data, challenging the accuracy of computational models tasked with predicting elemental composition.\cite{andersonImprovedAccuracyQuantitative2017}
+This phenomenon introduces variability that complicates the straightforward interpretation of spectral data, challenging the accuracy of computational models tasked with predicting elemental composition~\cite{andersonImprovedAccuracyQuantitative2017}.
 
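To make the shape of this data concrete, the following NumPy sketch illustrates how the readings from the three spectrometers (2048 channels each) can be concatenated into one high-dimensional fingerprint per sample, and why adjacent channels sampling the same emission line become strongly correlated. It uses synthetic values rather than actual ChemCam spectra, and all array names are illustrative.

```python
import numpy as np

rng = np.random.default_rng(0)

n_samples, n_channels = 50, 2048  # channels per spectrometer; ChemCam carries three spectrometers

# Synthetic stand-ins for the three spectrometers' readings.
spec_a = rng.random((n_samples, n_channels))
spec_b = rng.random((n_samples, n_channels))
spec_c = rng.random((n_samples, n_channels))

# Each sample becomes one 3 * 2048 = 6144-dimensional "fingerprint".
spectra = np.hstack([spec_a, spec_b, spec_c])
print(spectra.shape)  # (50, 6144)

# Mimic a broadened emission line spread over two adjacent channels:
# both channels then rise and fall together across samples.
peak = rng.random(n_samples)
spectra[:, 100] += 10 * peak
spectra[:, 101] += 9 * peak
print(np.corrcoef(spectra[:, 100], spectra[:, 101])[0, 1])  # close to 1, i.e. multicollinear
```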
-For analyzing Martian geological samples, the \gls{chemcam} team currently uses the \gls{moc} model\cite{cleggRecalibrationMarsScience2017}.
+For analyzing Martian geological samples, the \gls{chemcam} team currently uses the \gls{moc} model~\cite{cleggRecalibrationMarsScience2017}.
 This model integrates \gls{pls} and \gls{ica} to predict the composition of major oxides.
-Though the MOC model has proven useful, it suffers from limitations in predictive accuracy and robustness.
+Though the \gls{moc} model has proven useful, it suffers from limitations in predictive accuracy and robustness.
-In \citet{p9_paper}, we created a replica of the MOC model and identified which components were responsible for these limitations.
+In~\citet{p9_paper}, we created a replica of the \gls{moc} model and identified which components were responsible for these limitations.
-Through a series of comparative experiments, we showed that the model selection was the primary cause of these limitations, and we showed how both \gls{ann} and \gls{gbr} methods could be used to improve the model's predictive accuracy and robustness.
+Through a series of comparative experiments, we showed that model selection was the primary cause of these limitations and demonstrated how both \gls{ann} and \gls{gbr} methods could be used to improve the model's predictive accuracy and robustness.
 This is further underscored by work from the SuperCam team.
diff --git a/report_thesis/src/sections/problem_definition.tex b/report_thesis/src/sections/problem_definition.tex
index 9999dc85..50ccefea 100644
--- a/report_thesis/src/sections/problem_definition.tex
+++ b/report_thesis/src/sections/problem_definition.tex
@@ -1,11 +1,11 @@
 \section{Problem}
 \label{sec:problem_definition}
-Predicting major oxide compositions from \gls{libs} data presents significant computational challenges, including the high dimensionality and non-linearity of the data, compounded by multicollinearity and the phenomenon known as \textit{matrix effects}.
+Predicting major oxide compositions from \gls{libs} data presents significant computational challenges, including the high dimensionality and non-linearity of the data, compounded by multicollinearity and the phenomenon known as matrix effects~\cite{andersonImprovedAccuracyQuantitative2017}.
 These effects can cause the intensity of emission lines from an element to vary independently of that element's concentration, introducing unknown variables that complicate the analysis.
 Furthermore, due to the high cost of data collection, datasets are often small, which further complicates the task of building accurate and robust models.
 
-Building upon the baseline established in \citet{p9_paper}, our work aims to address the significant challenges inherent in predicting major oxide compositions from \gls{libs} data by improving the accuracy and robustness of these predictions.
+Building upon the baseline established in~\citet{p9_paper}, our work aims to address the significant challenges inherent in predicting major oxide compositions from \gls{libs} data by improving the accuracy and robustness of these predictions.
 The presence of multicollinearity within the spectral data, for example, makes it difficult to discern distinct patterns due to the strong correlations among variables that can obscure the impact of individual predictors.
 
 Additionally, the high dimensionality of \gls{libs} data necessitates dimensionality reduction to manage the vast number of variables efficiently.
 
@@ -13,10 +13,10 @@ \section{Problem}
 Given the limited size of available datasets, our methodologies must also be robust against overfitting and capable of generalizing well from small sample sizes.
 To this end, we propose the exploration of advanced ensemble methods and deep learning models, selected for their potential to handle high-dimensional, non-linear data effectively.
 
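As a rough illustration of the kind of approach considered here, the scikit-learn sketch below chains a dimensionality reduction step with a gradient boosting regressor. It is a minimal example on hypothetical arrays `X` (spectra) and `y` (oxide concentrations); the use of PCA, the number of components, and all hyperparameters are illustrative assumptions, not the pipeline evaluated in this work.

```python
import numpy as np
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Hypothetical data: 400 spectra with 6144 channels and one oxide target per spectrum.
rng = np.random.default_rng(42)
X = rng.random((400, 6144))
y = rng.random(400) * 100

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Dimensionality reduction (PCA, one common choice) followed by a gradient
# boosting regressor; a full setup would typically fit one such model per oxide.
pipeline = Pipeline([
    ("pca", PCA(n_components=34)),
    ("gbr", GradientBoostingRegressor(n_estimators=300, max_depth=3, random_state=42)),
])
pipeline.fit(X_train, y_train)

rmse = np.sqrt(mean_squared_error(y_test, pipeline.predict(X_test)))
print(f"RMSE: {rmse:.2f}")
```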
-In addressing the limitations of the current \gls{moc} pipeline identified in \citet{p9_paper}, we have prioritized dimensionality reduction and model selection.
+In addressing the limitations of the current \gls{moc} pipeline identified in~\citet{p9_paper}, we have prioritized dimensionality reduction and model selection.
-This decision is supported by the low incidence of outliers in the \gls{chemcam} \gls{libs} calibration dataset.
+This decision is supported by the low incidence of outliers in the \gls{chemcam} \gls{libs} calibration dataset, which makes dedicated outlier handling a less pressing concern.
 Dimensionality reduction is essential for managing the high-dimensional nature of \gls{libs} data, and model selection offers the opportunity to explore a wider range of algorithms, potentially leading to improved performance.
-Our focus on advanced ensemble methods like \gls{gbr} and deep learning models like \gls{ann}s is motivated by their demonstrated ability to outperform the existing \gls{moc} pipeline in handling complex data scenarios, as shown in \citet{p9_paper} and \citet{andersonPostlandingMajorElement2022}.
+Our focus on advanced ensemble methods like \gls{gbr} and deep learning models like \gls{ann}s is motivated by their demonstrated ability to outperform the existing \gls{moc} pipeline in handling complex data scenarios, as shown in~\citet{p9_paper} and~\citet{andersonPostlandingMajorElement2022}.
 
 To evaluate the performance of these models, we will use \gls{rmse} as a proxy for accuracy, defined by the equation:
 
@@ -33,6 +33,42 @@ \section{Problem}
 
 where $e_i = y_i - \hat{y}_i$ and $\bar{e}$ is the mean error.
 
+For data normalization, \citet{cleggRecalibrationMarsScience2017} propose two different normalization methods, Norm 1 and Norm 3.
+Norm 1 normalizes the data across the entire spectrum, while Norm 3 normalizes the data across each individual spectrometer's wavelength range.
+
+Formally, Norm 1 is defined as:
+
+\begin{equation}
+    \tilde{X}_{i,j} = \frac{X_{i,j}}{\sum_{j'=1}^{N_{\text{total}}} X_{i,j'}},
+\end{equation}
+
+where:
+
+\begin{itemize}
+    \item $\tilde{X}_{i,j}$ is the normalized wavelength intensity for the $i$-th sample in the $j$-th channel,
+    \item $X_{i,j}$ is the original wavelength intensity for the $i$-th sample in the $j$-th channel,
+    \item $N_{\text{total}}$ is the total number of channels across all spectrometers (for the \gls{chemcam} instrument, $N_{\text{total}} = 3 \cdot 2048 = 6144$).
+\end{itemize}
+
+Norm 3 is defined as:
+
+\begin{equation}
+    \tilde{X}_{i,j}^{(s)} = \frac{X_{i,j}}{\sum_{j'=N(s-1)+1}^{N \cdot s} X_{i,j'}},
+\end{equation}
+
+where:
+
+\begin{itemize}
+    \item $\tilde{X}_{i,j}^{(s)}$ is the normalized wavelength intensity for the $i$-th sample in the $j$-th channel on the $s$-th spectrometer,
+    \item $X_{i,j}$ is the original wavelength intensity for the $i$-th sample in the $j$-th channel,
+    \item $N$ is the number of channels in each spectrometer (for the \gls{chemcam} instrument, $N = 2048$),
+    \item $s$ is the index of the spectrometer (for the \gls{chemcam} instrument, $s \in \{1, 2, 3\}$),
+    \item $j'$ indexes the channels belonging to the $s$-th spectrometer, ensuring that the $N$ channels of each spectrometer are normalized separately.
+\end{itemize}
+
+Following the approach taken by the SuperCam team, we opt to always normalize across individual spectrometers' wavelength ranges (Norm 3) rather than across the entire spectrum (Norm 1).
+This choice mirrors the SuperCam calibration pipeline, which normalizes each spectrometer's wavelength range separately instead of over the full spectrum~\cite{andersonPostlandingMajorElement2022}.
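To illustrate the two normalization schemes defined above, the NumPy sketch below applies Norm 1 and Norm 3 to a hypothetical matrix of spectra laid out as three spectrometers with 2048 channels each. It assumes each spectrometer's channels are stored contiguously in the row, which may differ from the actual data layout; variable names are illustrative.

```python
import numpy as np

N = 2048             # channels per spectrometer
N_SPECTROMETERS = 3  # ChemCam uses three spectrometers

# Hypothetical spectra: one row per sample, 3 * 2048 = 6144 channels per row,
# with each spectrometer's channels assumed to be contiguous.
rng = np.random.default_rng(0)
X = rng.random((10, N_SPECTROMETERS * N))

# Norm 1: each spectrum is divided by the sum over the entire 6144-channel spectrum.
norm1 = X / X.sum(axis=1, keepdims=True)

# Norm 3: each spectrometer's 2048-channel block is divided by its own sum,
# so the three blocks are normalized independently.
blocks = X.reshape(len(X), N_SPECTROMETERS, N)
norm3 = (blocks / blocks.sum(axis=2, keepdims=True)).reshape(len(X), -1)

# Sanity checks: Norm 1 rows sum to 1; under Norm 3 each spectrometer block sums to 1.
assert np.allclose(norm1.sum(axis=1), 1.0)
assert np.allclose(norm3.reshape(len(X), N_SPECTROMETERS, N).sum(axis=2), 1.0)
```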
+ The goal of improving both robustness and accuracy is to ensure that our models can generalize well to new data and provide reliable predictions in the presence of noise and uncertainty. Essentially, the model should be as accurate as possible, as often as possible. It is undesirable for a model to be accurate only in specific scenarios, as this would limit its practical utility. @@ -44,7 +80,4 @@ \section{Problem} \item Ensure methods are feasible for small datasets. \end{itemize} -Following the approach taken by the SuperCam team, we opt to always normalize across individual spectrometers' wavelength ranges (Norm 3), rather than normalizing across the entire spectrum (Norm 1). -This decision is guided by the approach taken by the SuperCam team, where they do not normalize across the entire spectrum, but rather across individual spectrometers' wavelength ranges\cite{andersonPostlandingMajorElement2022}. - Through these focused objectives and methodologies, our work seeks to improve the prediction of major oxide compositions in Martian geological samples. \ No newline at end of file
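As a small worked example of the evaluation quantities used in this section, the sketch below computes the RMSE (the accuracy proxy) and the standard deviation of the prediction errors $e_i = y_i - \hat{y}_i$ (assuming that is the robustness quantity the elided equation defines) with NumPy; the prediction arrays are hypothetical.

```python
import numpy as np

# Hypothetical reference compositions and model predictions for one oxide (wt%).
y_true = np.array([48.2, 51.0, 45.7, 60.3, 55.1])
y_pred = np.array([47.5, 52.4, 44.9, 58.8, 56.0])

errors = y_true - y_pred              # e_i = y_i - y_hat_i
rmse = np.sqrt(np.mean(errors ** 2))  # accuracy proxy
error_std = np.std(errors, ddof=1)    # spread of the errors around the mean error

print(f"RMSE: {rmse:.3f}")
print(f"Std. dev. of error: {error_std:.3f}")
```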