Skip to content

Commit

Permalink
Add variable line to output even if questions are missing
Browse files Browse the repository at this point in the history
  • Loading branch information
Emerson Farrugia committed Jul 14, 2021
1 parent 3083a88 commit 7678e46
Show file tree
Hide file tree
Showing 10 changed files with 1,455 additions and 30 deletions.
7 changes: 5 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ To look up a variable, it should be included in the `variables.csv` CSV file. An
```
pl,plj0014_v1
biol,lr3039
pl,plj0001
```

# Output
Expand All @@ -16,10 +17,12 @@ The output corresponding to the above input is:
variable_dataset_id, variable_name, variable_reference_url, question_period, question_instrument, question_text, question_reference_url
pl,plj0014_v1,https://paneldata.org/soep-core/data/pl/plj0014_v1,2019,Individual (CAPI) 2019,What is your country of citizenship?,https://paneldata.org/soep-core/inst/soep-core-2019-pe-lgb/178
biol,lr3039,https://paneldata.org/soep-core/data/biol/lr3039,2019,"Individual and Biography (M3-M5, Initial interview) 2019","How much was your last monthly net income for this occupation, i.e. the amount paid to you in the aforementioned currency?",https://paneldata.org/soep-core/inst/soep-core-2019-pb-m345-erst/Q194
pl,plj0001,https://paneldata.org/soep-core/data/pl/plj0001,n/a,n/a,n/a,n/a
```

Note that if a variable references multiple questions, only the first question found on paneldata.org is included in the output CSV.
Note that
- if a variable references multiple questions, only the first question found on paneldata.org is included in the output CSV
- if a variable references no questions, it's still included in the output CSV, but with "n/a" for the question fields

The code already has support for multiple questions, so this should be straightforward to change.

Expand Down
2 changes: 1 addition & 1 deletion build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ plugins {
}

group = "internal"
version = "0.1.1"
version = "0.2.0"

val springBootVersion = "2.5.2" // duplicated above

Expand Down
38 changes: 18 additions & 20 deletions src/main/kotlin/internal/svs/BatchConfiguration.kt
Original file line number Diff line number Diff line change
Expand Up @@ -27,26 +27,25 @@ class BatchConfiguration(
@Autowired private var stepBuilderFactory: StepBuilderFactory
) {
@Bean
fun questionDownloadingJob(questionDownloadingStep: Step): Job {
return jobBuilderFactory["questionDownloadingJob"]
fun questionDownloadingJob(questionDownloadingStep: Step): Job =
jobBuilderFactory["questionDownloadingJob"]
.incrementer(RunIdIncrementer())
.flow(questionDownloadingStep)
.end()
.build()
}

@Bean
fun questionDownloadingStep(
variableReader: ItemReader<Variable>,
variablePageContentLoader: VariablePageContentLoader,
variablePageQuestionExtractor: VariablePageQuestionExtractor,
questionWriter: ItemWriter<Question>
): Step {
return stepBuilderFactory["questionDownloadingStep"]
.chunk<Variable, Question>(CHUNK_SIZE)
questionWriter: ItemWriter<VariableQuestion>
): Step =
stepBuilderFactory["questionDownloadingStep"]
.chunk<Variable, VariableQuestion>(CHUNK_SIZE)
.reader(variableReader)
.processor(
CompositeItemProcessorBuilder<Variable, Question>()
CompositeItemProcessorBuilder<Variable, VariableQuestion>()
.delegates(
variablePageContentLoader,
variablePageQuestionExtractor
Expand All @@ -55,7 +54,6 @@ class BatchConfiguration(
)
.writer(questionWriter)
.build()
}

@Bean
fun variableReader(): ItemReader<Variable> =
Expand All @@ -74,20 +72,20 @@ class BatchConfiguration(
.build()

@Bean
fun questionWriter(): ItemWriter<Question> =
FlatFileItemWriterBuilder<Question>()
fun questionWriter(): ItemWriter<VariableQuestion> =
FlatFileItemWriterBuilder<VariableQuestion>()
.name("questionWriter")
.resource(FileSystemResource("output.csv"))
.delimited()
.fieldExtractor { question ->
.fieldExtractor {
listOf(
question.variable.datasetId,
question.variable.name,
question.variable.referenceUrl,
question.period,
question.instrument,
question.text,
question.referenceUrl
it.variable.datasetId,
it.variable.name,
it.variable.referenceUrl,
it.question?.period ?: "n/a",
it.question?.instrument ?: "n/a",
it.question?.text ?: "n/a",
it.question?.referenceUrl ?: "n/a",
)
.map { value -> value.toString().escapeCsv() }
.toTypedArray()
Expand All @@ -107,5 +105,5 @@ class BatchConfiguration(
}
.build()

private fun String.escapeCsv():String = StringEscapeUtils.escapeCsv(this)
private fun String.escapeCsv(): String = StringEscapeUtils.escapeCsv(this)
}
1 change: 0 additions & 1 deletion src/main/kotlin/internal/svs/Question.kt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ package internal.svs
import java.net.URL

data class Question(
val variable: Variable,
val period: String,
val instrument: String,
val text: String,
Expand Down
1 change: 0 additions & 1 deletion src/main/kotlin/internal/svs/VariablePageContent.kt
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ data class VariablePageContent(

private fun Element.asQuestion() =
Question(
variable = variable,
period = select("td:eq(0)").text(),
instrument = select("td:eq(1)").text(),
text = select("td:eq(2)").text(),
Expand Down
9 changes: 6 additions & 3 deletions src/main/kotlin/internal/svs/VariablePageQuestionExtractor.kt
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,11 @@ import org.springframework.batch.item.ItemProcessor
import org.springframework.stereotype.Component

@Component
class VariablePageQuestionExtractor : ItemProcessor<VariablePageContent, Question> {
class VariablePageQuestionExtractor : ItemProcessor<VariablePageContent, VariableQuestion> {

override fun process(variablePageContent: VariablePageContent): Question? =
variablePageContent.questionList.firstOrNull()
override fun process(variablePageContent: VariablePageContent): VariableQuestion? =
VariableQuestion(
variable = variablePageContent.variable,
question = variablePageContent.questionList.firstOrNull()
)
}
9 changes: 9 additions & 0 deletions src/main/kotlin/internal/svs/VariableQuestion.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
package internal.svs

/**
* A variable-question tuple.
*
* @property variable the variable this tuple belongs to
* @property question the question paired with the variable, or `null` when no question is available for it
*/
data class VariableQuestion(
val variable: Variable,
val question: Question?
)
11 changes: 9 additions & 2 deletions src/test/kotlin/internal/svs/VariablePageContentTests.kt
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ class VariablePageContentTests {
}

@Test
fun `should extract large question list correctly`() {
val variablePageContent = asVariablePageContent("plj0014_v1.html")
fun `should construct page content correctly from page with questions`() {
val variablePageContent = asVariablePageContent("page-with-questions-plj0014_v1.html")

assertThat(variablePageContent.questionList).hasSize(44)

Expand All @@ -30,4 +30,11 @@ class VariablePageContentTests {
assertThat(referenceUrl.toString()).isEqualTo("$PANEL_DATA_SITE_DOMAIN/soep-core/inst/soep-core-2019-pe-lgb/178")
}
}

@Test
fun `should construct page content correctly despite missing question section`() {
    // The fixture page has no question section; the resulting question list must have no entries.
    val pageContent = asVariablePageContent("page-missing-question-section-plj0001.html")
    val extractedQuestions = pageContent.questionList

    assertThat(extractedQuestions).hasSize(0)
}
}
Loading

0 comments on commit 7678e46

Please sign in to comment.