-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathScript_calculate_MTLD.jl
124 lines (108 loc) · 3.26 KB
/
Script_calculate_MTLD.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# Julia script to calculate MTLD (Measure of Textual Lexical Diversity) for the text in one file
# Earl Kjar Brown
using Statistics
using BenchmarkTools
using Dates
# Normalize raw text before tokenization.
# The text is lowercased, every run of the digits 0-9 is deleted, and then each
# en dash, em dash, and hyphen is deleted (not replaced by a space, so the
# parts of a hyphenated word are joined together).
function preprocess(txt)
    cleaned = replace(lowercase(txt), r"[0-9]+" => "")
    for dash in ("–", "—", "-")
        cleaned = replace(cleaned, dash => "")
    end
    return cleaned
end
# Split text into word tokens.
# The text is first passed through `preprocess`; every character listed in
# `punct` is then turned into a space, and the result is split on runs of
# whitespace. Empty pieces are discarded, so the returned vector contains only
# non-empty substrings.
function tokenize(txt)
    cleaned = preprocess(txt)
    punct = "!\"#\$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
    # A single pass: any character that occurs in `punct` becomes one space.
    spaced = replace(cleaned, collect(punct) => " ")
    # keepempty=false drops the empty strings that leading or trailing
    # whitespace would otherwise produce.
    return split(spaced, r"\s+"; keepempty=false)
end
# Workhorse MTLD routine: score a word sequence in one direction.
#
# Arguments:
#   wds       - indexable collection of word tokens; it is never modified
#   threshold - type-token ratio (TTR) at or below which a factor is completed
#   reversed  - Bool; when true the words are visited from last to first
#
# Returns the number of words divided by the (possibly fractional) number of
# factors, as a Float64.
# Throws an ArgumentError when `wds` is empty, because the score is undefined
# for a text with no words.
function sub_mtld(wds, threshold, reversed)
    n_wds = length(wds)
    n_wds > 0 || throw(ArgumentError("cannot compute MTLD of an empty word list"))

    # Walk the words lazily in reverse instead of calling reverse!, which would
    # reorder the caller's vector as a side effect.
    seq = reversed ? Iterators.reverse(wds) : wds

    terms = Set{eltype(wds)}()
    word_counter = 0
    factor_count = 0.0  # Float64 from the start: partial factors are fractional
    ttr = 1.0
    for w in seq
        word_counter += 1
        push!(terms, w)
        ttr = length(terms) / word_counter
        if ttr <= threshold
            # TTR has fallen to the threshold: close this factor and start a
            # fresh segment.
            word_counter = 0
            empty!(terms)
            factor_count += 1
        end
    end

    # Partial factor for the trailing segment that never reached the threshold:
    # how far its TTR fell from 1, relative to how far the threshold is from 1.
    if word_counter > 0
        factor_count += (1 - ttr) / (1 - threshold)
    end

    # Nothing accumulated at all (for non-empty input this happens when the
    # trailing segment's TTR is exactly 1): fall back to the TTR of the whole
    # text so that the final division is well defined.
    if factor_count == 0
        ttr = length(Set(wds)) / n_wds
        if ttr == 1
            factor_count += 1
        else
            factor_count += (1 - ttr) / (1 - threshold)
        end
    end

    return n_wds / factor_count
end
# Main MTLD function: the average of the forward and the backward score.
#
# Arguments:
#   wds       - vector of word tokens; it is left in its original order
#   threshold - TTR cut-off handed to `sub_mtld` (defaults to 0.72)
#
# Returns the mean of the two directional `sub_mtld` scores.
function mtld(wds, threshold=0.72)
    cutoff = Float64(threshold)
    forward_measure = sub_mtld(wds, cutoff, false)
    # Score a reversed copy in forward mode. This visits the words in the same
    # order as a reversed pass over `wds`, and guarantees the caller's vector
    # is never reordered, however `sub_mtld` implements its own reversal.
    reverse_measure = sub_mtld(reverse(wds), cutoff, false)
    return (forward_measure + reverse_measure) / 2
end
# Read the whole file at `pathway` and return its contents as a single String.
function get_txt(pathway::String)
    return read(pathway, String)
end
# Compute the MTLD score of the text stored in the file at `file_pathway`:
# read the file, tokenize its contents, and score the resulting words.
function get_mtld(file_pathway::String)
    return file_pathway |> get_txt |> tokenize |> mtld
end
# Helper that times repeated MTLD computations on one file.
#
# Arguments:
#   file_pathway - path of the text file to score
#   n_iter       - number of timed runs (defaults to 10)
#
# Each run prints the score and its elapsed wall-clock time in seconds.
# Returns a Vector{Float64} holding the elapsed seconds of every run.
function manual_benchmark(file_pathway::String, n_iter = 10)
    times = Float64[]
    for _ in 1:n_iter
        # time_ns() gives nanosecond-resolution readings; a millisecond clock
        # would report 0.0 for runs on small files.
        start_ns = time_ns()
        score = get_mtld(file_pathway)
        elapsed_time = (time_ns() - start_ns) / 1e9
        push!(times, elapsed_time)
        println("Julia: MTLD = $score, which took $elapsed_time seconds")
    end
    return times
end
### main ###
# The input file and the number of timed runs may be given on the command line:
#     julia Script_calculate_MTLD.jl /pathway/to/filename.txt [n_iter]
# Without arguments the defaults below are used.
# file_pathway = "/pathway/to/filename.txt"
# file_pathway = "/Users/ekb5/Downloads/delete.txt"
# file_pathway = "/Users/ekb5/Downloads/big_badboy.txt"
file_pathway = if isempty(ARGS)
    "/Users/ekb5/Downloads/input/0a0HuaT4Vm7FoYvccyRRQj.txt"
else
    ARGS[1]
end
n_iter = length(ARGS) >= 2 ? parse(Int, ARGS[2]) : 1
times = manual_benchmark(file_pathway, n_iter)
times_mean = mean(times)
times_median = median(times)
# println("\nOver $n_iter iterations, Julia: mean = $times_mean; median = $times_median\n")