-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtranslator.js
3078 lines (3033 loc) · 200 KB
/
translator.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// ////////////////////////// TRANSLATOR ///////////////////////////////////
// As the text versions of the pdf files are generated, they need to be "repaired" because
// of frequent poor conversions (see repairPdf2TextConversion for details). The repairs are
// done by searching for the word(s) with the largest possible number of characters within
// the text, starting first with "priority 1" words of that length, then "priority 2", and
// so on. Identified words are flagged as being characters associated with a word to remove
// them from the pool of available characters for future word id's. Then words of a size
// that is 1 character smaller are searched in the same way. This process is repeated with
// words of decreasing size (based in the "nChars" field in the translator) until no words
// can be identified. The Oxford dictionary API is called as the last step in each of these
// phases to make sure that a dictionary word of the desired length can help id a group of
// characters before moving on to next smallest word.
// The function of the translator is to:
// * provide words that might not be found in a formal dictionary
// * identify phrases that hold special technical meaning (like an acronym)
// * provide supplemental linkage between these kinds of words and other words, to group
// similar concepts or topics together so that search results are comprehensive.
//
// The bulk of the translator (hereafter, the xLtr) is to consider all the ways that a given
// word, jargon or phrase might be presented in text, and to identify it as the same word. For
// example, the following are all the same thing: "de Vauc profile", "r^1/4", "r^0.25". The
// xLtr will identify each of these and ensure that their appearances in the text are all
// documented under a common index word or words. In this particular case, for example, the
// locations of the presence of these words would be documented under "deVauc" and under "profile",
// such that a search of just "profile" would result in these locations, or just "deVauc".
// The ability to conduct extended searches is also built into the xLtr and the index: in an
// extended search, for example, one might also want to see the appearances of "elliptical galaxy"
// or "bulge" when searching on "deVauc". Such results would be included for a search on "deVauc"
// if the "extended search" option is set, as the phrase "elliptical galaxy" and "bulge" are
// in the "x" field of the xLtr entry for "deVauc" and for "profile".
//
// The xLtr and index work together as a team. All text, whether from a pdf file or entered by
// a user as search, is first fed into the xLtr. The xLtr determines whether any of the words
// are technical words that are prescribed in the xLtr, and if so, tells the program calling xLtr
// under what index the words should be documented or searched for. The xLtr is generated
// in real time. The index is written as an external file and modified as needed.
//
// xLtr is an array object with the following construction:
// .nChars: length of each word entry, counting only alphanumeric characters (e.g.,
// ignoring any characters specific to regexp commands, so \d\d would count as 2 characters, not 4.)
// .reg: regular expression used for searching for the term. In some cases, .reg
// will just be the word itself. In other cases, the .reg can include captured groups that would then
// be used to construct index entries, along with special code words provided in the .indx (see below)
// that gave instructions as to what to do with the captured groups. For example, for spectral lines,
// the .reg might be something like "\\[CII\\](5456.556)angstroms", which instructs to capture the
// wavelength so that that number can be used for a related purpose.
// .indx: list of words (delimited by underscores) that are to go into the index and
// be linked to the position of the word of interest. At a minimum, this list would be the astronomy
// term itself. All .indx fields must be a function that takes the text, the reg and the starting position
// and returns the word that should be used for the index. If captured groups were used in
// the .reg, then .indx can also include code words that instruct what to do with the captured groups.
// For example, if the wavelength of a spectral line were captured, then one of the .indx words might be
// "ang2eV", which tells the code to take the captured group (the wavelength value), convert to units
// of energy (eV) and then store that number in the index.
// If words in the .indx have an asterisk in front of them, those words are not searched for in the
// dictionary for additional root words and inflections. Instead, asterisked words go straight into
// the index as-is.
// .x: the "extended search" word list to make future queries provide more comprehensive results.
// For example, "surface_brightness" might be included with "devauc profile". Or
// "carbon_forbidden_spectral_line_singly_ionized" along with [CII]5454. When an extended search is
// requested, the code will look up the words listed under '.x' and include the positions of those
// words in the results as well. If phrases are listed in .x (words delimited by underscore), then
// the results of that phrase are the locations for which words delimited by underscore
// are all present. Multiple phrases can also be provided in .x, delimited by vertical bars. So
// for example, "surface_brightness" would result in locations for which both "surface" and "brightness"
// were present in the text. If .x was "surface_brightness|inside_out_profile", then the locations
// common to "surface" and "brightness" would be one piece of the search results, and another piece
// would be the locations common to "inside", "out" and "profile". (i.e., the results would NOT be
// where "surface", "brightness", "inside", "out" and "profile" all coexisted, unless the .x were provided
// as "surface_brightness_inside_out_profile" or "surface|brightness|inside|out|profile".)
// .xSupp: tells the calling code that other index words should use the current index word (possibly along
// with other words in a phrase) as their .x fields. When the calling code sees a ".xSupp", the
// information in this field is placed in the .x fields of the prescribed words. If a word does not
// yet exist in the index, a placeholder in the index is made for that word so that its .x field
// can be populated. Note that there is no .xSupp field in the index itself, only a .x field.
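// .priority: provides the code using the xLtr with a way to prioritize matching to text to avoid a
// situation in which a lower priority (meaning higher number) template was matched, taking away
// characters that may have been involved in a jargon word having a smaller number of characters.
// .type: the kind of entry (for example "acro" for acronym definitions or "citation" for
// bibliographic references).
// .endMatch: set by the .indx function each time it runs: the position in the text (stored as a
// string) just past the end of the match, or "-1" if nothing was matched.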
// Note that when any of these words make their way into the final index, the primary index entry might be
// a different form than what the xLtr specified. For example, "spectral" might turn into a base word of
// "spectrum", "singly" might become "single" and "ionized" becomes "ion" so that the index entry for "ion"
// points to all places in which "ionized", "ion", "ionization", "ionizes", etc. are mentioned.
// ///////////////////////////////////////////////////////////////////////////////////
// do not match the word if it begins a sentence (if the word is at beginning or if preceded by a period then whitespace).
// ---------------------------------------------------------------------------------------------
// Reusable regular-expression source fragments. Every value below is a plain string (the same
// text a regex literal's .source would give), so fragments can be concatenated into larger
// patterns. Each numeric fragment is one capturing group.
// ---------------------------------------------------------------------------------------------
var N = '(' + '\\d'.repeat(1) + ')';                    // (1)
var NdotN = '(' + '\\d'.repeat(1) + '\\.\\d+)';         // (1.6789)
var NN = '(' + '\\d'.repeat(2) + ')';                   // (12)
var NNdotN = '(' + '\\d'.repeat(2) + '\\.\\d+)';        // (12.6789)
var NNN = '(' + '\\d'.repeat(3) + ')';                  // (123)
var NNNdotN = '(' + '\\d'.repeat(3) + '\\.\\d+)';       // (123.6789)
var NNNN = '(' + '\\d'.repeat(4) + ')';                 // (1234)
var NNNNdotN = '(' + '\\d'.repeat(4) + '\\.\\d+)';      // (1234.6789)
var NNNNN = '(' + '\\d'.repeat(5) + ')';                // (12345)
var NNNNNdotN = '(' + '\\d'.repeat(5) + '\\.\\d+)';     // (12345.6789)

// Charge suffixes as seen in something like H2O2-, Na+, etc. chargeDesc holds the description
// for the pattern at the same position in charge.
var charge = ['', '\\-', '\\+', '\\-\\-', '\\+\\+', '\\d\\+', '\\d\\-'];
var chargeDesc = ['', 'anion', 'singly_ionized', 'anion', 'doubly_ionized', 'ionized', 'anion'];

// Transition information, such as seen in molecules (H2O[2-1]). The list is ordered by separator
// ("-" first, then ","), then by bracket style ("[ ]" first, then "( )"), then by the digit
// widths of the two captured levels: 1/1, 2/1, 1/2, 2/2.
var levels = [];
['\\-', '\\,'].forEach(function (separator) {
    [['\\[', '\\]'], ['\\(', '\\)']].forEach(function (bracket) {
        [['\\d', '\\d'], ['\\d\\d', '\\d'], ['\\d', '\\d\\d'], ['\\d\\d', '\\d\\d']].forEach(function (widths) {
            levels.push(bracket[0] + '(' + widths[0] + ')' + separator + '(' + widths[1] + ')' + bracket[1]);
        });
    });
});

// Optional one- to three-digit number.
var atomicNum = ['', '\\d', '\\d\\d', '\\d\\d\\d'];

// Roman-numeral ionization levels I through XXX (position 0 is the empty string).
var ionLevel = [
    '',
    'I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X',
    'XI', 'XII', 'XIII', 'XIV', 'XV', 'XVI', 'XVII', 'XVIII', 'XIX', 'XX',
    'XXI', 'XXII', 'XXIII', 'XXIV', 'XXV', 'XXVI', 'XXVII', 'XXVIII', 'XXIX', 'XXX'
];
var ionLevels = ['', '[IVX]{1,6}'];

// One description per ionLevel position: levels I-III have specific names, every level from IV
// upward is simply "ionized".
var ionDesc = ['', 'neutral', 'singly_ionized', 'doubly_ionized']
    .concat(new Array(ionLevel.length - 4).fill('ionized'));

const light = 2.99792458 * Math.pow(10, 8);               // speed of light (m/s)
// NOTE(review): the divisor 1.602 is a rounded elementary charge, so this value is approximate.
const planckEv = 6.626068 / 1.602 * Math.pow(10, -15);    // planck's constant (eV units)

// Shared scratch variables.
var i = 0;
var j = 0;
var tmp = '';
var xLtr = []; // the translator itself: as the functions below are called, this array will be populated
// Each call below is expected to add its own family of entries to xLtr. The calls run in the
// order written here.
// Load up acronym-finding syntax:
findAcronyms();
// Load up bibliography/reference-finding syntax:
findBibRefs();
// Load up coordinate-finding syntax:
coordinates();
// Load up astronomy jargon-finding:
// NOTE(review): for readInFile, the second argument looks like the entry priority and the third
// like the entry type -- confirm against readInFile.
readInFile('astroTerms', '1', 'jargon');
// COME BACK HERE
// The survey/mission names still need work in order to be compliant with the new format for these files!!!
// Load up survey and mission name-finding:
// readInFile('surveysMissions', '1', 'jargon');
// Load up astronomy journal names:
journalNames();
// Load up misspelled word-finding:
readInFile('misspelled', '3', 'substitute');
// Load up ability to detect British spelling and redirect to US spelling:
readInFile('britishAmerican', '2', 'substitute');
// Load up ability to detect contracted words and separate them out into their constituent words:
readInFile('contractions', '2', 'substitute');
// Load up ability to identify molecular and atomic/ionic spectral lines:
chemistry();
// Load up ability to detect common names for spectral lines and link them to their numerical names:
commonLines();
// Load up references to photometry:
photometry();
// Check the translator for inconsistencies and redundancies:
//xLtrCheck();
// The translator is done; consider this a one-time-only call for the calling code.
// Note that if the translator is stringified and/or saved in some fashion, all the functions
// embedded in "indx" will be lost.
// COME BACK TO
// make sure that the code can recognize declination written as DDd
// ============================ findAcronyms =============================
// %%%%%%%%%%%%%%%%%%%%%%%% passed 4/11/2019
function findAcronyms() {
    // Adds the acronym-definition finder to the translator (xLtr) as an entry of type "acro".
    //
    // The entry searches text for defined acronyms by looking for letters that begin (or are
    // capitalized within) a run of words and that are then repeated as one word following those
    // words (or preceding them), for example:
    //    "we observed White Dwarf Stars (WDS), ..."   or   "we observed WDS (White Dwarf Stars), ...."
    // If such a pair is identified, the acronym is placed in a special category within the index that
    // is not used as an ordinary index word, but rather as a "re-router" to the words that the acronym
    // stands for. In the index, the acronym is entered with type=acro and a field that no other entry
    // has, "acroDef", which holds the words that the acronym takes the place of. The words are
    // delimited by "_" with the abbreviated paper ID in front, i.e. "454|white_dwarf_star". Every time
    // the acronym is found in that paper, all the words in acroDef are updated with the location
    // information. If the acronym is found in another paper that does NOT define it (say paper 334
    // mentions WDS without a definition) and WDS has **no** alternative meaning in the xLtr, that paper
    // inherits the definition from paper 454 and all occurrences of "WDS" are indexed to
    // "white_dwarf_star". If there are multiple definitions of WDS within the acronym index entry and
    // the paper fails to provide its own, the characters remain unidentified. This function only finds
    // acronyms together with their definitions; the main program makes sense of the rest.
    //
    // indx(text, startPos):
    //    text     - the full text being scanned.
    //    startPos - position in text where the search starts. Only text from startPos up to the end of
    //               that sentence is examined, and a match must involve the first character from there
    //               that has a counterpart in a later word.
    //    returns  - '' if no acronym/definition pair is found; otherwise the acronym, one space, and the
    //               definition words joined by underscores, e.g. "WDS White_Dwarf_Stars".
    //    It also sets on the entry (as strings; "-1" when nothing was found):
    //       startDef / endDef - start and exclusive end, in text, of the definition words (endDef
    //                           includes any whitespace directly following the last definition word);
    //       endMatch          - just past the last acronym character when the acronym follows the
    //                           definition, otherwise the same as endDef.
    //
    // Relies on filterTheText and xLtr, both defined outside this function.
    xLtr.push({"type":"acro", "priority":"1",
        "indx":function(text, startPos) {
            this.endMatch = "-1";
            this.startDef = "-1";
            this.endDef = "-1";
            const smallWords = ['aka','al','am','an','and','are','as','at','be','by','do','eg','et','etal','etc',
                                'go','he','ie','if','in','io','is','it','me','my','no','ok','on','or','ox','pi',
                                'qi','so','to','we','xi'];
            const isAlnum = (ch) => /[A-Za-z0-9]/.test(ch);
            const isUpperOrDigit = (ch) => /[A-Z0-9]/.test(ch);

            // Keep only the text from startPos up to the first sentence terminator (period, question
            // mark or exclamation mark that ends the text or is followed by a space and a capital).
            let sentence = text.slice(startPos);
            const sentenceEnd = sentence.match(/(?:[\.\?\!])(?:(?: [A-Z])|(?: $)|(?:$))/);
            if (sentenceEnd) {sentence = sentence.slice(0, sentenceEnd.index); }

            // Filter the text, eliminating everything except alphanumerics and whitespace.
            // filterTheText is expected to return a JSON string of [filteredText, positions]; the code
            // below treats positions[k] + startPos as the position in text of filtered character k
            // (TODO confirm against filterTheText). Declared locally so no global is created.
            const filtered = JSON.parse(filterTheText('Aa0 ', sentence));
            const txt = filtered[0].split('');
            const txtPos = filtered[1];
            const lettersAt = (positions) => positions.map((p) => txt[p]).join('');

            // For every character record a running word number (wordIds) and the position at which
            // its word starts (wordStarts). Spaces belong to the word on their left.
            const wordIds = [0];
            const wordStarts = [0];
            for (let p = 1; p < txt.length; p++) {
                const beginsWord = txt[p - 1] === ' ' && txt[p] !== ' ';
                wordIds.push(beginsWord ? wordIds[p - 1] + 1 : wordIds[p - 1]);
                wordStarts.push(beginsWord ? p : wordStarts[p - 1]);
            }
            // Number of alphanumeric characters in the word with the given word number.
            const wordLength = (id) => txt.filter((ch, p) => wordIds[p] === id && isAlnum(ch)).length;

            // For each alphanumeric character, list the positions of the same character (ignoring
            // case) in any LATER word. from[n] is the character, linkedTo[n] its counterparts.
            const linkedTo = [];
            const from = [];
            for (let p = 0; p < txt.length - 1; p++) {
                if (!isAlnum(txt[p])) {continue; }
                const wanted = txt[p].toLowerCase();
                const hits = [];
                for (let q = p + 1; q < txt.length; q++) {
                    if (isAlnum(txt[q]) && wordStarts[q] > wordStarts[p] && txt[q].toLowerCase() === wanted) {
                        hits.push(q); }
                }
                if (hits.length > 0) {
                    linkedTo.push(hits);
                    from.push(p); }
            }
            if (linkedTo.length === 0) {return ''; }

            // Map out all the possible "paths" by which an acronym's letters could be matched up with
            // legitimate characters of the definition words. acroFirst = false looks for the
            // definition followed by the acronym, acroFirst = true for the acronym followed by the
            // definition. A character pair may extend an existing path only if:
            //   * the acronym characters all sit in the same word AND occur sequentially;
            //   * the definition lies on the expected side of the acronym and its characters are taken
            //     strictly from left to right;
            //   * the definition character is the first letter of a word, OR an uppercase letter or
            //     digit inside a word that is already represented, OR directly follows the previous
            //     definition character.
            const buildPaths = (acroFirst) => {
                const acroPaths = [];
                const defPaths = [];
                linkedTo[0].forEach((p) => {
                    if (wordStarts[p] === p) { // the later counterpart must begin a word
                        acroPaths.push([acroFirst ? from[0] : p]);
                        defPaths.push([acroFirst ? p : from[0]]); }
                });
                for (let n = 1; n < linkedTo.length; n++) { // step through the text from left to right
                    const grownAcro = [];
                    const grownDef = [];
                    for (let m = 0; m < linkedTo[n].length; m++) {
                        const acroChar = acroFirst ? from[n] : linkedTo[n][m];
                        const defChar = acroFirst ? linkedTo[n][m] : from[n];
                        for (let k = 0; k < acroPaths.length; k++) {
                            const aLast = acroPaths[k][acroPaths[k].length - 1];
                            const wLast = defPaths[k][defPaths[k].length - 1];
                            const defOnExpectedSide = acroFirst ?
                                wordStarts[defChar] > wordStarts[aLast] :
                                wordStarts[defChar] < wordStarts[aLast];
                            const extendsPath = wordStarts[acroChar] === wordStarts[aLast] && // acronym within single word
                                acroChar === aLast + 1 &&                                      // sequential acronym letters
                                defOnExpectedSide &&                                           // different location from acronym
                                defChar > wLast;                                               // def char right of previous def char
                            const defCharAllowed = wordStarts[defChar] === defChar ||          // first letter of a word ... or ...
                                (isUpperOrDigit(txt[defChar]) &&                               // uppercase char or number that ...
                                 wordStarts[defChar] === wordStarts[wLast]) ||                 // is in an already-represented word ... or ...
                                defChar === wLast + 1;                                         // directly continues the previous def char
                            if (extendsPath && defCharAllowed) {
                                grownAcro.push(acroPaths[k].concat([acroChar]));
                                grownDef.push(defPaths[k].concat([defChar])); }
                        }
                    }
                    // Paths grown in this step only become extendable in the next step.
                    grownAcro.forEach((path) => acroPaths.push(path));
                    grownDef.forEach((path) => defPaths.push(path));
                }
                return {acroPaths: acroPaths, defPaths: defPaths};
            };
            const defThenAcro = buildPaths(false);
            const acroThenDef = buildPaths(true);
            const allAcro = defThenAcro.acroPaths.concat(acroThenDef.acroPaths);
            const allDef = defThenAcro.defPaths.concat(acroThenDef.defPaths);
            // Single-character paths can be weeded out immediately.
            const candidates = [];
            allAcro.forEach((acro, n) => {
                if (acro.length > 1) {candidates.push({acro: acro, def: allDef[n]}); }
            });

            // Does the examined text start a sentence? (Needed by the caseMatch test below.)
            const startSentence = startPos === 0 ||
                text.slice(0, startPos).trim() === '' ||
                (startPos >= 2 && text.slice(startPos - 2, startPos).trim() === '.');

            // Test every candidate against the remaining constraints; a candidate must pass ALL of them:
            // * [fullAcro] every character of the word holding the acronym must have a counterpart in the
            //       definition words, and the acronym must not be the tail or head of a longer run of capitals.
            // * [twoWordMin] there must be at least 2 definition words.
            // * [noCherryPicking] there cannot be any words larger than 3 letters lying between definition words.
            // * [noSkippedWords] there cannot be more than 3 words of length greater than 3 characters between
            //       the definition words and the acronym, and none of those words may contain a capital.
            // * [caseMatch] acronym and definition characters agree in case, or the acronym is all uppercase
            //       (digits allowed) and the definition characters all lowercase, or they agree apart from the
            //       first character when the text starts a sentence.
            // * [acroCase] the acronym is all uppercase, all lowercase, or has more uppercase than lowercase.
            // * [twoChars] a 2-letter acronym must consist of all consonants or of all vowels, so that it is
            //       not confused with an ordinary 2-letter word (like "to" or "so" or "at"). This may remove
            //       viable acronyms, but the risk of false positives is too high otherwise.
            // * [notShortWord] the acronym can't be among the hardwired list of common "small words".
            // * [notSymbol] the acronym can't be mistaken for a chemical symbol (like "Ne" or "He").
            const survivors = candidates.filter((candidate) => {
                const acro = candidate.acro;
                const def = candidate.def;
                const acroText = lettersAt(acro);
                const defText = lettersAt(def);

                let fullAcro = wordLength(wordIds[acro[0]]) === acro.length;
                // An uppercase letter in the original text directly before or after the acronym (for
                // example one cut off because startPos falls inside a word) means part of it was skipped.
                const firstAcroAt = startPos + txtPos[acro[0]];
                const lastAcroAt = startPos + txtPos[acro[acro.length - 1]];
                if (firstAcroAt > 0 && /[A-Z]/.test(text.charAt(firstAcroAt - 1))) {fullAcro = false; }
                if (/[A-Z]/.test(text.charAt(lastAcroAt + 1))) {fullAcro = false; }

                const defWordIds = def.map((p) => wordIds[p]);
                const twoWordMin = new Set(defWordIds).size >= 2;

                // Look at every word lying between definition words that contributed no character.
                let noCherryPicking = true;
                if (twoWordMin) {
                    for (let id = Math.min(...defWordIds) + 1; id < Math.max(...defWordIds); id++) {
                        if (defWordIds.indexOf(id) === -1 && wordLength(id) > 3) {noCherryPicking = false; }
                    }
                }

                // Collect the words lying strictly between the acronym and the definition.
                const acroFirstId = wordIds[acro[0]];
                const acroLastId = wordIds[acro[acro.length - 1]];
                const defFirstId = wordIds[def[0]];
                const defLastId = wordIds[def[def.length - 1]];
                let between = '';
                let gapIds = null;
                if (acroLastId < defFirstId - 1) {
                    gapIds = [acroLastId, defFirstId];
                } else if (defLastId < acroFirstId - 1) {
                    gapIds = [defLastId, acroFirstId]; }
                if (gapIds) {
                    between = txt.filter((ch, p) => wordIds[p] > gapIds[0] && wordIds[p] < gapIds[1])
                        .join('').replace(/[^A-Za-z0-9 ]/g,'').replace(/ +/g,' ').trim(); }
                let noSkippedWords = !/[A-Z]/.test(between);
                // Words of 3 characters or less don't count towards the limit.
                if (between.split(' ').filter((w) => w.length > 3).length > 3) {noSkippedWords = false; }

                const caseMatch = acroText === defText ||
                    (/^[A-Z0-9]+$/.test(acroText) && /^[a-z0-9]+$/.test(defText)) ||
                    (startSentence && acroText.slice(1) === defText.slice(1));

                const upperCount = (acroText.match(/[A-Z]/g) || []).length;
                const lowerCount = (acroText.match(/[a-z]/g) || []).length;
                const acroCase = /^[A-Z0-9]+$/.test(acroText) || /^[a-z0-9]+$/.test(acroText) ||
                    (upperCount > 0 && lowerCount > 0 && upperCount > lowerCount);

                let twoChars = acroText.length > 2;
                if (!twoChars) {
                    const kinds = new Set(acro.map((p) => (/[aeiou]/i.test(txt[p]) ? 'v' : 'c')));
                    twoChars = kinds.size === 1; }

                // NOTE(review): this comparison is case-sensitive, so "AT" is not treated as the small
                // word "at" -- confirm that is intended.
                const notShortWord = smallWords.indexOf(acroText) === -1;

                // Ask every translator entry that has both a .reg and a .symbol field whether it
                // recognizes the acronym text; if any does, the candidate is rejected.
                const notSymbol = xLtr.findIndex((z) => z.reg !== undefined && z.symbol !== undefined &&
                    z.indx(acroText, 0) != '') === -1;

                return fullAcro && twoWordMin && noCherryPicking && noSkippedWords && caseMatch &&
                    acroCase && twoChars && notShortWord && notSymbol;
            });
            if (survivors.length === 0) {return ''; }

            // If more than one possibility remains, keep those with the longest acronym; among those,
            // select the first one whose acronym and definition are closest together.
            const longest = Math.max(...survivors.map((c) => c.acro.length));
            const gapLength = (c) => {
                const span = c.acro[0] > c.def[0] ?
                    [c.def[c.def.length - 1] + 1, c.acro[0] + 1] :
                    [c.acro[c.acro.length - 1] + 1, c.def[0] + 1];
                return txt.slice(span[0], span[1]).join('').replace(/[^A-Za-z0-9 ]/g,'').length;
            };
            let best = null;
            let bestGap = Infinity;
            survivors.filter((c) => c.acro.length === longest).forEach((c) => {
                const gap = gapLength(c);
                if (best === null || gap < bestGap) {
                    best = c;
                    bestGap = gap; }
            });

            // The definition runs from the start of its first word to the end of its last word
            // (including the whitespace that trails the last word in the filtered text).
            const defStarts = best.def.map((p) => wordStarts[p]);
            const defBegin = Math.min(...defStarts);
            const lastWordStart = Math.max(...defStarts);
            const defEnd = lastWordStart + wordStarts.filter((s) => s === lastWordStart).length;
            const acroDef = txt.slice(defBegin, defEnd).join('')
                .replace(/[^A-Za-z0-9]/g,' ').replace(/ +/g,' ').trim();
            this.startDef = '' + (txtPos[defBegin] + startPos);
            this.endDef = '' + (txtPos[defEnd - 1] + 1 + startPos);
            if (best.acro[0] > best.def[0]) {
                this.endMatch = '' + (txtPos[best.acro[best.acro.length - 1]] + 1 + startPos);
            } else {
                this.endMatch = this.endDef; }
            return lettersAt(best.acro) + ' ' + acroDef.replace(/ /g,'_');
        } });
    return;
}
// ============================ end findAcronyms =============================
// ============================ findBibRefs ====================================
// %%%%%%%%%%%%%%%%%%%%%%%% passed 4/22/2019
function findBibRefs() {
  // Registers one translator entry of type "citation" in xLtr.
  //
  // The entry's indx(text, startPos) looks for a bibliography entry beginning at startPos: a list of author
  // last names followed by publication year, journal name, volume number and page number, for example
  //     Smith, A. S., Jones, T. E., and Miller, W. D. 2002, Nature, 145, 12
  // If such an entry is identified, a short citation is constructed from the author list and publication year.
  // The short citation is what the reference will likely look like in the body of the paper, and it acts as a
  // "re-router" to the words for which the short citation stands (authors, year, journal, volume, page).
  //
  // indx returns the lowercase short citation; when several spellings are plausible they are separated by "|":
  //     smithetal2002|smithjonesandmiller2002|smithjonesmiller2002
  // and it sets these fields on the entry:
  //     endMatch : position (as a string) just past the end of the bibliography entry, "-1" if no match
  //     authors  : unique author last names, sorted, joined with "_"
  //     pubYear  : publication year with any a/b/c suffix removed
  //     journal  : lowercase full journal name joined with "_" when a "journal" translator in xLtr matches the
  //                whole journal text; otherwise the journal text as it appears in the reference
  //     volume, page : volume and page numbers
  // When no bibliography entry is found, indx returns '' and the fields keep their reset values.
  //
  // NOTE(review): the position arithmetic below assumes filterTheText returns a JSON array
  // [filteredText, positions] where positions[i] is the index in the input of filtered character i -- confirm.
  xLtr.push({"type":"citation", "priority":"1",
    "indx":function(text, startPos) {
      this.endMatch = "-1";
      this.authors = "";
      this.pubYear = "";
      this.journal = "";
      this.page = "";
      this.volume = "";
      // Every edit below must preserve character positions, because at the end we need the actual position
      // of the end of the match so the caller can mask out the text. Removed characters are therefore
      // overwritten with blanks rather than deleted.
      const blank = s => s.replace(/[ -~]/g,' ');
      // Blank out a token (x2) and the delimiters in front of it (x1), keeping the trailing delimiters (x3).
      const dropToken = (x,x1,x2,x3) => x3 + blank(x1+x2);
      // Blank out a connecting word, but force a comma to be in its place.
      const forceComma = x => ',' + blank(x.slice(1));
      // Only look at (at most) the 5000 characters starting at startPos
      let txt = text.slice(startPos, startPos + 5000);
      // To reduce complications in identifying the bibliography, remove any Jr, Sr, I, II, etc
      txt = txt.replace(/([\, ]+)(jr\.?)([\, ]+)/ig, dropToken);
      txt = txt.replace(/([\, ]+)(sr\.?)([\, ]+)/ig, dropToken);
      txt = txt.replace(/([\, ]+)(i+\.?)([\, ]+)/ig, dropToken);
      // remove "et al" in the same way:
      txt = txt.replace(/([\, ]+)(et\.? *al\.?)([\, ]+)/ig, dropToken);
      // remove "and" and "&" ... if there is not a comma, force one to be there
      txt = txt.replace(/[\, ]+and[\, ]+/ig, forceComma);
      txt = txt.replace(/[\, ]+\&[\, ]+/ig, forceComma);
      // Hyphenated names are a problem, like Smith-Jones. Turn them into "Smithjones".
      txt = txt.replace(/([A-Z][a-z]+)( *\- *)([A-Z][a-z]+)/g, (x,x1,x2,x3) =>
        x1.charAt(0).toUpperCase() + x1.slice(1).toLowerCase() + x3.toLowerCase() + blank(x2));
      // Names like O'Smith are a problem. Turn them into "Osmith".
      txt = txt.replace(/([A-Za-z]+)( *\' *)([A-Za-z]+)/g, (x,x1,x2,x3) =>
        x1.charAt(0).toUpperCase() + x1.slice(1).toLowerCase() + x3.toLowerCase() + blank(x2));
      // If O'Smith got rendered as O Smith, catch that situation as well by turning O Smith into "Osmith".
      txt = txt.replace(/([A-Z])( *)([A-Z][a-z]+)/g, (x,x1,x2,x3) =>
        x1 + x3.toLowerCase() + blank(x2));
      // If an all-lowercase word precedes a capitalized word, scoot the lowercase characters into the
      // capitalized word, so that "van Smith" becomes "Vansmith".
      txt = txt.replace(/([\, ]+)([a-z]{2,5})( *)([A-Z][a-z]+)([\, ]+)/g, (x,x1,x2,x3,x4,x5) =>
        x1 + x2.charAt(0).toUpperCase() + x2.slice(1) + x4.toLowerCase() + x5 + x3);
      // There can only be 1 capital letter per last name, or the matching below fails. Any word that starts
      // with a lowercase letter but has an uppercase letter later on is forced to be all lowercase except for
      // the first letter, which is forced to be uppercase (vanSmith -> Vansmith).
      txt = txt.replace(/([\, ]+)([a-z]+)([A-Z])([A-Za-z]+)([\, ]+)/g, (x,x1,x2,x3,x4,x5) =>
        x1 + x2.charAt(0).toUpperCase() + x2.slice(1) + x3.toLowerCase() + x4.toLowerCase() + x5);
      // Likewise any word that starts with an uppercase letter but has other uppercase letters after some
      // lowercase ones (VanSmith -> Vansmith, OSmith -> Osmith). This messes up legitimate acronyms, but
      // that's OK because these changes are made to a local copy of the text only.
      txt = txt.replace(/([\, ]+)([A-Z]+)([a-z]+)([A-Z])([A-Za-z]*)([\, ]+)/g, (x,x1,x2,x3,x4,x5,x6) =>
        x1 + x2.charAt(0) + x2.slice(1).toLowerCase() + x3 + x4.toLowerCase() + x5.toLowerCase() + x6);
      // Remove capital letters followed by a period (with possible white space around the period) -- those
      // are likely to be initials. A comma is forced in their place; if the first digit of the year directly
      // follows the initials it is kept.
      txt = txt.replace(/([\, ]+)((?:[A-Z] *\. *){1,5})(\,? *[12]{0,1})/g, (x,x1,x2,x3) =>
        ',' + blank((x1+x2).slice(1)) + x3.replace(/[^0-9]/g,' '));
      // OK, now filter the text big-time, removing everything except letters, numbers and commas
      const t = JSON.parse(filterTheText(/\,/.source, txt));
      // Look for characters that look like a reference: Name,Name,Name,2022,JournalName,000,00
      // where 000,00 is the volume and page numbers, respectively.
      // Groups: 1 = author list, 2 = year, 3 = journal, 4 = volume, 5 = page
      const m = t[0].match(/^((?:(?:[A-Z][a-z]+\,){1,20}(?:[A-Z][a-z]+)?)|(?:[A-Z][a-z]+))(?:([12]\d\d\d[abc]{0,1})\,([A-Z][A-Za-z]{1,100})\,?(\d+)\,(\d+))/);
      if (!m) {return ''; }
      // If the reference is Smith, A. S., Jones, T. E., and Miller, W. D. 2002, Nature, 145, 12
      // the filtered view is Smith,Jones,Miller,2002,Nature,145,12 and the groups are
      // (Smith,Jones,Miller,)(2002)(Nature)(145)(12)
      let pubYear = m[2].trim();
      const volume = m[4];
      const page = m[5].replace(/[a-zA-Z]/g,''); // turn into pure number
      // Extract the journal name from the unfiltered text. In the filtered text the year ends at index
      // yearEnd, the comma required by the regex sits at yearEnd+1, and the journal occupies the next
      // m[3].length characters. (The comma was previously left out of this sum, which chopped the last
      // character off the journal name.)
      const yearEnd = m[1].length + m[2].length - 1;
      const journalEnd = yearEnd + 1 + m[3].length;
      const t1 = t[1][yearEnd] + 1;
      const t2 = t[1][journalEnd] + 1;
      let journal = txt.slice(t1,t2).replace(/[^A-Za-z0-9 ]/g,'').trim();
      // Determine the full name for this journal by consulting xLtr's "journal" entries; keep the entry
      // whose match reaches furthest into the journal text (first one wins on ties).
      const best = xLtr.reduce(function(acc, entry, idx) {
        if (entry.type === "journal" && entry.abb !== undefined && entry.indx(journal,0) !== "" &&
            parseFloat(entry.endMatch) > acc[1]) {
          return [idx, parseFloat(entry.endMatch), entry.name.toLowerCase()]; }
        return acc; }, [-1,-1,'']);
      if (best[0] !== -1 && best[1] === journal.length) { // perfect match!
        journal = best[2].split(' ').join('_'); }
      // extract the individual authors' last names:
      const authors = m[1].replace(/\,/g,' ').replace(/ +/g,' ').trim().split(' ');
      // Now create the citation phrase(s): what this bibliography reference will likely look like in the
      // text, e.g. "Smith et al 2010" or "Smith, Jones & Miller 2010". Use every format to be certain.
      let shortCit = '';
      if (authors.length > 3) {
        shortCit = authors[0] + 'etal' + pubYear;
      } else if (authors.length === 3) {
        shortCit = authors[0] + 'etal' + pubYear + '|' +
                   authors[0] + authors[1] + 'and' + authors[2] + pubYear + '|' +
                   authors[0] + authors[1] + authors[2] + pubYear; // 3 possibilities
      } else if (authors.length === 2) {
        shortCit = authors[0] + 'and' + authors[1] + pubYear + '|' +
                   authors[0] + authors[1] + pubYear; // 2 possibilities
      } else if (authors.length === 1) {
        shortCit = authors[0] + pubYear; }
      // the short citation keeps the a/b/c suffix of the year, the pubYear field does not
      pubYear = pubYear.replace(/[a-zA-Z]/g,'');
      this.endMatch = '' + (t[1][m[0].length-1] + 1 + startPos);
      this.authors = ([... new Set(authors)]).sort().join('_');
      this.pubYear = "" + pubYear;
      this.journal = journal;
      this.volume = volume;
      this.page = page;
      return shortCit.toLowerCase();
    } });
  return;
}
// ============================ end findBibRefs ===========================================
// ------------------------------ journalNames ------------------------------------------
// %%%%%%%%%%%%%%%%%%%%%%%% passed 4/22/2019
// big help: https://stackoverflow.com/questions/1234712/javascript-replace-with-reference-to-matched-group
// http://aramis.obspm.fr/~coulais/BibTeX/aas_macros.sty, https://cdsads.u-strasbg.fr/abs_doc/aas_macros.html
// The format for the journal name is to spell out the full name, and to capitalize
// the parts of the name that are "required" to be present for a match and/or that
// compose the abbreviation for that journal. Note that to get all possible realistic variations of
// a journal name, sometimes the journal name variations need to be explicitly stated as separate
// entries. for example, "Astronomical Society of the Pacific" might be stated as simply
// ASP. The code will match the "A" to Astronomical, the "S" to the "s" that comes right after
// the A in Astronomical, and the "P" gets matched to the P in Pacific. The code will set match to
// "false", because the "S" in Society was not matched. To avoid this scenario, a second entry is
// needed that just states the ASP explicitly.
// Clean up the submitted name to be matched
function journalNames() {
  // Reads the "astroJournals" resource and registers one translator of type "journal" in xLtr for every
  // data row found in it.
  //
  // Resource layout: if the resource contains lines made of 20 or more "=" characters, only the rows between
  // the first and the second such line are used (or everything after the line if there is only one);
  // otherwise every line is used. Each row holds whitespace-delimited columns:
  //     column 0 : full journal name, written without blanks (e.g. the_Astronomical_Journal)
  //     column 1 : abbreviation
  //     column 2 : page prefix
  // A column consisting of just "|" stands for an empty value.
  // See https://mirror.hmc.edu/ctan/macros/latex/contrib/mnras/mnras_guide.pdf regarding journal letters
  // that have a designation of "L" in front of the page numbers.
  //
  // Each registered entry has the fields reg (regex source), type, priority, name, abb, pagePref and a
  // function indx(text, startPos) that returns the lowercase full journal name when the journal regex
  // matches the text beginning at startPos (setting endMatch to the position just past the match), or ""
  // (with endMatch "-1") when it does not.
  let lines = GM_getResourceText("astroJournals").trim().split('\n');
  // clip out the data table from any comment blocks
  const clipHere = lines.reduce(function(acc, line, idx) {if (line.match(/\={20,}/)) {acc.push(idx);} return acc;}, []);
  if (clipHere.length >= 2) {
    lines = lines.slice(clipHere[0]+1,clipHere[1]);
  } else if (clipHere.length === 1) {
    lines = lines.slice(clipHere[0]+1); }

  // Builds the regex source for one full journal name. Example: for "the Astronomical Journal" the result is
  //   (?:(?:the )|(?:th )|t )?(?:(?:astronomical )|(?:astronomica )| ... |(?:as )|a ?)(?:(?:journal)| ... |(?:jo)|j)
  // Characters that equal their own uppercase form (capital letters, and therefore digits as well) are the
  // "required" characters of a word. A word without required characters is entirely optional. A word with
  // required characters may appear in full, or with letters dropped from its end down to the last required
  // character, or as its required characters only.
  const buildJournalRegex = function(fullName) {
    const words = fullName.split(" ");
    let pattern = '';
    for (let j = 0; j < words.length; j++) {
      const word = words[j];
      // every word but the last one is followed by a blank
      let sp = (j === words.length - 1) ? '' : / /.source;
      let alt = '';
      let reqLet = '';
      for (let k = word.length; k > 0; k--) {
        const ch = word.charAt(k-1);
        if (ch === ch.toUpperCase()) {reqLet = ch.toLowerCase() + reqLet;}
        // Once the first required character is hit (coming from the right side of the word) no further
        // alternatives are added, but the remaining required characters are still collected.
        if (!reqLet && k > 1) {
          alt = alt + '(?:' + word.slice(0,k).toLowerCase() + sp + ')|';
        } else if (!reqLet && k === 1) {
          alt = '(?:' + alt + word.charAt(0).toLowerCase() + sp + ')?'; }
      }
      // after the bare required characters the blank is optional ("a j" as well as "aj")
      if (sp) {sp = sp + '?';}
      if (reqLet.length > 1) {
        alt = '(?:' + alt + '(?:' + reqLet + sp + '))';
      } else if (reqLet) {
        alt = '(?:' + alt + reqLet + sp + ')'; }
      pattern = pattern + alt;
    }
    return pattern;
  };

  for (const rawLine of lines) {
    // get rid of any repeated spaces, then split the row into its columns
    const fields = rawLine.trim().replace(/ +/g,' ').trim().split(' ')
      .map(z => z.replace(/^\|$/,'')); // a field equal to just "|" means ""
    // Clean up the full name (in case some undesirable characters were included):
    fields[0] = fields[0].replace(/\&/g, " and ").replace(/\./g, " ").replace(/\W/g, " ").replace(/\_/g, " ").trim();
    // a blank row would register a translator with an empty regex, which matches anything -- skip it
    if (fields[0] === '') {continue;}
    xLtr.push({"reg":buildJournalRegex(fields[0]), "type":"journal", "priority":"1",
      "name":fields[0], "abb": fields[1], "pagePref": fields[2],
      "indx":function(text, startPos) {
        this.endMatch = "-1";
        // Match at startPos, like the other translators do. NOTE(review): assumes callers pass the full
        // text plus an offset, as the citation and coordinate translators expect -- confirm.
        const lowered = text.slice(startPos).toLowerCase();
        // NOTE(review): assumes filterTheText returns JSON [filteredText, positions], positions[i] being
        // the index in its input of filtered character i.
        const t = JSON.parse(filterTheText(this.reg, lowered));
        const m = t[0].match(new RegExp('^' + this.reg));
        // a zero-length match is no match (and would leave endMatch as NaN)
        if (m && m[0].length > 0) {
          // found a match to the journal name
          this.endMatch = t[1][m[0].length-1] + 1 + startPos;
          // return the full journal name (the abbreviation and page pref are accessible via the field names)
          return this.name.toLowerCase(); }
        return "";
      }});
  }
  return;
}
// --------------------------- end journalNames --------------------------------------
// ============ COORDINATES ==================
// %%%%%%%%%%%%%%%%%%%%%%%% passed 3/12/2019
function coordinates() {
  // Registers the sky-coordinate translators in xLtr: for each of four written formats, one entry of
  // type "ra" followed by one entry of type "dec". Both entries of a pair share the same regex source;
  // they differ only in which slots of extractRaDecVals' result they hand back.
  //
  // NOTE: the text is not filtered to eliminate white space as is done for other technical terms, because
  // white space delineating the ra and dec in the absence of a "+" or "-" is essential in recognizing
  // the text as a coordinate.

  // Creates the indx(text, startPos) function of one entry. On a match at startPos it sets endMatch to the
  // position just past the match (as a string), stores slot accSlot of extractRaDecVals' result in
  // this.accuracy and returns slot valSlot. Without a match it returns '' and endMatch stays "-1".
  const makeIndx = function(valSlot, accSlot) {
    return function(text, startPos) {
      this.endMatch = "-1";
      const tail = text.slice(startPos);
      const hit = tail.match(new RegExp('^' + '(?:' + this.reg + ')'));
      if (!hit) {return ''; }
      this.endMatch = "" + (startPos + hit[0].length);
      const vals = extractRaDecVals(this.reg, tail);
      this.accuracy = vals[accSlot];
      return vals[valSlot];
    };
  };

  // Joins the nine regex pieces into one source string and pushes the ra entry, then the dec entry.
  // NOTE(review): the colon format stores its count under "nVals" while the other three formats use
  // "nVars"; the consumer of this key is not visible here, so both spellings are kept as they were.
  const registerPair = function(pieces, nChars, countKey) {
    const pattern = (new RegExp(pieces.map(z => z.source).join(''))).source;
    [["ra", 0, 2], ["dec", 1, 3]].forEach(function(spec) {
      xLtr.push({"reg": pattern,
        "nChars": nChars, "type": spec[0], "priority": "1", "x": "", "xSupp": "", [countKey]: "2",
        "indx": makeIndx(spec[1], spec[2]) });
    });
  };

  // Format 1, colon separated (single-digit whole numbers allowed):
  //   (14)( : 5)( : 45)( . 876566)( )(76)( : 2)( : 15)( . 1234)
  registerPair([
    /(^(?:(?:[0-1][0-9])|(?:[0-9](?![0-9]))|(?:2[0-3])))/,   // 1  (14)
    /( *\: *(?:(?:[0-5][0-9])|(?:[0-9](?![0-9]))))/,         // 2  ( : 5)
    /( *\: *(?:(?:[0-5][0-9])|(?:[0-9](?![0-9]))))?/,        // 3  ( : 45)       optional
    /( *\. *[0-9]+)?/,                                       // 4  ( . 876566)   optional
    /( +[\+\-]{0,1} *)/,                                     // 5  ( )
    /((?:(?:[0-8][0-9])|(?:[0-9](?![0-9]))))/,               // 6  (76)
    /( *\: *(?:(?:[0-5][0-9])|(?:[0-9](?![0-9]))))/,         // 7  ( : 2)
    /( *\: *(?:(?:[0-5][0-9])|(?:[0-9](?![0-9]))))?/,        // 8  ( : 15)       optional
    /( *\. *[0-9]+)?/                                        // 9  ( . 1234)     optional
  ], "21", "nVals");

  // Format 2, with unit words:
  //   (04 hr)( 3 min)( 1 sec)( . 345 )( )(77 deg)( 35 min)( 5 sec)( . 11 )
  registerPair([
    /(^(?:(?:[0-1][0-9])|(?:[0-9](?![0-9]))|(?:2[0-3])) *(?:(?:hours)|(?:hour)|(?:hrs)|(?:hr)|(?:h)))/,   // 1  (04 hr)
    /( *(?:(?:[0-5][0-9])|(?:[0-9](?![0-9]))) *(?:(?:minutes)|(?:minute)|(?:mins)|(?:min)|(?:m))?)/,      // 2  ( 3 min?)
    /( *(?:(?:[0-5][0-9])|(?:[0-9](?![0-9]))) *(?:(?:seconds)|(?:second)|(?:sec)|(?:s))?)?/,              // 3  ( 1 sec)   optional
    /( *(?:\. *[0-9]+)? *(?:(?:minutes)|(?:minute)|(?:mins)|(?:min)|(?:m)|(?:seconds)|(?:second)|(?:sec)|(?:s)))?/,   // 4  ( . 345)   optional
    /( *[\+\-]{0,1}) */,                                                                                  // 5  ( )
    /((?:(?:[0-8][0-9])|(?:[0-9](?![0-9]))) *(?:(?:textdegree)|(?:circ)|(?:degrees)|(?:degree)|(?:degs)|(?:deg)|(?:d)))/,   // 6  (77 deg)
    /( *(?:(?:[0-5][0-9])|(?:[0-9](?![0-9]))) *(?:(?:minutes)|(?:minute)|(?:mins)|(?:min)|(?:m))?)/,      // 7  ( 35 min?)
    /( *(?:(?:[0-5][0-9])|(?:[0-9](?![0-9]))) *(?:(?:seconds)|(?:second)|(?:sec)|(?:s))?)?/,              // 8  ( 5 sec)   optional
    /( *(?:\. *[0-9]+)? *(?:(?:minutes)|(?:minute)|(?:mins)|(?:min)|(?:m)|(?:seconds)|(?:second)|(?:sec)|(?:s)))?/    // 9  ( . 11 )   optional
  ], "34", "nVars");

  // Format 3, blank separated:
  //   (14)( 5)( 45)( . 876566)( +)(76)( 2)( 15)( . 1234)
  registerPair([
    /(^(?:(?:[0-1][0-9])|(?:[0-9](?![0-9]))|(?:2[0-3])))/,   // 1  (14)
    /( +(?:(?:[0-5][0-9])|(?:[0-9](?![0-9]))))/,             // 2  ( 5)
    /( +(?:(?:[0-5][0-9])|(?:[0-9](?![0-9]))))?/,            // 3  ( 45)         optional
    /( *\. *[0-9]+)?/,                                       // 4  ( . 876566)   optional
    /( +[\+\-]{0,1} *)/,                                     // 5  ( +)
    /((?:[0-8][0-9])|(?:[0-9](?![0-9])))/,                   // 6  (76)
    /( +(?:(?:[0-5][0-9])|(?:[0-9](?![0-9]))))/,             // 7  ( 2)
    /( +(?:(?:[0-5][0-9])|(?:[0-9](?![0-9]))))?/,            // 8  ( 15)         optional
    /( *\. *[0-9]+)?/                                        // 9  ( . 1234)     optional
  ], "18", "nVars");

  // Format 4, packed two-digit fields with a mandatory sign:
  //   (14)(05)(45)(.876566)(-)(76)(15)(.1234)   ===>   '14:05:45.876556-76:15.1234'
  registerPair([
    /(^(?:(?:[0-1][0-9])|(?:2[0-3])))/,   // 1  (14)
    /([0-5][0-9])/,                       // 2  (05)
    /([0-5][0-9])?/,                      // 3  (45)        optional
    /(\.[0-9]+)?/,                        // 4  (.876566)   optional
    /([\+\-])/,                           // 5  (-)
    /([0-8][0-9])/,                       // 6  (76)
    /([0-5][0-9])/,                       // 7  (15)
    /([0-5][0-9])?/,                      // 8  ()          optional
    /(\.[0-9]+)?/                         // 9  (.1234)     optional
  ], "18", "nVars");
  return;
}
// - - - - - - - - EXTRACTRADECVALS - - - - - - - -
// %%%%%%%%%%%%%%%%%%%%%%%% passed 3/12/2019
function extractRaDecVals(reg,txt) {
var charPos = [];
var raDeg, decDeg;
var r1, r2, r3, d1, d2, d3,s;
var r2Acc,r3Acc,d2Acc,d3Acc;
var raAcc,decAcc;
var findMatch = txt.match(new RegExp(reg));
if (findMatch) {
r1 = findMatch[1].replace(/[^0-9]/g,'');
r2 = findMatch[2].replace(/[^0-9]/g,'');
r2Acc = 0.5;
r3 = '';
if (findMatch[3]) {
r3Acc = 0.5;
r3 = findMatch[3].replace(/[^0-9]/g,''); }
if (findMatch[4] && r3 != '') {
r3Acc = Math.pow(10,-1.0*(findMatch[4].replace(/[^0-9]/g,'').length));
r3 = r3 + findMatch[4].replace(/[^0-9\.]/g,'');
} else if(findMatch[4]) {
r2Acc = Math.pow(10,-1.0*(findMatch[4].replace(/[^0-9]/g,'').length));
r2 = r2 + findMatch[4].replace(/[^0-9\.]/g,''); }
if (r3 == '') {r3 = '0'; }
s = 1;
if (findMatch[5].replace(/[^+-]/g,'') == '-') {s = -1; }
d1 = findMatch[6].replace(/[^0-9]/g,'');
d2 = findMatch[7].replace(/[^0-9]/g,'');
d2Acc = 0.5;
d3 = '';
if (findMatch[8]) {
d3Acc = 0.5;
d3 = findMatch[8].replace(/[^0-9]/g,''); }
if (findMatch[9] && d3 != '') {