-
Notifications
You must be signed in to change notification settings - Fork 0
/
wordBagPack.m
31 lines (31 loc) · 1.27 KB
/
wordBagPack.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
function [wordBag, nGrams] = wordBagPack(preprocessedText, options)
% WORDBAGPACK Builds bag-of-words and bag-of-n-grams models from text
%   [wordBag, nGrams] = wordBagPack(preprocessedText)
%   [wordBag, nGrams] = wordBagPack(preprocessedText, Name, Value)
%
%   Thin wrapper around the Text Analytics Toolbox functions bagOfWords
%   and bagOfNgrams. Both models are built from the same tokenized input
%   and are then pruned of infrequent entries with removeInfrequentWords
%   and removeInfrequentNgrams, respectively.
%
%   ====INPUT=====
%   preprocessedText   tokenizedDocument   tokenized text of the document(s)
%
%   -options (name-value pairs)-
%   nWordRare     integer >= 0   count threshold handed to
%                                removeInfrequentWords (default: 3)
%   nGramRare     integer >= 0   count threshold handed to
%                                removeInfrequentNgrams (default: 2)
%   nGramLength   integer >= 1   length of the n-grams, e.g. 2 yields
%                                bi-grams such as 'neuron doctrine'
%                                (default: 2)
%
%   ====OUTPUT====
%   wordBag   bagOfWords    bag-of-words model of the tokenized text
%   nGrams    bagOfNgrams   bag-of-n-grams model of the tokenized text
%
%   Example:
%       docs = tokenizedDocument("the neuron doctrine is a neuron doctrine");
%       [wb, ng] = wordBagPack(docs, 'nWordRare', 0, 'nGramRare', 0);

    arguments
        preprocessedText tokenizedDocument
        % Thresholds are counts, so negative values are rejected up front
        % instead of being passed on to the toolbox functions.
        options.nWordRare {mustBeInteger, mustBeNonnegative} = 3
        options.nGramRare {mustBeInteger, mustBeNonnegative} = 2
        % An n-gram needs at least one token; zero or negative lengths are
        % rejected here rather than inside bagOfNgrams.
        options.nGramLength {mustBeInteger, mustBePositive} = 2
    end

    % Bag of words: count every token, then prune the rare ones.
    wordBag = bagOfWords(preprocessedText);
    wordBag = removeInfrequentWords(wordBag, options.nWordRare);

    % Bag of n-grams: same procedure on token sequences of the requested length.
    nGrams = bagOfNgrams(preprocessedText, 'NgramLengths', options.nGramLength);
    nGrams = removeInfrequentNgrams(nGrams, options.nGramRare);
end