-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocess.m
More file actions
46 lines (38 loc) · 1.42 KB
/
preprocess.m
File metadata and controls
46 lines (38 loc) · 1.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
function outSentence = preprocess( inSentence, language )
%
%  preprocess
%
%  This function preprocesses the input text according to language-specific
%  rules. The intended pipeline is: separate contractions according to the
%  source language, convert all tokens to lower-case, and separate
%  end-of-sentence punctuation.
%
%  Currently implemented: lower-casing, wrapping the sentence in the
%  start/end markers, collapsing runs of whitespace to a single space, and the
%  final symbol conversion. The language-agnostic and language-specific
%  tokenisation rules are still marked TODO below.
%
%  INPUTS:
%       inSentence     : (string) the original sentence to be processed
%                                 (e.g., a line from the Hansard)
%       language       : (string) either 'e' (English) or 'f' (French)
%                                 according to the language of inSentence.
%                                 Any other value skips the language-specific
%                                 branch entirely.
%
%  OUTPUT:
%       outSentence    : (string) the modified sentence
%
%  REQUIRES:
%       The global struct CSC401_A2_DEFNS must already provide the fields
%       SENTSTART and SENTEND; they are read here but never set.
%
%  Template (c) 2011 Frank Rudzicz

  global CSC401_A2_DEFNS

  % first, convert the input sentence to lower-case and add sentence marks
  inSentence = [CSC401_A2_DEFNS.SENTSTART ' ' lower( inSentence ) ' ' CSC401_A2_DEFNS.SENTEND];

  % trim whitespaces down: every run of whitespace becomes one space
  inSentence = regexprep( inSentence, '\s+', ' ');

  % initialize outSentence
  outSentence = inSentence;

  % perform language-agnostic changes
  % TODO: your code here
  %    e.g., outSentence = regexprep( outSentence, 'TODO', 'TODO');

  switch language
   case 'e'
    % TODO: your code here
    % Debug trace of the sentence so far. Function names are case-sensitive,
    % so this must be the built-in 'disp' (a capitalised 'Disp' is undefined).
    disp(outSentence)

   case 'f'
    % TODO: your code here
    % Debug trace, same as the English branch.
    disp(outSentence)

  end

  % change unpleasant characters to codes that can be keys in dictionaries
  % (convertSymbols is defined outside this file)
  outSentence = convertSymbols( outSentence );