Statistical-Machine-Translation/preprocess.m at master · ChoJohn/Statistical-Machine-Translation · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
function outSentence = preprocess( inSentence, language )
%
%  preprocess
%
%  This function preprocesses the input text according to language-specific rules.
%  The intended pipeline separates contractions according to the source language,
%  converts all tokens to lower-case, and separates end-of-sentence punctuation.
%
%  Current state of the implementation: the sentence is lower-cased, wrapped in
%  the sentence-start / sentence-end marks, runs of whitespace are collapsed to a
%  single space, and the result is passed through convertSymbols. The
%  language-agnostic and language-specific tokenization steps are still TODO
%  placeholders; each language branch only displays the sentence.
%
%  INPUTS:
%       inSentence     : (string) the original sentence to be processed
%                                 (e.g., a line from the Hansard)
%       language       : (string) either 'e' (English) or 'f' (French)
%                                 according to the language of inSentence.
%                                 Any other value skips the language-specific
%                                 step (the switch has no 'otherwise' branch).
%
%  OUTPUT:
%       outSentence    : (string) the modified sentence
%
%  NOTES:
%       Reads the global struct CSC401_A2_DEFNS (fields SENTSTART and SENTEND),
%       which must be populated by the caller's environment before this
%       function is invoked.
%
%  Template (c) 2011 Frank Rudzicz

  global CSC401_A2_DEFNS

  % first, convert the input sentence to lower-case and add sentence marks
  inSentence = [CSC401_A2_DEFNS.SENTSTART ' ' lower( inSentence ) ' ' CSC401_A2_DEFNS.SENTEND];

  % collapse every run of whitespace into a single space
  inSentence = regexprep( inSentence, '\s+', ' ');

  % initialize outSentence
  outSentence = inSentence;

  % perform language-agnostic changes
  % TODO: your code here
  %    e.g., outSentence = regexprep( outSentence, 'TODO', 'TODO');

  switch language
   case 'e'
    % TODO: your code here
    % MATLAB function names are case-sensitive: the built-in is 'disp', not 'Disp'
    disp(outSentence)
   case 'f'
    % TODO: your code here
    % MATLAB function names are case-sensitive: the built-in is 'disp', not 'Disp'
    disp(outSentence)
  end

  % change unpleasant characters to codes that can be keys in dictionaries
  outSentence = convertSymbols( outSentence );