-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathHtmlTokenizer.pas
More file actions
4750 lines (4334 loc) · 173 KB
/
HtmlTokenizer.pas
File metadata and controls
4750 lines (4334 loc) · 173 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
unit HtmlTokenizer;
interface
uses
ActiveX, Classes, SysUtils, Contnrs,
DomCore, Entities;
const
//Windows code page identifier for UTF-16 (little endian).
//Used as the default Encoding argument of TInputStream.Create(ByteStream, Encoding).
CP_UTF16 = 1200;
type
//The states of the HTML tokenizer state machine.
//Each member's trailing comment names the THtmlTokenizer handler method declared for that state,
//followed by the section number and title of the state in the HTML specification (13.2.5.x).
//The members are listed in specification order (13.2.5.1 through 13.2.5.80).
TTokenizerState = (
tsDataState, // DoDataState; //13.2.5.1 Data state
tsRCDataState, // DoRCDataState; //13.2.5.2 RCDATA state
tsRawTextState, // DoRawTextState; //13.2.5.3 RAWTEXT state
tsScriptDataState, // DoScriptDataState; //13.2.5.4 Script data state
tsPlaintextState, // DoPlaintextState; //13.2.5.5 PLAINTEXT state
tsTagOpenState, // DoTagOpenState; //13.2.5.6 Tag open state
tsEndTagOpenState, // DoEndTagOpenState; //13.2.5.7 End tag open state
tsTagNameState, // DoTagNameState; //13.2.5.8 Tag name state
tsRCDATALessThanSignState, // DoRCDATALessThanSignState; //13.2.5.9 RCDATA less-than sign state
tsRCDATAEndTagOpenState, // DoRCDATAEndTagOpenState; //13.2.5.10 RCDATA end tag open state
tsRCDATAEndTagNameState, // DoRCDATAEndTagNameState; //13.2.5.11 RCDATA end tag name state
tsRAWTEXTLessThanSignState, // DoRAWTEXTLessThanSignState; //13.2.5.12 RAWTEXT less-than sign state
tsRAWTEXTEndTagOpenState, // DoRAWTEXTEndTagOpenState; //13.2.5.13 RAWTEXT end tag open state
tsRAWTEXTEndTagNameState, // DoRAWTEXTEndTagNameState; //13.2.5.14 RAWTEXT end tag name state
tsScriptDataLessThanSignState, // DoScriptDataLessThanSignState; //13.2.5.15 Script data less-than sign state
tsScriptDataEndTagOpenState, // DoScriptDataEndTagOpenState; //13.2.5.16 Script data end tag open state
tsScriptDataEndTagNameState, // DoScriptDataEndTagNameState; //13.2.5.17 Script data end tag name state
tsScriptDataEscapeStartState, // DoScriptDataEscapeStartState; //13.2.5.18 Script data escape start state
tsScriptDataEscapeStartDashState, // DoScriptDataEscapeStartDashState; //13.2.5.19 Script data escape start dash state
tsScriptDataEscapedState, // DoScriptDataEscapedState; //13.2.5.20 Script data escaped state
tsScriptDataEscapedDashState, // DoScriptDataEscapedDashState; //13.2.5.21 Script data escaped dash state
tsScriptDataEscapedDashDashState, // DoScriptDataEscapedDashDashState; //13.2.5.22 Script data escaped dash dash state
tsScriptDataEscapedLessThanSignState, // DoScriptDataEscapedLessThanSignState; //13.2.5.23 Script data escaped less-than sign state
tsScriptDataEscapedEndTagOpenState, // DoScriptDataEscapedEndTagOpenState; //13.2.5.24 Script data escaped end tag open state
tsScriptDataEscapedEndTagNameState, // DoScriptDataEscapedEndTagNameState; //13.2.5.25 Script data escaped end tag name state
tsScriptDataDoubleEscapeStartState, // DoScriptDataDoubleEscapeStartState; //13.2.5.26 Script data double escape start state
tsScriptDataDoubleEscapedState, // DoScriptDataDoubleEscapedState; //13.2.5.27 Script data double escaped state
tsScriptDataDoubleEscapedDashState, // DoScriptDataDoubleEscapedDashState; //13.2.5.28 Script data double escaped dash state
tsScriptDataDoubleEscapedDashDashState, // DoScriptDataDoubleEscapedDashDashState; //13.2.5.29 Script data double escaped dash dash state
tsScriptDataDoubleEscapedLessThanSignState, // DoScriptDataDoubleEscapedLessThanSignState; //13.2.5.30 Script data double escaped less-than sign state
tsScriptDataDoubleEscapeEndState, // DoScriptDataDoubleEscapeEndState; //13.2.5.31 Script data double escape end state
tsBeforeAttributeNameState, // DoBeforeAttributeNameState; //13.2.5.32 Before attribute name state
tsAttributeNameState, // DoAttributeNameState; //13.2.5.33 Attribute name state
tsAfterAttributeNameState, // DoAfterAttributeNameState; //13.2.5.34 After attribute name state
tsBeforeAttributeValueState, // DoBeforeAttributeValueState; //13.2.5.35 Before attribute value state
tsAttributeValueDoubleQuotedState, // DoAttributeValueDoubleQuotedState; //13.2.5.36 Attribute value (double-quoted) state
tsAttributeValueSingleQuotedState, // DoAttributeValueSingleQuotedState; //13.2.5.37 Attribute value (single-quoted) state
tsAttributeValueUnquotedState, // DoAttributeValueUnquotedState; //13.2.5.38 Attribute value (unquoted) state
tsAfterAttributeValueQuotedState, // DoAfterAttributeValueQuotedState; //13.2.5.39 After attribute value (quoted) state
tsSelfClosingStartTagState, // DoSelfClosingStartTagState; //13.2.5.40 Self-closing start tag state
tsBogusCommentState, // DoBogusCommentState; //13.2.5.41 Bogus comment state
tsMarkupDeclarationOpenState, // DoMarkupDeclarationOpenState; //13.2.5.42 Markup declaration open state
tsCommentStartState, // DoCommentStartState; //13.2.5.43 Comment start state
tsCommentStartDashState, // DoCommentStartDashState; //13.2.5.44 Comment start dash state
tsCommentState, // DoCommentState; //13.2.5.45 Comment state
tsCommentLessThanSignState, // DoCommentLessThanSignState; //13.2.5.46 Comment less-than sign state
tsCommentLessThanSignBangState, // DoCommentLessThanSignBangState; //13.2.5.47 Comment less-than sign bang state
tsCommentLessThanSignBangDashState, // DoCommentLessThanSignBangDashState; //13.2.5.48 Comment less-than sign bang dash state
tsCommentLessThanSignBangDashDashState, // DoCommentLessThanSignBangDashDashState; //13.2.5.49 Comment less-than sign bang dash dash state
tsCommentEndDashState, // DoCommentEndDashState; //13.2.5.50 Comment end dash state
tsCommentEndState, // DoCommentEndState; //13.2.5.51 Comment end state
tsCommentEndBangState, // DoCommentEndBangState; //13.2.5.52 Comment end bang state
tsDOCTYPEState, // DoDOCTYPEState; //13.2.5.53 DOCTYPE state
tsBeforeDOCTYPENameState, // DoBeforeDOCTYPENameState; //13.2.5.54 Before DOCTYPE name state
tsDOCTYPENameState, // DoDOCTYPENameState; //13.2.5.55 DOCTYPE name state
tsAfterDOCTYPENameState, // DoAfterDOCTYPENameState; //13.2.5.56 After DOCTYPE name state
tsAfterDOCTYPEPublicKeywordState, // DoAfterDOCTYPEPublicKeywordState; //13.2.5.57 After DOCTYPE public keyword state
tsBeforeDOCTYPEPublicIdentifierState, // DoBeforeDOCTYPEPublicIdentifierState; //13.2.5.58 Before DOCTYPE public identifier state
tsDOCTYPEPublicIdentifierDoubleQuotedState, // DoDOCTYPEPublicIdentifierDoubleQuotedState; //13.2.5.59 DOCTYPE public identifier (double-quoted) state
tsDOCTYPEPublicIdentifierSingleQuotedState, // DoDOCTYPEPublicIdentifierSingleQuotedState; //13.2.5.60 DOCTYPE public identifier (single-quoted) state
tsAfterDOCTYPEPublicIdentifierState, // DoAfterDOCTYPEPublicIdentifierState; //13.2.5.61 After DOCTYPE public identifier state
tsBetweenDOCTYPEPublicAndSystemIdentifiersState, // DoBetweenDOCTYPEPublicAndSystemIdentifiersState; //13.2.5.62 Between DOCTYPE public and system identifiers state
tsAfterDOCTYPESystemKeywordState, // DoAfterDOCTYPESystemKeywordState; //13.2.5.63 After DOCTYPE system keyword state
tsBeforeDOCTYPESystemIdentifierState, // DoBeforeDOCTYPESystemIdentifierState; //13.2.5.64 Before DOCTYPE system identifier state
tsDOCTYPESystemIdentifierDoubleQuotedState, // DoDOCTYPESystemIdentifierDoubleQuotedState; //13.2.5.65 DOCTYPE system identifier (double-quoted) state
tsDOCTYPESystemIdentifierSingleQuotedState, // DoDOCTYPESystemIdentifierSingleQuotedState; //13.2.5.66 DOCTYPE system identifier (single-quoted) state
tsAfterDOCTYPESystemIdentifierState, // DoAfterDOCTYPESystemIdentifierState; //13.2.5.67 After DOCTYPE system identifier state
tsBogusDOCTYPEState, // DoBogusDOCTYPEState; //13.2.5.68 Bogus DOCTYPE state
tsCDATASectionState, // DoCDATASectionState; //13.2.5.69 CDATA section state
tsCDATASectionBracketState, // DoCDATASectionBracketState; //13.2.5.70 CDATA section bracket state
tsCDATASectionEndState, // DoCDATASectionEndState; //13.2.5.71 CDATA section end state
tsCharacterReferenceState, // DoCharacterReferenceState; //13.2.5.72 Character reference state
tsNamedCharacterReferenceState, // DoNamedCharacterReferenceState; //13.2.5.73 Named character reference state
tsAmbiguousAmpersandState, // DoAmbiguousAmpersandState; //13.2.5.74 Ambiguous ampersand state
tsNumericCharacterReferenceState, // DoNumericCharacterReferenceState; //13.2.5.75 Numeric character reference state
tsHexadecimalCharacterReferenceStartState, // DoHexadecimalCharacterReferenceStartState; //13.2.5.76 Hexadecimal character reference start state
tsDecimalCharacterReferenceStartState, // DoDecimalCharacterReferenceStartState; //13.2.5.77 Decimal character reference start state
tsHexadecimalCharacterReferenceState, // DoHexadecimalCharacterReferenceState; //13.2.5.78 Hexadecimal character reference state
tsDecimalCharacterReferenceState, // DoDecimalCharacterReferenceState; //13.2.5.79 Decimal character reference state
tsNumericCharacterReferenceEndState // DoNumericCharacterReferenceEndState; //13.2.5.80 Numeric character reference end state
);
//The output of the tokenization step is a series of zero or more of the following tokens.
//Each member identifies one kind of token; the class named in the trailing comment is the
//THtmlToken descendant declared in this unit for that kind.
THtmlTokenType = (
ttDocType, //DOCTYPE (TDocTypeToken)
ttStartTag, //start tag (TStartTagToken)
ttEndTag, //end tag (TEndTagToken)
ttComment, //comment (TCommentToken)
ttCharacter, //character (TCharacterToken)
ttEndOfFile //end-of-file (TEndOfFileToken)
);
{
The InputStream supplies a series of Unicode UCS4 characters to the tokenizer.
The InputStream also takes care of converting any CRLF into LF (as CR is never allowed to reach the HTML tokenizer)

It can be constructed either from an in-memory UnicodeString or from an ISequentialStream
of bytes plus an encoding (which defaults to CP_UTF16).
}
TInputStream = class
private
//The stream the characters are pulled from.
FStream: ISequentialStream;
//Presumably the code page of the bytes in FStream (see the Encoding constructor argument) - TODO confirm against the implementation
FEncoding: Word;
//Backing field of the EOF property
FEOF: Boolean;
//Look-ahead storage for characters; see FBufferPosition and FBufferSize
FBuffer: UCS4String;
FBufferPosition: Integer; //the index into FBuffer that is the "current" position
FBufferSize: Integer; //the number of characters in the ring buffer (from FBufferPosition) that are valid
function IsSurrogate(const n: Word): Boolean;
function GetNextCharacterFromStream: UCS4Char;
function GetNextUTF16Character: UCS4Char;
function Consume: UCS4Char;
//NOTE(review): the name reads like a typo of "IntoBuffer"; left as-is because the implementation is outside this view
function FetchNextCharacterInfoBuffer: Boolean;
procedure LogFmt(const s: string; const Args: array of const);
public
//Create an input stream over an in-memory string
constructor Create(const Html: UnicodeString); overload;
//Create an input stream over a stream of bytes in the given encoding
constructor Create(ByteStream: ISequentialStream; Encoding: Word=CP_UTF16); overload;
function TryRead(out ch: UCS4Char): Boolean; //Returns the next UCS4 character value.
//k is 1-based: THtmlTokenizer.NextFewCharacters calls Peek(1) for the first character after the current one
function Peek(k: Integer): UCS4Char; //peek the k-th upcoming character
property EOF: Boolean read FEOF;
end;
{
The base class for the tokens emitted by the tokenizer

THtmlToken
	- TDocTypeToken
	- TTagToken
		- TStartTagToken
		- TEndTagToken
	- TCommentToken
	- TCharacterToken
	- TEndOfFileToken
}
THtmlToken = class
protected
//Virtual getter behind the Description property; descendants override it
function GetDescription: string; virtual;
public
//Which kind of token this is; supplied to the constructor
TokenType: THtmlTokenType;
constructor Create(ATokenType: THtmlTokenType);
destructor Destroy; override;
//Text description of the token, produced by GetDescription
property Description: string read GetDescription;
end;
//DOCTYPE token: carries a name, a public identifier, a system identifier and a force-quirks flag.
TDocTypeToken = class(THtmlToken)
private
FName: UnicodeString;
FPublicIdentifier: UnicodeString;
FSystemIdentifier: UnicodeString;
FForceQuirks: Boolean;
//The three "missing" flags back the read-only xxxMissing properties below.
//Presumably they distinguish "never set" from "set to an empty string" - TODO confirm against the setters
FNameMissing: Boolean;
FPublicIdentifierMissing: Boolean;
FSystemIdentifierMissing: Boolean;
procedure SetPublicIdentifier(const Value: UnicodeString);
procedure SetSystemIdentifier(const Value: UnicodeString);
procedure SetName(const Value: UnicodeString);
protected
//Character-at-a-time builders, taking a UCS4 code point
procedure AppendName(const ch: UCS4Char); //to the Name
procedure AppendPublicIdentifier(const ch: UCS4Char);
procedure AppendSystemIdentifier(const ch: UCS4Char);
function GetDescription: string; override;
public
constructor Create;
property Name: UnicodeString read FName write SetName;
property PublicIdentifier: UnicodeString read FPublicIdentifier write SetPublicIdentifier;
property SystemIdentifier: UnicodeString read FSystemIdentifier write SetSystemIdentifier;
property NameMissing: Boolean read FNameMissing;
property PublicIdentifierMissing: Boolean read FPublicIdentifierMissing;
property SystemIdentifierMissing: Boolean read FSystemIdentifierMissing;
property ForceQuirks: Boolean read FForceQuirks write FForceQuirks;
end;
// HTML5: "If designed today they would just have a name and value."
//A single name/value pair belonging to a tag token (TTagToken keeps a list of these).
TAttribute = class
private
//NOTE(review): not read or written anywhere in the visible part of the unit; purpose to be confirmed
FIsRemoved: Boolean;
public
Name: UnicodeString;
Value: UnicodeString;
end;
// TAttributes = array of TAttribute;
//Base class of StartTagToken and EndTagToken
//Holds what both kinds of tag share: a tag name, a list of attributes and a self-closing flag.
TTagToken = class(THtmlToken)
private
FTagName: UnicodeString;
FAttributes: TObjectList; //of TAttribute objects
FSelfClosing: Boolean;
function GetAttributes(i: Integer): TAttribute;
function GetAttributeCount: Integer;
protected
//Returns a TAttribute; presumably a fresh one added to FAttributes - TODO confirm against the implementation
function NewAttribute: TAttribute;
function GetDescription: string; override;
public
constructor Create(ATokenType: THtmlTokenType);
destructor Destroy; override;
procedure AppendCharacter(const ch: UCS4Char); //append to TagName
procedure AddAttribute(Name, Value: UnicodeString);
property TagName: UnicodeString read FTagName write FTagName;
//Indexed access to the attributes; pair with AttributeCount to enumerate them
property Attributes[n: Integer]: TAttribute read GetAttributes;
property AttributeCount: Integer read GetAttributeCount;
property SelfClosing: Boolean read FSelfClosing write FSelfClosing;
end;
//Start tag token (THtmlTokenType ttStartTag); everything except AcknowledgeSelfClosing is inherited from TTagToken.
TStartTagToken = class(TTagToken)
public
constructor Create;
//Presumably how the consumer acknowledges the token's self-closing flag - TODO confirm against the implementation
procedure AcknowledgeSelfClosing;
end;
//End tag token (THtmlTokenType ttEndTag); overrides only the description of TTagToken.
TEndTagToken = class(TTagToken)
protected
function GetDescription: string; override;
public
constructor Create;
end;
//Comment token (THtmlTokenType ttComment); its text is built up one code point at a time.
TCommentToken = class(THtmlToken)
private
FData: UnicodeString;
protected
function GetDescription: string; override;
public
constructor Create;
procedure AppendCharacter(const ch: UCS4Char); //to Data
//The comment text (backed by FData)
property DataString: UnicodeString read FData write FData;
end;
//Character token (THtmlTokenType ttCharacter).
//Data is a UnicodeString, so one token can hold more than one UTF-16 code unit.
TCharacterToken = class(THtmlToken)
private
FData: UnicodeString;
protected
function GetDescription: string; override;
public
constructor Create;
//Appends one UCS4 code point; presumably to Data - TODO confirm against the implementation
procedure AppendCharacter(const ch: UCS4Char);
property Data: UnicodeString read FData write FData;
end;
//End-of-file token (THtmlTokenType ttEndOfFile).
TEndOfFileToken = class(THtmlToken)
protected
function GetDescription: string; override;
public
//NOTE(review): nothing in the visible part of the unit reads or writes this field; purpose to be confirmed
Data: UCS4String;
constructor Create;
end;
//Signature of the THtmlTokenizer.OnToken event handler: Sender plus the token being reported.
TTokenEvent = procedure(Sender: TObject; AToken: THtmlToken) of object;
{
  THtmlTokenizer: the tokenization stage of HTML parsing.

  Input characters come from a TInputStream (FStream). The state machine has one
  handler method (DoXxxState) per member of TTokenizerState. OnToken is the event
  through which tokens are reported, and OnParseError is fired by AddParseError
  for every parse error.

  OnParseError is a TNotifyEvent, so the handler receives only the Sender; the name
  of the parse error is not passed to it (it is only written to the debug log when
  LOG_TOKENIZER_PARSEERROR is set in LoggingFlags).
}
THtmlTokenizer = class
  private
    FLogFlags: Cardinal;                  //bitmask of LOG_TOKENIZER_xxx values; see LoggingFlags
    FStream: TInputStream;                //source of input characters
    FState2: TTokenizerState;
    FReturnState2: TTokenizerState;
    FCurrentInputCharacter: UCS4Char;     //backing field of CurrentInputCharacter
    FCurrentToken2: THtmlToken;           //backing field of CurrentToken
    FCharacterReferenceCode: Cardinal;
    FCurrentAttribute: TAttribute;        //target of AppendToCurrentAttributeName/Value; those raise if it is nil
    FReconsume: Boolean;
    FEOF: Boolean;
    FTemporaryBuffer: UCS4String;
    FNameOfLastEmittedStartTag: string;
    FDelayedEmitToken: TCharacterToken;
    FParserPause: Boolean;

    FOnToken: TTokenEvent; //event handler
    FOnParseError: TNotifyEvent; //event handler, fired by AddParseError

    procedure Initialize;
    procedure AddParseError(ParseErrorName: string);
    procedure AddNotImplementedParseError(const StateHandlerName: string);

    function ExtractCurrentToken: THtmlToken;
    procedure FlushDelayedTokens;
    procedure FinalizeCurrentAttribute;

    function GetNext: UCS4Char;
    function IsSurrogate(const ch: UCS4Char): Boolean;
    function IsNonCharacter(const ch: UCS4Char): Boolean;
    function IsControlC0Character(const ch: UCS4Char): Boolean;
    function IsControlCharacter(const ch: UCS4Char): Boolean;
    procedure SetCurrentToken(const Value: THtmlToken); //setter of the CurrentToken property

    //The output of the tokenization step is a series of zero or more of the following tokens:
    //    DOCTYPE, start tag, end tag, comment, character, end-of-file.
    //    DOCTYPE tokens have a name, a public identifier, a system identifier, and a force-quirks flag.
    procedure EmitToken(const AToken: THtmlToken);
    procedure EmitCurrentToken; //Whatever the current token is
    procedure EmitCurrentDocTypeToken; //Emit the current DOCTYPE token
    procedure EmitCurrentTagToken; //Emits the current token (whether it be a StartTag or EndTag)
    procedure EmitStartTag(StartTag: TStartTagToken); //Emit the current StartTag token
    procedure EmitEndTag(EndTag: TEndTagToken); //Emit the current EndTag token
    procedure EmitCurrentCommentToken; //Emit the current Comment token
    procedure EmitCharacter(const Character: UCS4Char); //Emit a Character token
    procedure EmitEndOfFileToken; //Emit an EndOfFile token

    procedure Reconsume(NewTokenizerState: TTokenizerState);
    procedure SetReturnState(const State: TTokenizerState);
    function Consume: UCS4Char;
    function Peek: UCS4Char;
    function NextFewCharacters(const Value: UnicodeString; const CaseSensitive: Boolean; const IncludingCurrentInputCharacter: Boolean): Boolean;
    function GetCurrentTagToken: TTagToken;

    function TemporaryBufferIs(const Value: UnicodeString): Boolean;
    procedure AppendToTemporaryBuffer(const Value: UCS4Char);
    procedure AppendToCurrentAttributeName(const Value: UCS4Char);
    procedure AppendToCurrentAttributeValue(const Value: UCS4Char);
    procedure AppendToCurrentCommentData(const Value: UCS4Char);
    procedure FlushCodePointsConsumed;

    function IsAppropriateEndTag(const EndTagToken: TEndTagToken): Boolean;
    function IsConsumedAsPartOfAnAttribute: Boolean;

    procedure LogFmt(const Fmt: string; const Args: array of const);

    property CurrentInputCharacter: UCS4Char read FCurrentInputCharacter; //The current input character is the last character to have been consumed.
    property CurrentToken: THtmlToken read FCurrentToken2 write SetCurrentToken;
    property CurrentTagToken: TTagToken read GetCurrentTagToken; //The current tag token (either TStartTagtoken or TEndTagToken)
  private
    //Tokenizer state machine handlers
    procedure DoDataState; //13.2.5.1 Data state
    procedure DoRCDATAState; //13.2.5.2 RCDATA state
    procedure DoRawTextState; //13.2.5.3 RAWTEXT state
    procedure DoScriptDataState; //13.2.5.4 Script data state
    procedure DoPlaintextState; //13.2.5.5 PLAINTEXT state
    procedure DoTagOpenState; //13.2.5.6 Tag open state
    procedure DoEndTagOpenState; //13.2.5.7 End tag open state
    procedure DoTagNameState; //13.2.5.8 Tag name state
    procedure DoRCDATALessThanSignState; //13.2.5.9 RCDATA less-than sign state
    procedure DoRCDATAEndTagOpenState; //13.2.5.10 RCDATA end tag open state
    procedure DoRCDATAEndTagNameState; //13.2.5.11 RCDATA end tag name state
    procedure DoRAWTEXTLessThanSignState; //13.2.5.12 RAWTEXT less-than sign state
    procedure DoRAWTEXTEndTagOpenState; //13.2.5.13 RAWTEXT end tag open state
    procedure DoRAWTEXTEndTagNameState; //13.2.5.14 RAWTEXT end tag name state
    procedure DoScriptDataLessThanSignState; //13.2.5.15 Script data less-than sign state
    procedure DoScriptDataEndTagOpenState; //13.2.5.16 Script data end tag open state
    procedure DoScriptDataEndTagNameState; //13.2.5.17 Script data end tag name state
    procedure DoScriptDataEscapeStartState; //13.2.5.18 Script data escape start state
    procedure DoScriptDataEscapeStartDashState; //13.2.5.19 Script data escape start dash state
    procedure DoScriptDataEscapedState; //13.2.5.20 Script data escaped state
    procedure DoScriptDataEscapedDashState; //13.2.5.21 Script data escaped dash state
    procedure DoScriptDataEscapedDashDashState; //13.2.5.22 Script data escaped dash dash state
    procedure DoScriptDataEscapedLessThanSignState; //13.2.5.23 Script data escaped less-than sign state
    procedure DoScriptDataEscapedEndTagOpenState; //13.2.5.24 Script data escaped end tag open state
    procedure DoScriptDataEscapedEndTagNameState; //13.2.5.25 Script data escaped end tag name state
    procedure DoScriptDataDoubleEscapeStartState; //13.2.5.26 Script data double escape start state
    procedure DoScriptDataDoubleEscapedState; //13.2.5.27 Script data double escaped state
    procedure DoScriptDataDoubleEscapedDashState; //13.2.5.28 Script data double escaped dash state
    procedure DoScriptDataDoubleEscapedDashDashState; //13.2.5.29 Script data double escaped dash dash state
    procedure DoScriptDataDoubleEscapedLessThanSignState; //13.2.5.30 Script data double escaped less-than sign state
    procedure DoScriptDataDoubleEscapeEndState; //13.2.5.31 Script data double escape end state
    procedure DoBeforeAttributeNameState; //13.2.5.32 Before attribute name state
    procedure DoAttributeNameState; //13.2.5.33 Attribute name state
    procedure DoAfterAttributeNameState; //13.2.5.34 After attribute name state
    procedure DoBeforeAttributeValueState; //13.2.5.35 Before attribute value state
    procedure DoAttributeValueDoubleQuotedState; //13.2.5.36 Attribute value (double-quoted) state
    procedure DoAttributeValueSingleQuotedState; //13.2.5.37 Attribute value (single-quoted) state
    procedure DoAttributeValueUnquotedState; //13.2.5.38 Attribute value (unquoted) state
    procedure DoAfterAttributeValueQuotedState; //13.2.5.39 After attribute value (quoted) state
    procedure DoSelfClosingStartTagState; //13.2.5.40 Self-closing start tag state
    procedure DoBogusCommentState; //13.2.5.41 Bogus comment state
    procedure DoMarkupDeclarationOpenState; //13.2.5.42 Markup declaration open state
    procedure DoCommentStartState; //13.2.5.43 Comment start state
    procedure DoCommentStartDashState; //13.2.5.44 Comment start dash state
    procedure DoCommentState; //13.2.5.45 Comment state
    procedure DoCommentLessThanSignState; //13.2.5.46 Comment less-than sign state
    procedure DoCommentLessThanSignBangState; //13.2.5.47 Comment less-than sign bang state
    procedure DoCommentLessThanSignBangDashState; //13.2.5.48 Comment less-than sign bang dash state
    procedure DoCommentLessThanSignBangDashDashState; //13.2.5.49 Comment less-than sign bang dash dash state
    procedure DoCommentEndDashState; //13.2.5.50 Comment end dash state
    procedure DoCommentEndState; //13.2.5.51 Comment end state
    procedure DoCommentEndBangState; //13.2.5.52 Comment end bang state
    procedure DoDOCTYPEState; //13.2.5.53 DOCTYPE state
    procedure DoBeforeDOCTYPENameState; //13.2.5.54 Before DOCTYPE name state
    procedure DoDOCTYPENameState; //13.2.5.55 DOCTYPE name state
    procedure DoAfterDOCTYPENameState; //13.2.5.56 After DOCTYPE name state
    procedure DoAfterDOCTYPEPublicKeywordState; //13.2.5.57 After DOCTYPE public keyword state
    procedure DoBeforeDOCTYPEPublicIdentifierState; //13.2.5.58 Before DOCTYPE public identifier state
    procedure DoDOCTYPEPublicIdentifierDoubleQuotedState; //13.2.5.59 DOCTYPE public identifier (double-quoted) state
    procedure DoDOCTYPEPublicIdentifierSingleQuotedState; //13.2.5.60 DOCTYPE public identifier (single-quoted) state
    procedure DoAfterDOCTYPEPublicIdentifierState; //13.2.5.61 After DOCTYPE public identifier state
    procedure DoBetweenDOCTYPEPublicAndSystemIdentifiersState; //13.2.5.62 Between DOCTYPE public and system identifiers state
    procedure DoAfterDOCTYPESystemKeywordState; //13.2.5.63 After DOCTYPE system keyword state
    procedure DoBeforeDOCTYPESystemIdentifierState; //13.2.5.64 Before DOCTYPE system identifier state
    procedure DoDOCTYPESystemIdentifierDoubleQuotedState; //13.2.5.65 DOCTYPE system identifier (double-quoted) state
    procedure DoDOCTYPESystemIdentifierSingleQuotedState; //13.2.5.66 DOCTYPE system identifier (single-quoted) state
    procedure DoAfterDOCTYPESystemIdentifierState; //13.2.5.67 After DOCTYPE system identifier state
    procedure DoBogusDOCTYPEState; //13.2.5.68 Bogus DOCTYPE state
    procedure DoCDATASectionState; //13.2.5.69 CDATA section state
    procedure DoCDATASectionBracketState; //13.2.5.70 CDATA section bracket state
    procedure DoCDATASectionEndState; //13.2.5.71 CDATA section end state
    procedure DoCharacterReferenceState; //13.2.5.72 Character reference state
    procedure DoNamedCharacterReferenceState; //13.2.5.73 Named character reference state
    procedure DoAmbiguousAmpersandState; //13.2.5.74 Ambiguous ampersand state
    procedure DoNumericCharacterReferenceState; //13.2.5.75 Numeric character reference state
    procedure DoHexadecimalCharacterReferenceStartState; //13.2.5.76 Hexadecimal character reference start state
    procedure DoDecimalCharacterReferenceStartState; //13.2.5.77 Decimal character reference start state
    procedure DoHexadecimalCharacterReferenceState; //13.2.5.78 Hexadecimal character reference state
    procedure DoDecimalCharacterReferenceState; //13.2.5.79 Decimal character reference state
    procedure DoNumericCharacterReferenceEndState; //13.2.5.80 Numeric character reference end state
  public
    constructor Create(Html: UnicodeString);
    destructor Destroy; override;

    procedure Parse;

    procedure SetState(const State: TTokenizerState); //tree construction has two situations where it needs to change the tokenizer state
    procedure SetLastStartTag(const LastStartTagName: string); //some test cases, and i think templates, need to let you set the LastStartTag

    property ParserPause: Boolean read FParserPause write FParserPause;
    property LoggingFlags: Cardinal read FLogFlags write FLogFlags; //bitmask of LOG_TOKENIZER_xxx constants

    property OnToken: TTokenEvent read FOnToken write FOnToken;
    //Fired by AddParseError for every parse error. Only Sender is passed; the error name is not.
    //Without this property FOnParseError could not be assigned from outside this unit.
    property OnParseError: TNotifyEvent read FOnParseError write FOnParseError;
end;
const
//Bit flags for THtmlTokenizer.LoggingFlags. Each value is a distinct bit, so they can be combined with "or".
//In the visible code AddParseError tests LOG_TOKENIZER_PARSEERROR before writing a parse error to the log;
//the other flags presumably gate the logging their names describe - TODO confirm against the implementation.
LOG_TOKENIZER_CONSUME = $00000001;
LOG_TOKENIZER_STATECHANGE = $00000002;
LOG_TOKENIZER_EMITTOKEN = $00000004;
LOG_TOKENIZER_PARSEERROR = $00000008;
LOG_TOKENIZER_PEEK = $00000010;
type
{
TFixedStreamAdapter: a TStreamAdapter whose Read method is overridden to work around
the RTL behaviour described below.

Delphi 5 had the issue with .Stat implementation:
Potential issue in TStreamAdapter.Stat implementation
http://qc.embarcadero.com/wc/qcmain.aspx?d=45528
Alpha Blended Splash Screen in Delphi - Part 2
http://melander.dk/articles/alphasplash2/2/
The problem with TStreamAdapter is in its implementation of the IStream.stat
method. The stat method takes two parameters: A STATSTG out parameter and a
STATFLAG value. The STATFLAG value specifies if the stat method should return
a value in the STATSTG.pwcsName member. If it does return a value, it is the
responsibility of the called object (i.e. TStreamAdapter) to allocate memory
for the string value, and the responsibility of the caller (i.e. GDI+) to
deallocate the string. Now TStreamAdapter.stat completely ignores the STATFLAG
parameter, which is understandable because it doesn't know anything about
filenames, but unfortunately it also fails to zero the STATSTG.pwcsName member.
The result is that the caller (GDI+ in this case) receives an invalid string.

That was fixed by the time XE6 came along, but there's another bug (10.3):
The .Read method is supposed to return S_FALSE if the number of bytes read
was less than the number of bytes requested.
And it's supposed to return an error if there was an error (rather than success).
}
TFixedStreamAdapter = class(TStreamAdapter)
public
//Replacement for TStreamAdapter.Read; see the S_FALSE / error-code notes in the comment above this class
function Read(pv: Pointer; cb: FixedUInt; pcbRead: PFixedUInt): HResult; override; stdcall;
end;
//Again, the version in Delphi is buggy. So we fix their bugs for them.
//Helper routines for UCS4String / UCS4Char values and their conversion to UnicodeString (UTF-16).
function UCS4ToUnicodeString(const S: UCS4String): UnicodeString;
function UCS4StringToUnicodeString(const S: UCS4String): UnicodeString;
procedure UCS4StrCat(var Dest: UCS4String; const Source: UCS4Char); //similar to System._UStrCat
procedure UCS4StrFromChar(var Dest: UCS4String; const Source: UCS4Char); //similar to System._UStrFromChar
procedure UCS4StrFromUStr(var Dest: UCS4String; const Source: UnicodeString);
function UCS4StrCopy(const S: UCS4String; Index, Count: Integer): UCS4String; //similar to System._UStrCopy
procedure UCS4StrFromPUCS4CharLen(var Dest: UCS4String; Source: PUCS4Char; CharLength: Integer); //similar to System._UStrFromPWCharLen
function UCS4CharToUnicodeString(const ch: UCS4Char): UnicodeString; //either 1 or 2 WideChar
implementation
uses
Windows, TypInfo, ComObj,
{$IFDEF UnitTests}HtmlTokenizerTests,{$ENDIF}
HtmlTags;
const
  //Code point classes from the Infra standard, expressed as sets of byte-range ordinals.
  //https://infra.spec.whatwg.org/#code-points
  asciiTabOrNewline  = [$0009, $000A, $000D];                //TAB, LF, CR. https://infra.spec.whatwg.org/#ascii-tab-or-newline
  asciiWhitespace    = [$0009, $000A, $000C, $000D, $0020];  //TAB, LF, FF, CR, SPACE. //https://infra.spec.whatwg.org/#ascii-whitespace
  asciiDigit         = [Ord('0')..Ord('9')];                 //https://infra.spec.whatwg.org/#ascii-digit

  //NOTE: in Infra, "ASCII upper hex digit" and "ASCII lower hex digit" also include the ASCII digits.
  //Here these two sets hold only the letters A-F / a-f; they are left that way so that code
  //which tests digits, upper-case letters and lower-case letters as three separate cases is unaffected.
  asciiUpperHexDigit = [Ord('A')..Ord('F')];                 //https://infra.spec.whatwg.org/#ascii-upper-hex-digit (letters only)
  asciiLowerHexDigit = [Ord('a')..Ord('f')];                 //https://infra.spec.whatwg.org/#ascii-lower-hex-digit (letters only)

  //An ASCII hex digit is 0-9, A-F or a-f. Because the two sets above are letters-only,
  //the digits have to be added explicitly; without them '0'..'9' were not hex digits.
  asciiHexDigit      = asciiDigit + asciiUpperHexDigit + asciiLowerHexDigit; //https://infra.spec.whatwg.org/#ascii-hex-digit

  asciiUpperAlpha    = [Ord('A')..Ord('Z')];                 //https://infra.spec.whatwg.org/#ascii-upper-alpha
  asciiLowerAlpha    = [Ord('a')..Ord('z')];                 //https://infra.spec.whatwg.org/#ascii-lower-alpha
  asciiAlpha         = asciiUpperAlpha + asciiLowerAlpha;    //https://infra.spec.whatwg.org/#ascii-alpha
  asciiAlphaNumeric  = asciiDigit + asciiAlpha;              //https://infra.spec.whatwg.org/#ascii-alphanumeric

  UEOF = UCS4Char(-1); //A special EOF unicode character
{ THtmlTokenizer }
{
  Formats a message and sends it to the debugger output.

  Fmt/Args are passed to SysUtils.Format. Nothing is formatted or written unless a
  debugger is attached to the process.
}
procedure THtmlTokenizer.LogFmt(const Fmt: string; const Args: array of const);
var
  msg: string;
begin
  //No debugger attached: skip the formatting work entirely
  if not IsDebuggerPresent then
    Exit;

  msg := Format(Fmt, Args);
  OutputDebugString(PChar(msg));
end;
{
  Tests whether the upcoming input matches Value, without consuming anything.

  Value:                          the text to look for; must not be empty (raises Exception otherwise).
  CaseSensitive:                  True compares with "=", False compares with SameText.
  IncludingCurrentInputCharacter: when True the first character compared is the current input
                                  character, and the remainder is peeked from the stream;
                                  when False every character is peeked from the stream.

  Returns True when the characters match. Returns False on a mismatch, and also as soon as
  any character involved is above $FFFF (it cannot be stored in a single WideChar).
}
function THtmlTokenizer.NextFewCharacters(const Value: UnicodeString;
    const CaseSensitive: Boolean;
    const IncludingCurrentInputCharacter: Boolean): Boolean;
var
  candidate: UnicodeString;   //characters gathered so far, same length as Value
  codePoint: UCS4Char;
  firstPeeked: Integer;       //index in candidate of the first character that comes from the stream
  idx: Integer;
begin
  Result := False;

  if Length(Value) = 0 then
    raise Exception.Create('NextFewCharacters peek value cannot be empty');

  SetLength(candidate, Length(Value));
  firstPeeked := 1;

  if IncludingCurrentInputCharacter then
  begin
    if FCurrentInputCharacter > $FFFF then
    begin
      LogFmt('Got extended unicode character while peeking. Leaving. (0x%.8x)', [FCurrentInputCharacter]);
      Exit;
    end;

    //Slot 1 is the already-consumed character; peeking fills the rest
    candidate[1] := WideChar(FCurrentInputCharacter);
    firstPeeked := 2;
  end;

  for idx := firstPeeked to Length(Value) do
  begin
    //Peek offsets are 1-based and count from the first not-yet-consumed character
    codePoint := FStream.Peek(idx - firstPeeked + 1);
    if codePoint > $FFFF then
      Exit;

    candidate[idx] := WideChar(codePoint);
  end;

  if not CaseSensitive then
    Result := SameText(candidate, Value)
  else
    Result := (candidate = Value);
end;
//Reports a "not-implemented-<StateHandlerName>" parse error and then always raises ENotImplemented
//carrying StateHandlerName as its message.
procedure THtmlTokenizer.AddNotImplementedParseError(const StateHandlerName: string);
var
  errorName: string;
begin
  errorName := 'not-implemented-'+StateHandlerName;
  AddParseError(errorName);

  raise ENotImplemented.Create(StateHandlerName);
end;
//Records a parse error identified by ParseErrorName.
//The name is written to the log when the LOG_TOKENIZER_PARSEERROR bit is set in FLogFlags,
//and the FOnParseError handler (if one is assigned) is invoked with Self as its argument.
procedure THtmlTokenizer.AddParseError(ParseErrorName: string);
var
  loggingEnabled: Boolean;
begin
  loggingEnabled := (FLogFlags and LOG_TOKENIZER_PARSEERROR) <> 0;
  if loggingEnabled then
    LogFmt('Parse Error: %s', [ParseErrorName]);

  if not Assigned(FOnParseError) then
    Exit;

  FOnParseError(Self);
end;
//Appends the code point Value to the name of the attribute currently being built (FCurrentAttribute).
//Value is converted with UCS4CharToUnicodeString before being appended.
//Raises EParserError when no current attribute exists, the same exception class that
//AppendToCurrentAttributeValue raises for this precondition.
procedure THtmlTokenizer.AppendToCurrentAttributeName(const Value: UCS4Char);
begin
  if FCurrentAttribute = nil then
    raise EParserError.Create('AppendToCurrentAttributeName, but CurrentAttribute is nil');

  FCurrentAttribute.Name := FCurrentAttribute.Name + UCS4CharToUnicodeString(Value);
end;
//Appends the code point Value to the value of the attribute currently being built (FCurrentAttribute).
//Value is converted with UCS4CharToUnicodeString before being appended.
//Raises EParserError when no current attribute exists.
procedure THtmlTokenizer.AppendToCurrentAttributeValue(const Value: UCS4Char);
var
  appended: UnicodeString;
begin
  if not Assigned(FCurrentAttribute) then
    raise EParserError.Create('AppendToCurrentAttributeValue, but CurrentAttribute is nil');

  appended := FCurrentAttribute.Value + UCS4CharToUnicodeString(Value);
  FCurrentAttribute.Value := appended;
end;
//Appends the code point Value to the current token, which must be a TCommentToken.
//The checked "as" cast fails at runtime if CurrentToken is some other token class.
procedure THtmlTokenizer.AppendToCurrentCommentData(const Value: UCS4Char);
var
  commentToken: TCommentToken;
begin
  commentToken := CurrentToken as TCommentToken;
  commentToken.AppendCharacter(Value);
end;
//Adds the code point Value to the tokenizer's temporary buffer (FTemporaryBuffer) by way of UCS4StrCat.
procedure THtmlTokenizer.AppendToTemporaryBuffer(const Value: UCS4Char);
begin
  UCS4StrCat(Self.FTemporaryBuffer, Value);
end;
//Returns the character the current state handler should process.
//When a reconsume is pending (FReconsume), the flag is cleared and the existing
//FCurrentInputCharacter is handed out again; otherwise the next character is fetched with GetNext
//and stored in FCurrentInputCharacter.
//The result is logged when the LOG_TOKENIZER_CONSUME bit is set in FLogFlags.
function THtmlTokenizer.Consume: UCS4Char;
begin
  if not FReconsume then
    FCurrentInputCharacter := Self.GetNext
  else
    FReconsume := False;

  Result := FCurrentInputCharacter;

  if (FLogFlags and LOG_TOKENIZER_CONSUME) = 0 then
    Exit;

  LogFmt('<== U+%.8x (''%s'')', [Result, WideChar(Result)]);
end;
//Creates a tokenizer over the markup in Html.
//Runs the inherited constructor, then Initialize, and finally wraps Html in a new TInputStream
//that is kept in FStream. The order is kept as written because Initialize runs before FStream is assigned.
constructor THtmlTokenizer.Create(Html: UnicodeString);
begin
  inherited Create;

  Self.Initialize;
  Self.FStream := TInputStream.Create(Html);
end;
//Runs the tokenizer state machine.
//Each loop iteration dispatches on FState2 to exactly one Do...State handler.
//The loop ends when FEOF becomes true, when ParserPause is true at the start of an iteration,
//or when FState2 holds a value with no handler (reported through AddParseError).
//FlushDelayedTokens is called once after the loop, whichever way the loop ended.
procedure THtmlTokenizer.Parse;
begin
while not FEOF do
begin
{
Before each step of the tokenizer,
the user agent must first check the parser pause flag.
If it is true, then the tokenizer must abort the processing of any nested
invocations of the tokenizer, yielding control back to the caller.
}
if ParserPause then
Break;
//One handler per tokenizer state; the trailing comments give the section number of the state in the HTML standard.
case FState2 of
tsDataState: DoDataState; //13.2.5.1 Data state
tsRCDataState: DoRCDATAState; //13.2.5.2 RCDATA state
tsRawTextState: DoRawTextState; //13.2.5.3 RAWTEXT state
tsScriptDataState: DoScriptDataState; //13.2.5.4 Script data state
tsPlaintextState: DoPlaintextState; //13.2.5.5 PLAINTEXT state
tsTagOpenState: DoTagOpenState; //13.2.5.6 Tag open state
tsEndTagOpenState: DoEndTagOpenState; //13.2.5.7 End tag open state
tsTagNameState: DoTagNameState; //13.2.5.8 Tag name state
tsRCDATALessThanSignState: DoRCDATALessThanSignState; //13.2.5.9 RCDATA less-than sign state
tsRCDATAEndTagOpenState: DoRCDATAEndTagOpenState; //13.2.5.10 RCDATA end tag open state
tsRCDATAEndTagNameState: DoRCDATAEndTagNameState; //13.2.5.11 RCDATA end tag name state
tsRAWTEXTLessThanSignState: DoRAWTEXTLessThanSignState; //13.2.5.12 RAWTEXT less-than sign state
tsRAWTEXTEndTagOpenState: DoRAWTEXTEndTagOpenState; //13.2.5.13 RAWTEXT end tag open state
tsRAWTEXTEndTagNameState: DoRAWTEXTEndTagNameState; //13.2.5.14 RAWTEXT end tag name state
tsScriptDataLessThanSignState: DoScriptDataLessThanSignState; //13.2.5.15 Script data less-than sign state
tsScriptDataEndTagOpenState: DoScriptDataEndTagOpenState; //13.2.5.16 Script data end tag open state
tsScriptDataEndTagNameState: DoScriptDataEndTagNameState; //13.2.5.17 Script data end tag name state
tsScriptDataEscapeStartState: DoScriptDataEscapeStartState; //13.2.5.18 Script data escape start state
tsScriptDataEscapeStartDashState: DoScriptDataEscapeStartDashState; //13.2.5.19 Script data escape start dash state
tsScriptDataEscapedState: DoScriptDataEscapedState; //13.2.5.20 Script data escaped state
tsScriptDataEscapedDashState: DoScriptDataEscapedDashState; //13.2.5.21 Script data escaped dash state
tsScriptDataEscapedDashDashState: DoScriptDataEscapedDashDashState; //13.2.5.22 Script data escaped dash dash state
tsScriptDataEscapedLessThanSignState: DoScriptDataEscapedLessThanSignState; //13.2.5.23 Script data escaped less-than sign state
tsScriptDataEscapedEndTagOpenState: DoScriptDataEscapedEndTagOpenState; //13.2.5.24 Script data escaped end tag open state
tsScriptDataEscapedEndTagNameState: DoScriptDataEscapedEndTagNameState; //13.2.5.25 Script data escaped end tag name state
tsScriptDataDoubleEscapeStartState: DoScriptDataDoubleEscapeStartState; //13.2.5.26 Script data double escape start state
tsScriptDataDoubleEscapedState: DoScriptDataDoubleEscapedState; //13.2.5.27 Script data double escaped state
tsScriptDataDoubleEscapedDashState: DoScriptDataDoubleEscapedDashState; //13.2.5.28 Script data double escaped dash state
tsScriptDataDoubleEscapedDashDashState: DoScriptDataDoubleEscapedDashDashState; //13.2.5.29 Script data double escaped dash dash state
tsScriptDataDoubleEscapedLessThanSignState: DoScriptDataDoubleEscapedLessThanSignState; //13.2.5.30 Script data double escaped less-than sign state
tsScriptDataDoubleEscapeEndState: DoScriptDataDoubleEscapeEndState; //13.2.5.31 Script data double escape end state
tsBeforeAttributeNameState: DoBeforeAttributeNameState; //13.2.5.32 Before attribute name state
tsAttributeNameState: DoAttributeNameState; //13.2.5.33 Attribute name state
tsAfterAttributeNameState: DoAfterAttributeNameState; //13.2.5.34 After attribute name state
tsBeforeAttributeValueState: DoBeforeAttributeValueState; //13.2.5.35 Before attribute value state
tsAttributeValueDoubleQuotedState: DoAttributeValueDoubleQuotedState; //13.2.5.36 Attribute value (double-quoted) state
tsAttributeValueSingleQuotedState: DoAttributeValueSingleQuotedState; //13.2.5.37 Attribute value (single-quoted) state
tsAttributeValueUnquotedState: DoAttributeValueUnquotedState; //13.2.5.38 Attribute value (unquoted) state
tsAfterAttributeValueQuotedState: DoAfterAttributeValueQuotedState; //13.2.5.39 After attribute value (quoted) state
tsSelfClosingStartTagState: DoSelfClosingStartTagState; //13.2.5.40 Self-closing start tag state
tsBogusCommentState: DoBogusCommentState; //13.2.5.41 Bogus comment state
tsMarkupDeclarationOpenState: DoMarkupDeclarationOpenState; //13.2.5.42 Markup declaration open state
tsCommentStartState: DoCommentStartState; //13.2.5.43 Comment start state
tsCommentStartDashState: DoCommentStartDashState; //13.2.5.44 Comment start dash state
tsCommentState: DoCommentState; //13.2.5.45 Comment state
tsCommentLessThanSignState: DoCommentLessThanSignState; //13.2.5.46 Comment less-than sign state
tsCommentLessThanSignBangState: DoCommentLessThanSignBangState; //13.2.5.47 Comment less-than sign bang state
tsCommentLessThanSignBangDashState: DoCommentLessThanSignBangDashState; //13.2.5.48 Comment less-than sign bang dash state
tsCommentLessThanSignBangDashDashState: DoCommentLessThanSignBangDashDashState; //13.2.5.49 Comment less-than sign bang dash dash state
tsCommentEndDashState: DoCommentEndDashState; //13.2.5.50 Comment end dash state
tsCommentEndState: DoCommentEndState; //13.2.5.51 Comment end state
tsCommentEndBangState: DoCommentEndBangState; //13.2.5.52 Comment end bang state
tsDOCTYPEState: DoDOCTYPEState; //13.2.5.53 DOCTYPE state
tsBeforeDOCTYPENameState: DoBeforeDOCTYPENameState; //13.2.5.54 Before DOCTYPE name state
tsDOCTYPENameState: DoDOCTYPENameState; //13.2.5.55 DOCTYPE name state
tsAfterDOCTYPENameState: DoAfterDOCTYPENameState; //13.2.5.56 After DOCTYPE name state
tsAfterDOCTYPEPublicKeywordState: DoAfterDOCTYPEPublicKeywordState; //13.2.5.57 After DOCTYPE public keyword state
tsBeforeDOCTYPEPublicIdentifierState: DoBeforeDOCTYPEPublicIdentifierState; //13.2.5.58 Before DOCTYPE public identifier state
tsDOCTYPEPublicIdentifierDoubleQuotedState: DoDOCTYPEPublicIdentifierDoubleQuotedState; //13.2.5.59 DOCTYPE public identifier (double-quoted) state
tsDOCTYPEPublicIdentifierSingleQuotedState: DoDOCTYPEPublicIdentifierSingleQuotedState; //13.2.5.60 DOCTYPE public identifier (single-quoted) state
tsAfterDOCTYPEPublicIdentifierState: DoAfterDOCTYPEPublicIdentifierState; //13.2.5.61 After DOCTYPE public identifier state
tsBetweenDOCTYPEPublicAndSystemIdentifiersState: DoBetweenDOCTYPEPublicAndSystemIdentifiersState; //13.2.5.62 Between DOCTYPE public and system identifiers state
tsAfterDOCTYPESystemKeywordState: DoAfterDOCTYPESystemKeywordState; //13.2.5.63 After DOCTYPE system keyword state
tsBeforeDOCTYPESystemIdentifierState: DoBeforeDOCTYPESystemIdentifierState; //13.2.5.64 Before DOCTYPE system identifier state
tsDOCTYPESystemIdentifierDoubleQuotedState: DoDOCTYPESystemIdentifierDoubleQuotedState; //13.2.5.65 DOCTYPE system identifier (double-quoted) state
tsDOCTYPESystemIdentifierSingleQuotedState: DoDOCTYPESystemIdentifierSingleQuotedState; //13.2.5.66 DOCTYPE system identifier (single-quoted) state
tsAfterDOCTYPESystemIdentifierState: DoAfterDOCTYPESystemIdentifierState; //13.2.5.67 After DOCTYPE system identifier state
tsBogusDOCTYPEState: DoBogusDOCTYPEState; //13.2.5.68 Bogus DOCTYPE state
tsCDATASectionState: DoCDATASectionState; //13.2.5.69 CDATA section state
tsCDATASectionBracketState: DoCDATASectionBracketState; //13.2.5.70 CDATA section bracket state
tsCDATASectionEndState: DoCDATASectionEndState; //13.2.5.71 CDATA section end state
tsCharacterReferenceState: DoCharacterReferenceState; //13.2.5.72 Character reference state
tsNamedCharacterReferenceState: DoNamedCharacterReferenceState; //13.2.5.73 Named character reference state
tsAmbiguousAmpersandState: DoAmbiguousAmpersandState; //13.2.5.74 Ambiguous ampersand state
tsNumericCharacterReferenceState: DoNumericCharacterReferenceState; //13.2.5.75 Numeric character reference state
tsHexadecimalCharacterReferenceStartState: DoHexadecimalCharacterReferenceStartState; //13.2.5.76 Hexadecimal character reference start state
tsDecimalCharacterReferenceStartState: DoDecimalCharacterReferenceStartState; //13.2.5.77 Decimal character reference start state
tsHexadecimalCharacterReferenceState: DoHexadecimalCharacterReferenceState; //13.2.5.78 Hexadecimal character reference state
tsDecimalCharacterReferenceState: DoDecimalCharacterReferenceState; //13.2.5.79 Decimal character reference state
tsNumericCharacterReferenceEndState: DoNumericCharacterReferenceEndState; //13.2.5.80 Numeric character reference end state
else
//unknown state? There's no way out.
//Both statements below belong to the else part of the case: report the state by its enum name, then leave the while loop.
AddParseError('Unknown-parser-state-'+TypInfo.GetEnumName(TypeInfo(TTokenizerState), Ord(FState2)));
Break;
end;
end;
//Runs after the loop regardless of whether it ended through FEOF, ParserPause or an unknown state.
FlushDelayedTokens;
end;
//Returns the character the next call to Consume would yield, without consuming it.
//While a reconsume is pending that is FCurrentInputCharacter (returned without logging);
//otherwise it is the first character ahead in FStream, logged when the LOG_TOKENIZER_PEEK bit
//is set in FLogFlags.
function THtmlTokenizer.Peek: UCS4Char;
begin
  if not FReconsume then
  begin
    Result := FStream.Peek(1);

    if (FLogFlags and LOG_TOKENIZER_PEEK) <> 0 then
      LogFmt('<== U+%.8x (''%s'') PEEK', [Result, WideChar(Result)]);
  end
  else
    Result := FCurrentInputCharacter;
end;
//13.2.5.1 Data state
//https://html.spec.whatwg.org/multipage/parsing.html#data-state
//Consumes one character and either switches state ('&', '<'), emits end-of-file (UEOF),
//or emits the character itself. A NULL is emitted as-is after an unexpected-null-character parse error.
procedure THtmlTokenizer.DoDataState;
var
  c: UCS4Char;
begin
  c := Consume; //consume the next input character

  if c = $0026 then //U+0026 AMPERSAND (&)
  begin
    SetReturnState(tsDataState);
    SetState(tsCharacterReferenceState);
  end
  else if c = $003C then //U+003C LESS-THAN SIGN (<)
    SetState(tsTagOpenState)
  else if c = UEOF then
    EmitEndOfFileToken
  else
  begin
    //U+0000 NULL is a parse error here, but it is still emitted like any other character
    if c = $0000 then
      AddParseError('unexpected-null-character');
    EmitCharacter(FCurrentInputCharacter);
  end;
end;
//13.2.5.2 RCDATA state
//https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
//Consumes one character: '&' starts a character reference (returning to this state),
//'<' moves to the RCDATA less-than sign state, NULL is replaced by U+FFFD after a parse error,
//UEOF emits end-of-file, and everything else is emitted unchanged.
procedure THtmlTokenizer.DoRCDATAState;
var
  c: UCS4Char;
begin
  c := Consume; //consume the next input character

  if c = $0026 then //U+0026 AMPERSAND (&)
  begin
    SetReturnState(tsRCDATAState);
    SetState(tsCharacterReferenceState);
  end
  else if c = $003C then //U+003C LESS-THAN SIGN
    SetState(tsRCDATALessThanSignState)
  else if c = $0000 then //U+0000 NULL
  begin
    AddParseError('unexpected-null-character');
    EmitCharacter($FFFD); //U+FFFD REPLACEMENT CHARACTER
  end
  else if c = UEOF then
    EmitEndOfFileToken
  else
    EmitCharacter(FCurrentInputCharacter);
end;
//13.2.5.3 RAWTEXT state
//https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state
//Consumes one character: '<' moves to the RAWTEXT less-than sign state, NULL is replaced by U+FFFD
//after a parse error, UEOF emits end-of-file, and everything else is emitted unchanged.
procedure THtmlTokenizer.DoRawTextState;
var
  c: UCS4Char;
begin
  c := Consume; //consume the next input character

  if c = Ord('<') then
    SetState(tsRawTextLessThanSignState)
  else if c = $0000 then //U+0000 NULL
  begin
    AddParseError('unexpected-null-character');
    EmitCharacter($0000FFFD); //Emit a U+FFFD REPLACEMENT CHARACTER character token.
  end
  else if c = UEOF then
    EmitEndOfFileToken
  else
    EmitCharacter(FCurrentInputCharacter);
end;
//13.2.5.4 Script data state
//https://html.spec.whatwg.org/multipage/parsing.html#script-data-state
//Consumes one character: '<' moves to the script data less-than sign state, NULL is replaced by U+FFFD
//after a parse error, UEOF emits end-of-file, and everything else is emitted unchanged.
procedure THtmlTokenizer.DoScriptDataState;
var
  c: UCS4Char;
begin
  c := Consume; //consume the next input character

  if c = $003C then //U+003C LESS-THAN SIGN (<)
    SetState(tsScriptDataLessThanSignState)
  else if c = $0000 then //U+0000 NULL
  begin
    AddParseError('unexpected-null-character');
    EmitCharacter($0000FFFD); //Emit a U+FFFD REPLACEMENT CHARACTER character token.
  end
  else if c = UEOF then
    EmitEndOfFileToken
  else
    EmitCharacter(FCurrentInputCharacter);
end;
//13.2.5.5 PLAINTEXT state
//https://html.spec.whatwg.org/multipage/parsing.html#plaintext-state
//Consumes one character: NULL is replaced by U+FFFD after a parse error, UEOF emits end-of-file,
//and everything else is emitted unchanged. This handler never switches to another state.
procedure THtmlTokenizer.DoPlaintextState;
var
  c: UCS4Char;
begin
  c := Consume; //consume the next input character

  if c = $0000 then //U+0000 NULL
  begin
    AddParseError('unexpected-null-character');
    EmitCharacter($0000FFFD); //Emit a U+FFFD REPLACEMENT CHARACTER character token.
  end
  else if c = UEOF then
    EmitEndOfFileToken
  else
    EmitCharacter(FCurrentInputCharacter);
end;
//13.2.5.6 Tag open state
//https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
//Entered after a '<' in the data state; decides what kind of markup follows.
procedure THtmlTokenizer.DoTagOpenState;
var
  c: UCS4Char;
begin
  c := Consume; //consume the next input character

  //The single-character cases are mutually exclusive with the ASCII-alpha test, so they are dispatched first.
  case c of
    $0021: //U+0021 EXCLAMATION MARK (!)
      SetState(tsMarkupDeclarationOpenState); //Switch to the markup declaration open state.
    $002F: //U+002F SOLIDUS (/)
      SetState(tsEndTagOpenState); //Switch to the end tag open state.
    $003F: //U+003F QUESTION MARK (?)
      begin
        AddParseError('unexpected-question-mark-instead-of-tag-name');
        CurrentToken := TCommentToken.Create; //Comment token whose data is the empty string.
        Reconsume(tsBogusCommentState);
      end;
    UEOF:
      begin
        AddParseError('eof-before-tag-name');
        EmitCharacter($003C); //the '<' that led into this state is emitted as text
        EmitEndOfFileToken;
      end;
  else
    if c in asciiAlpha then
    begin
      CurrentToken := TStartTagToken.Create; //New start tag token with an empty tag name.
      Reconsume(tsTagNameState); //The letter itself is handled by the tag name state.
    end
    else
    begin
      AddParseError('invalid-first-character-of-tag-name');
      EmitCharacter($003C); //the '<' was not markup after all
      Reconsume(tsDataState);
    end;
  end;
end;
//13.2.5.7 End tag open state
//https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
//Entered after "</"; an ASCII letter starts an end tag, anything else is an error path.
procedure THtmlTokenizer.DoEndTagOpenState;
var
  c: UCS4Char;
begin
  c := Consume; //consume the next input character

  case c of
    $003E: //U+003E GREATER-THAN SIGN (>), i.e. the input was "</>"
      begin
        AddParseError('missing-end-tag-name');
        SetState(tsDataState);
      end;
    UEOF:
      begin
        AddParseError('eof-before-tag-name');
        EmitCharacter($003C); //U+003C LESS-THAN SIGN
        EmitCharacter($002F); //U+002F SOLIDUS
        EmitEndOfFileToken;
      end;
  else
    if c in asciiAlpha then
    begin
      CurrentToken := TEndTagToken.Create; //New end tag token with an empty tag name.
      Reconsume(tsTagNameState);
    end
    else
    begin
      AddParseError('invalid-first-character-of-tag-name');
      CurrentToken := TCommentToken.Create; //Comment token whose data is the empty string.
      Reconsume(tsBogusCommentState);
    end;
  end;
end;
//13.2.5.8 Tag name state
//https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
//Accumulates the tag name of the current tag token one character at a time until
//whitespace, '/', '>' or end of input is reached.
procedure THtmlTokenizer.DoTagNameState;
var
  c: UCS4Char;
begin
  c := Consume; //consume the next input character

  //None of the single code points below is a member of the sets tested in the else part,
  //so handling them first does not change which branch a character takes.
  case c of
    $002F: //U+002F SOLIDUS (/)
      SetState(tsSelfClosingStartTagState);
    $003E: //U+003E GREATER-THAN SIGN (>)
      begin
        SetState(tsDataState);
        EmitCurrentTagToken;
      end;
    $0000: //U+0000 NULL
      begin
        AddParseError('unexpected-null-character');
        CurrentTagToken.AppendCharacter($FFFD); //U+FFFD REPLACEMENT CHARACTER goes into the tag name instead
      end;
    UEOF:
      begin
        AddParseError('eof-in-tag');
        EmitEndOfFileToken;
      end;
  else
    if c in [$0009, $000A, $000C, $0020] then //TAB, LF, FF, SPACE
      SetState(tsBeforeAttributeNameState)
    else if c in asciiUpperAlpha then
      CurrentTagToken.AppendCharacter(c + $0020) //lowercase it: add 0x0020 to the code point
    else
      CurrentTagToken.AppendCharacter(FCurrentInputCharacter);
  end;
end;
procedure THtmlTokenizer.DoRCDATALessThanSignState;
var
ch: UCS4Char;
begin