{$MODE OBJFPC} { -*- text -*- } {$INCLUDE settings.inc} unit stringutils; interface {$IFOPT C+} const kBlank = High(QWord); {$ENDIF} // XXX support TGravitationalSystemValue with explicit required units, e.g. "1.4kg/m^3" // XXX support reading these out like a stream type {$IFOPT C+} TBlank = kBlank..kBlank; {$ENDIF} TTokenType = (ttNone, ttInteger, ttString, ttComma, ttError); TToken = record case TokenType: TTokenType of ttNone: ({$IFOPT C+} Blank: TBlank {$ENDIF}); ttInteger: (IntegerValue: Integer); ttString: (StringValue: ^UTF8String); ttComma: (); ttError: (); end; TTokenArray = array of TToken; function GetLineTokens(Line: UTF8String; StopWord: UTF8String = ''): TTokenArray; procedure FreeLineTokens(var TokenArray: TTokenArray); function IsValidUTF8(const Value: UTF8String): Boolean; implementation uses sysutils; function GetLineTokens(Line: UTF8String; StopWord: UTF8String = ''): TTokenArray; type TTokeniserMode = (tmStart, tmNumeric, tmWord, tmQuoted); var Mode: TTokeniserMode; StartIndex, Index: Cardinal; Token: TToken; SawStopWord: Boolean; procedure Start(NewMode: TTokeniserMode); begin Assert(Token.TokenType = ttNone); StartIndex := Index; Mode := NewMode; end; procedure PushToken(); begin Assert(Token.TokenType in [ttInteger, ttString, ttComma, ttError]); SetLength(Result, Length(Result)+1); Result[High(Result)] := Token; Token.TokenType := ttNone; {$IFOPT C+} Token.Blank := kBlank; {$ENDIF} Mode := tmStart; end; procedure Error(); begin Token.TokenType := ttError; PushToken(); Assert(Mode = tmStart); end; procedure FinishInteger(); var FailPos: Integer; begin Assert(Token.TokenType = ttNone); Token.TokenType := ttInteger; FailPos := -1; {$IFOPT R-} {$DEFINE range_checks_off} {$RANGECHECKS ON} {$ENDIF} try Val(Copy(Line, StartIndex, Index-StartIndex), Token.IntegerValue, FailPos); except on ERangeError do ; end; {$IFDEF range_checks_off} {$RANGECHECKS OFF} {$UNDEF range_checks_off} {$ENDIF} if (FailPos <> 0) then Error() else PushToken(); Assert(Mode = tmStart); end; procedure FinishWord(); begin Assert(Token.TokenType = ttNone); Token.TokenType := ttString; Assert(Index > StartIndex); New(Token.StringValue); Assert(Token.StringValue^ = ''); Token.StringValue^ := Copy(Line, StartIndex, Index-StartIndex); if (Token.StringValue^ = StopWord) then begin Assert(StopWord <> ''); SawStopWord := True; end; PushToken(); Assert(Mode = tmStart); end; procedure FinishQuotedString(); begin Assert(Token.TokenType = ttNone); Token.TokenType := ttString; Assert(Index > StartIndex); Assert(Line[StartIndex] = '"'); Assert(Line[Index] = '"'); New(Token.StringValue); Assert(Token.StringValue^ = ''); if (Index-StartIndex > 2) then Token.StringValue^ := Copy(Line, StartIndex+1, Index-StartIndex-1); PushToken(); Assert(Mode = tmStart); end; procedure Comma(); begin Assert(Token.TokenType = ttNone); Token.TokenType := ttComma; PushToken(); Assert(Mode = tmStart); end; begin SetLength(Result, 0); Token.TokenType := ttNone; {$IFOPT C+} Token.Blank := kBlank; {$ENDIF} if (Length(Line) > 0) then begin SawStopWord := False; Mode := tmStart; for Index := 1 to Length(Line) do begin case Mode of tmStart: case Line[Index] of ' ', #9: ; '0'..'9', '-': begin if (SawStopWord) then begin Start(tmWord); Break; end else Start(tmNumeric); end; '"': begin if (SawStopWord) then Error(); Start(tmQuoted); end; ',': Error(); else begin Start(tmWord); if (SawStopWord) then Break; end; end; tmNumeric: case Line[Index] of ' ', #9: FinishInteger(); '0'..'9': ; '-': begin FinishInteger(); Error(); Start(tmNumeric); end; '"': begin FinishInteger(); Error(); Start(tmQuoted); end; ',': begin FinishInteger(); Comma(); end; else begin FinishInteger(); Error(); Start(tmWord); if (SawStopWord) then Break; end; end; tmWord: case Line[Index] of ' ', #9: begin Assert(not SawStopWord); FinishWord(); end; '"': begin Assert(not SawStopWord); FinishWord(); if (SawStopWord) then Error(); Start(tmQuoted); end; ',': begin Assert(not SawStopWord); FinishWord(); if (SawStopWord) then Error(); Comma(); end; end; tmQuoted: case Line[Index] of '"': begin Assert(not SawStopWord); FinishQuotedString(); end; end; else Assert(False); end; end; Index := Length(Line)+1; case Mode of tmStart: ; tmNumeric: FinishInteger(); tmWord: FinishWord(); tmQuoted: begin Error(); FinishWord(); end; else Assert(False); end; end; Assert(Token.TokenType = ttNone); end; procedure FreeLineTokens(var TokenArray: TTokenArray); var Index: Cardinal; begin if (Length(TokenArray) > 0) then begin for Index := Low(TokenArray) to High(TokenArray) do begin if (TokenArray[Index].TokenType = ttString) then Dispose(TokenArray[Index].StringValue); end; SetLength(TokenArray, 0); end; end; function IsValidUTF8(const Value: UTF8String): Boolean; var Index: Cardinal; function ConsumeContinuationByte(): Boolean; begin Inc(Index); Result := (Index <= Length(Value)) and (Ord(Value[Index]) in [$80..$BF]); end; var C: Char; Codepoint: Cardinal; begin Result := False; Index := 1; while (Index <= Length(Value)) do begin C := Value[Index]; case Ord(C) of $80..$BF: Exit; // unexpected continuation byte $C0, $C1: Exit; // forcibly an overlong sequence $C2..$DF: if (not ConsumeContinuationByte()) then Exit; // two-byte sequence $E0: // three-byte sequence with possible overlong sequence begin if (not (ConsumeContinuationByte() and ConsumeContinuationByte())) then Exit; // short Codepoint := ((Ord(Value[Index-1])-$80) shl 6) + (Ord(Value[Index])-$80); if (Codepoint < $0800) then Exit; // overlong end; $E1..$EC: if (not (ConsumeContinuationByte() and ConsumeContinuationByte())) then Exit; // three-byte sequence $ED: // three-byte sequence with possible surrogates begin if (not (ConsumeContinuationByte() and ConsumeContinuationByte())) then Exit; // short Codepoint := $D000 + ((Ord(Value[Index-1])-$80) shl 6) + (Ord(Value[Index])-$80); Assert(Codepoint < $E000); if (Codepoint >= $D800) then Exit; // surrogate end; $EE..$EF: if (not (ConsumeContinuationByte() and ConsumeContinuationByte())) then Exit; // three-byte sequence $F0: // four-byte sequence with possible overlong sequence begin if (not (ConsumeContinuationByte() and ConsumeContinuationByte() and ConsumeContinuationByte())) then Exit; // short Codepoint := ((Ord(Value[Index-2])-$80) shl 12) + ((Ord(Value[Index-1])-$80) shl 6) + (Ord(Value[Index])-$80); if (Codepoint < $10000) then Exit; // overlong end; $F1..$F3: if (not (ConsumeContinuationByte() and ConsumeContinuationByte() and ConsumeContinuationByte())) then Exit; // four-byte sequence $F4: // four-byte sequence with possible too-high codepoint begin if (not (ConsumeContinuationByte() and ConsumeContinuationByte() and ConsumeContinuationByte())) then Exit; // short Codepoint := $100000 + ((Ord(Value[Index-2])-$80) shl 12) + ((Ord(Value[Index-1])-$80) shl 6) + (Ord(Value[Index])-$80); if (Codepoint > $10FFFF) then Exit; // overlong end; $F5..$FD: Exit; // forcibly an invalid Unicode character $FE, $FF: Exit; // non-UTF-8 bytes end; Inc(Index); end; Result := True; end; end.