{$MODE OBJFPC} { -*- text -*- }
{$INCLUDE settings.inc}
unit stringutils;

interface

{$IFOPT C+}
const
   kBlank = High(QWord);
{$ENDIF}

// XXX support TGravitationalSystemValue with explicit required units, e.g. "1.4kg/m^3"
// XXX support reading these out like a stream

type
   {$IFOPT C+} TBlank = kBlank..kBlank; {$ENDIF}
   TTokenType = (ttNone, ttInteger, ttString, ttComma, ttError);
   TToken = record
      case TokenType: TTokenType of
       ttNone: ({$IFOPT C+} Blank: TBlank {$ENDIF});
       ttInteger: (IntegerValue: Integer);
       ttString: (StringValue: ^UTF8String);
       ttComma: ();
       ttError: ();
   end;
   TTokenArray = array of TToken;

function GetLineTokens(Line: UTF8String; StopWord: UTF8String = ''): TTokenArray;
procedure FreeLineTokens(var TokenArray: TTokenArray);
function IsValidUTF8(const Value: UTF8String): Boolean;

implementation

uses
   sysutils;

function GetLineTokens(Line: UTF8String; StopWord: UTF8String = ''): TTokenArray;
type
   TTokeniserMode = (tmStart, tmNumeric, tmWord, tmQuoted);
var
   Mode: TTokeniserMode;
   StartIndex, Index: Cardinal;
   Token: TToken;
   SawStopWord: Boolean;

   procedure Start(NewMode: TTokeniserMode);
   begin
      Assert(Token.TokenType = ttNone);
      StartIndex := Index;
      Mode := NewMode;
   end;

   procedure PushToken();
   begin
      Assert(Token.TokenType in [ttInteger, ttString, ttComma, ttError]);
      SetLength(Result, Length(Result)+1);
      Result[High(Result)] := Token;
      Token.TokenType := ttNone;
      {$IFOPT C+} Token.Blank := kBlank; {$ENDIF}
      Mode := tmStart;
   end;

   procedure Error();
   begin
      Token.TokenType := ttError;
      PushToken();
      Assert(Mode = tmStart);
   end;

   procedure FinishInteger();
   var
      FailPos: Integer;
   begin
      Assert(Token.TokenType = ttNone);
      Token.TokenType := ttInteger;
      FailPos := -1;
      {$IFOPT R-}
        {$DEFINE range_checks_off}
        {$RANGECHECKS ON}
      {$ENDIF}
      try
         Val(Copy(Line, StartIndex, Index-StartIndex), Token.IntegerValue, FailPos);
      except
         on ERangeError do ;
      end;
      {$IFDEF range_checks_off}
        {$RANGECHECKS OFF}
        {$UNDEF range_checks_off}
      {$ENDIF}
      if (FailPos <> 0) then
         Error()
      else
         PushToken();
      Assert(Mode = tmStart);
   end;

   procedure FinishWord();
   begin
      Assert(Token.TokenType = ttNone);
      Token.TokenType := ttString;
      Assert(Index > StartIndex);
      New(Token.StringValue);
      Assert(Token.StringValue^ = '');
      Token.StringValue^ := Copy(Line, StartIndex, Index-StartIndex);
      if (Token.StringValue^ = StopWord) then
      begin
         Assert(StopWord <> '');
         SawStopWord := True;
      end;
      PushToken();
      Assert(Mode = tmStart);
   end;

   procedure FinishQuotedString();
   begin
      Assert(Token.TokenType = ttNone);
      Token.TokenType := ttString;
      Assert(Index > StartIndex);
      Assert(Line[StartIndex] = '"');
      Assert(Line[Index] = '"');
      New(Token.StringValue);
      Assert(Token.StringValue^ = '');
      if (Index-StartIndex > 2) then
         Token.StringValue^ := Copy(Line, StartIndex+1, Index-StartIndex-1);
      PushToken();
      Assert(Mode = tmStart);
   end;

   procedure Comma();
   begin
      Assert(Token.TokenType = ttNone);
      Token.TokenType := ttComma;
      PushToken();
      Assert(Mode = tmStart);
   end;

begin
   SetLength(Result, 0);
   Token.TokenType := ttNone;
   {$IFOPT C+} Token.Blank := kBlank; {$ENDIF}
   if (Length(Line) > 0) then
   begin
      SawStopWord := False;
      Mode := tmStart;
      for Index := 1 to Length(Line) do
      begin
         case Mode of
          tmStart: case Line[Index] of
               ' ', #9: ;
         '0'..'9', '-': begin if (SawStopWord) then begin Start(tmWord); Break; end else Start(tmNumeric); end;
                   '"': begin if (SawStopWord) then Error(); Start(tmQuoted); end;
                   ',': Error();
                   else begin Start(tmWord); if (SawStopWord) then Break; end;
                  end;
          tmNumeric: case Line[Index] of
               ' ', #9: FinishInteger();
              '0'..'9': ;
                   '-': begin FinishInteger(); Error(); Start(tmNumeric); end;
                   '"': begin FinishInteger(); Error(); Start(tmQuoted); end;
                   ',': begin FinishInteger(); Comma(); end;
                   else begin FinishInteger(); Error(); Start(tmWord); if (SawStopWord) then Break; end;
                  end;
          tmWord: case Line[Index] of
               ' ', #9: begin Assert(not SawStopWord); FinishWord(); end;
                   '"': begin Assert(not SawStopWord); FinishWord(); if (SawStopWord) then Error(); Start(tmQuoted); end;
                   ',': begin Assert(not SawStopWord); FinishWord(); if (SawStopWord) then Error(); Comma(); end;
                  end;
          tmQuoted: case Line[Index] of
                   '"': begin Assert(not SawStopWord); FinishQuotedString(); end;
                  end;
          else Assert(False);
         end;
      end;
      Index := Length(Line)+1;
      case Mode of
       tmStart: ;
       tmNumeric: FinishInteger();
       tmWord: FinishWord();
       tmQuoted: begin Error(); FinishWord(); end;
       else Assert(False);
      end;
   end;
   Assert(Token.TokenType = ttNone);
end;

procedure FreeLineTokens(var TokenArray: TTokenArray);
var
   Index: Cardinal;
begin
   if (Length(TokenArray) > 0) then
   begin
      for Index := Low(TokenArray) to High(TokenArray) do
      begin
         if (TokenArray[Index].TokenType = ttString) then
            Dispose(TokenArray[Index].StringValue);
      end;
      SetLength(TokenArray, 0);
   end;
end;

function IsValidUTF8(const Value: UTF8String): Boolean;
var
   Index: Cardinal;

   function ConsumeContinuationByte(): Boolean;
   begin
      Inc(Index);
      Result := (Index <= Length(Value)) and (Ord(Value[Index]) in [$80..$BF]);
   end;

var
   C: Char;
   Codepoint: Cardinal;
begin
   Result := False;
   Index := 1;
   while (Index <= Length(Value)) do
   begin
      C := Value[Index];
      case Ord(C) of
         $80..$BF: Exit; // unexpected continuation byte
         $C0, $C1: Exit; // forcibly an overlong sequence
         $C2..$DF: if (not ConsumeContinuationByte()) then Exit; // two-byte sequence
         $E0: // three-byte sequence with possible overlong sequence
           begin
              if (not (ConsumeContinuationByte() and ConsumeContinuationByte())) then Exit; // short
              Codepoint := ((Ord(Value[Index-1])-$80) shl 6) + (Ord(Value[Index])-$80);
              if (Codepoint < $0800) then
                 Exit; // overlong
           end;
         $E1..$EC: if (not (ConsumeContinuationByte() and ConsumeContinuationByte())) then Exit; // three-byte sequence
         $ED: // three-byte sequence with possible surrogates
           begin
              if (not (ConsumeContinuationByte() and ConsumeContinuationByte())) then Exit; // short
              Codepoint := $D000 + ((Ord(Value[Index-1])-$80) shl 6) + (Ord(Value[Index])-$80);
              Assert(Codepoint < $E000);
              if (Codepoint >= $D800) then
                 Exit; // surrogate
           end;
         $EE..$EF: if (not (ConsumeContinuationByte() and ConsumeContinuationByte())) then Exit; // three-byte sequence
         $F0: // four-byte sequence with possible overlong sequence
           begin
              if (not (ConsumeContinuationByte() and ConsumeContinuationByte() and ConsumeContinuationByte())) then Exit; // short
              Codepoint := ((Ord(Value[Index-2])-$80) shl 12) + ((Ord(Value[Index-1])-$80) shl 6) + (Ord(Value[Index])-$80);
              if (Codepoint < $10000) then
                 Exit; // overlong
           end;
         $F1..$F3: if (not (ConsumeContinuationByte() and ConsumeContinuationByte() and ConsumeContinuationByte())) then Exit; // four-byte sequence
         $F4: // four-byte sequence with possible too-high codepoint
           begin
              if (not (ConsumeContinuationByte() and ConsumeContinuationByte() and ConsumeContinuationByte())) then Exit; // short
              Codepoint := $100000 + ((Ord(Value[Index-2])-$80) shl 12) + ((Ord(Value[Index-1])-$80) shl 6) + (Ord(Value[Index])-$80);
              if (Codepoint > $10FFFF) then
                 Exit; // overlong
           end;
         $F5..$FD: Exit; // forcibly an invalid Unicode character
         $FE, $FF: Exit; // non-UTF-8 bytes
      end;
      Inc(Index);
   end;
   Result := True;
end;

end.