Home About Units Download Documents Links Contact SourceForge
Units: UnicodeReader: Source

{                                                                              }
{                        Unicode Reader class v3.05                            }
{                                                                              }
{             This unit is copyright © 2002-2004 by David J Butler             }
{                                                                              }
{                  This unit is part of Delphi Fundamentals.                   }
{                Its original file name is cUnicodeReader.pas                  }
{       The latest version is available from the Fundamentals home page        }
{                     http://fundementals.sourceforge.net/                     }
{                                                                              }
{                I invite you to use this unit, free of charge.                }
{        I invite you to distribute this unit, but it must be for free.        }
{             I also invite you to contribute to its development,              }
{             but do not distribute a modified copy of this file.              }
{                                                                              }
{          A forum is available on SourceForge for general discussion          }
{             http://sourceforge.net/forum/forum.php?forum_id=2117             }
{                                                                              }
{                                                                              }
{ Description:                                                                 }
{   Unicode reader class.                                                      }
{                                                                              }
{ Revision history:                                                            }
{   19/04/2002  0.01  Initial version                                          }
{   28/10/2002  3.02  Refactored for Fundamentals 3.                           }
{   29/10/2002  3.03  Bug fixes and improvements.                              }
{   05/11/2002  3.04  Improved buffer handling.                                }
{   02/01/2004  3.05  Changed reader's block size to 64K as suggested by Eb.   }
{                                                                              }

{$INCLUDE ..\cDefines.inc}
unit cUnicodeReader;

interface

uses
  { Fundamentals }
  cUtils,
  cReaders,
  cUnicodeChar,
  cUnicodeCodecs,
  cUnicode;



{                                                                              }
{ TUnicodeReader                                                               }
{                                                                              }
type
  // Buffered character reader layered on an AReader byte source.
  // Raw bytes are read in blocks; when a codec is assigned they are decoded
  // through FCodec.Decode, otherwise they are copied unchanged as 16-bit
  // characters. Decoded characters are held in FBuffer and consumed from there.
  TUnicodeReader = class
  protected
    FReader      : AReader;              // underlying byte source
    FReaderOwner : Boolean;              // True: Destroy frees FReader
    FReaderPos   : Int64;                // FReader.Position captured in Create; restored by Reset
    FCodec       : TCustomUnicodeCodec;  // optional decoder; nil selects the raw 16-bit path
    FCodecOwner  : Boolean;              // True: Destroy frees FCodec
    FBuffer      : WideString;           // decoded characters
    FBufPos      : Integer;              // offset (in characters) of the next unread character
    FBufLen      : Integer;              // number of valid characters in FBuffer
    FRawBuf      : Pointer;              // raw byte block allocated in Create
    FRawSize     : Integer;              // bytes in FRawBuf that have not been decoded yet

    // Raises EUnicodeReaderReadError.
    procedure ReadError;
    // Tries to have at least Count unread characters buffered; returns the
    // number of unread characters actually buffered (less than Count at eof).
    function  BufferChars(const Count: Integer): Integer;
    // True when at least Count unread characters are (or could be) buffered.
    function  GetBuffer(const Count: Integer): Boolean;

  public
    // Reader must be assigned (checked with Assert). The ownership flags decide
    // whether Destroy frees Reader and Codec.
    constructor Create(const Reader: AReader;
                const ReaderOwner: Boolean = True;
                const Codec: TCustomUnicodeCodec = nil;
                const CodecOwner: Boolean = True);
    destructor Destroy; override;

    property  Codec: TCustomUnicodeCodec read FCodec;
    property  CodecOwner: Boolean read FCodecOwner write FCodecOwner;

    // Repositions the reader to the position captured in Create and empties
    // the character buffer.
    procedure Reset;
    // True when no buffered characters remain and the reader reports EOF.
    function  EOF: Boolean;

    // Reads one character; raises a read error when none is available.
    function  ReadChar: WideChar;
    // Copies up to Len characters into Buf; returns the number copied.
    function  ReadWide(const Buf: PWideChar; const Len: Integer): Integer;
    // Reads up to Len characters (fewer at eof).
    function  ReadWideStr(const Len: Integer): WideString;
    // Reads up to Len characters and returns them converted by WideBufToUTF8String.
    function  ReadUTF8Str(const Len: Integer): String;

    // Skips Count characters; raises a read error when fewer are available.
    procedure Skip(const Count: Integer);
    // Skips characters while CharMatchFunc accepts them; returns the number skipped.
    function  SkipAll(const CharMatchFunc: WideCharMatchFunction): Integer;

    // Tests the next character; when Skip is True a matching character is consumed.
    // Returns False at eof.
    function  MatchChar(const CharMatchFunc: WideCharMatchFunction;
              const Skip: Boolean): Boolean;
    function  MatchWideChar(const Ch: WideChar; const Skip: Boolean): Boolean;

    // Tests whether the upcoming characters match S (via WidePMatchAnsiStr).
    // Returns False for an empty S or when fewer than Length(S) characters remain.
    function  MatchAnsiStr(const S: String; const CaseSensitive: Boolean;
              const Skip: Boolean): Boolean;
    // As MatchAnsiStr, but the character following S must satisfy Delimiter.
    // Needs Length(S) + 1 characters; Skip consumes S only, not the delimiter.
    function  MatchAnsiStrDelimited(const S: String;
              const CaseSensitive: Boolean;
              const Delimiter: WideCharMatchFunction;
              const Skip: Boolean): Boolean;

    // Count the leading run of matching characters without consuming them.
    function  MatchChars(const CharMatchFunc: WideCharMatchFunction): Integer;
    function  MatchAnsiChars(const C: CharSet): Integer;

    // Return the offset from the current position of the first match.
    // When nothing is found before eof the result is -1, or, when Optional is
    // True, the number of unread characters buffered. No characters are consumed.
    function  LocateAnsiChar(const C: CharSet;
              const Optional: Boolean = False): Integer;
    function  LocateAnsiStr(const S: String; const CaseSensitive: Boolean;
              const Optional: Boolean = False): Integer;

    // Returns the next character without consuming it; raises a read error at eof.
    function  PeekChar: WideChar;
    // Consumes one character (if any) and peeks at the following one.
    // Returns False and sets Ch to #0 when there is no following character.
    function  SkipAndPeek(var Ch: WideChar): Boolean;
    // Exposes the internal buffer: Buffer points at the unread characters and
    // the result is their count. Buffer is nil when the result is 0.
    function  GetPeekBuffer(const Len: Integer; var Buffer: PWideChar): Integer;

    // Read (consume) the leading run of matching characters.
    function  ReadChars(const CharMatchFunc: WideCharMatchFunction): WideString;
    function  ReadAnsiChars(const C: CharSet): String;

    // Skip / read up to the first character in C. When no delimiter is found
    // everything up to eof is skipped / returned. SkipDelimiter additionally
    // consumes the delimiter when one was found.
    function  SkipToAnsiChar(const C: CharSet;
              const SkipDelimiter: Boolean): Integer;
    function  ReadToAnsiChar(const C: CharSet;
              const SkipDelimiter: Boolean = False): WideString;
    function  ReadUTF8StrToAnsiChar(const C: CharSet;
              const SkipDelimiter: Boolean = False): String;

    // Read up to the first occurrence of S; same eof and SkipDelimiter rules.
    function  ReadToAnsiStr(const S: String;
              const CaseSensitive: Boolean = True;
              const SkipDelimiter: Boolean = False): WideString;
    // NOTE(review): declared as WideString although the implementation returns
    // the result of ReadUTF8Str (a String) - compare ReadUTF8StrToAnsiChar,
    // which returns String. Confirm the intended return type.
    function  ReadUTF8StrToAnsiStr(const S: String;
              const CaseSensitive: Boolean = True;
              const SkipDelimiter: Boolean = False): WideString;
  end;
  // Base exception class for this unit.
  EUnicodeReader = class(EUnicode);
  // Raised by TUnicodeReader.ReadError.
  EUnicodeReaderReadError = class(EUnicodeReader);



{                                                                              }
{ TUnicodeMemoryReader                                                         }
{                                                                              }
type
  // TUnicodeReader over a memory block: the constructor wraps Data/Size in a
  // TMemoryReader that the unicode reader owns.
  TUnicodeMemoryReader = class(TUnicodeReader)
  public
    constructor Create(const Data: Pointer; const Size: Integer;
                const Codec: TCustomUnicodeCodec = nil;
                const CodecOwner: Boolean = True);
  end;



{                                                                              }
{ TUnicodeFileReader                                                           }
{                                                                              }
type
  // TUnicodeReader over a file: the constructor wraps FileName in a
  // TFileReader that the unicode reader owns.
  TUnicodeFileReader = class(TUnicodeReader)
  public
    constructor Create(const FileName: String;
                const Codec: TCustomUnicodeCodec = nil;
                const CodecOwner: Boolean = True);
  end;



implementation

uses
  { Delphi }
  SysUtils;



resourcestring
  // Message of the EUnicodeReaderReadError raised by TUnicodeReader.ReadError
  RSReadError = 'Read error';



{                                                                              }
{ TUnicodeReader                                                               }
{                                                                              }
const
  // Size in bytes of the raw read block (FRawBuf). Also used as the minimum
  // free space kept in the decoded buffer when a codec is assigned, and as the
  // unit for the buffer compaction / shrink thresholds.
  ReaderBlockSize = 65536; // 64K

// Creates a unicode reader on top of Reader.
//   Reader       byte source; must be assigned
//   ReaderOwner  True if Destroy should free Reader
//   Codec        decoder for the raw bytes; nil reads raw 16-bit characters
//   CodecOwner   True if Destroy should free Codec
constructor TUnicodeReader.Create(const Reader: AReader;
    const ReaderOwner: Boolean;
    const Codec: TCustomUnicodeCodec;
    const CodecOwner: Boolean);
begin
  inherited Create;
  Assert(Reader <> nil);
  // Byte source; remember where it started so Reset can return there
  FReaderOwner := ReaderOwner;
  FReader := Reader;
  FReaderPos := FReader.Position;
  // Decoder
  FCodecOwner := CodecOwner;
  FCodec := Codec;
  // Raw block that BufferChars reads into
  GetMem(FRawBuf, ReaderBlockSize);
end;

// Releases the raw block and, depending on the ownership flags, the reader
// and the codec.
destructor TUnicodeReader.Destroy;
begin
  // Raw block (may be nil if construction failed before GetMem)
  if FRawBuf <> nil then
    FreeMem(FRawBuf);
  // Owned objects only
  if FReaderOwner then
    FreeAndNil(FReader);
  if FCodecOwner then
    FreeAndNil(FCodec);
  inherited Destroy;
end;

// Raises EUnicodeReaderReadError carrying the RSReadError message.
procedure TUnicodeReader.ReadError;
var Err: EUnicodeReaderReadError;
begin
  Err := EUnicodeReaderReadError.Create(RSReadError);
  raise Err;
end;

// Returns the reader to the position it had when this object was created and
// discards everything that has been buffered since.
procedure TUnicodeReader.Reset;
begin
  FReader.Position := FReaderPos;
  // Discard decoded characters
  FBufPos := 0;
  FBufLen := 0;
  // Discard raw bytes that were read but not yet decoded. They belong to the
  // old stream position; keeping them would prepend stale bytes to the first
  // block read after the reset.
  FRawSize := 0;
  // Free excessively large buffer, keep part of it for re-use
  if Length(FBuffer) > 4 * ReaderBlockSize then
    SetLength(FBuffer, 4 * ReaderBlockSize);
end;

// True when the character buffer is exhausted and the underlying reader
// reports EOF.
function TUnicodeReader.EOF: Boolean;
begin
  Result := False;
  // Only consult the reader once all buffered characters have been consumed
  if FBufPos >= FBufLen then
    Result := FReader.EOF;
end;

// Tries to make at least Count unread characters available in FBuffer.
// Returns the number of unread characters buffered on exit; this is less than
// Count only when the reader could not supply more data (eof).
// Calling this may move buffered data and reallocate FBuffer, so FBufPos and
// any pointer into FBuffer obtained earlier must be re-read afterwards.
function TUnicodeReader.BufferChars(const Count: Integer): Integer;
var I, J, L, M, N: Integer;
    P: PByte;
    Q: PWideChar;
begin
  // Check available characters
  Result := FBufLen - FBufPos;
  if Result >= Count then
    exit;
  L := Length(FBuffer);
  if L > 0 then
    begin
      // Reorganise buffer
      if Result <= 0 then // buffer empty
        begin
          // move pointer to front
          FBufPos := 0;
          FBufLen := 0;
        end else
      if (Result <= ReaderBlockSize div 16) or // buffer is nearly empty; or
         (FBufPos >= 4 * ReaderBlockSize) then // buffer has too much unused space at front
        begin
          // Move data to front
          Q := Pointer(FBuffer);
          Inc(Q, FBufPos);
          Move(Q^, Pointer(FBuffer)^, Result * Sizeof(WideChar));
          FBufPos := 0;
          FBufLen := Result;
        end;
    end;
  // Fill unicode buffer
  Repeat
    // Fill raw character buffer (append after any bytes left undecoded)
    P := FRawBuf;
    Inc(P, FRawSize);
    J := FReader.Read(P^, ReaderBlockSize - FRawSize);
    if J <= 0 then // eof
      exit;
    Inc(FRawSize, J);
    // Decode to unicode buffer
    if Assigned(FCodec) then
      begin
        // Decode raw buffer using codec
        // P walks the raw bytes, J is the number of raw bytes still to decode,
        // L is the free space (in characters) at the end of FBuffer.
        P := FRawBuf;
        J := FRawSize;
        L := Length(FBuffer) - FBufLen;
        Repeat
          if L < ReaderBlockSize then
            begin
              // grow unicode buffer to fit at least one raw buffer
              L := ReaderBlockSize;
              SetLength(FBuffer, FBufLen + L);
            end;
          Q := Pointer(FBuffer);
          Inc(Q, FBufLen);
          // M receives the raw bytes consumed, N the characters produced
          FCodec.Decode(P, J, Q, L * Sizeof(WideChar), M, N);
          Inc(P, M);
          Dec(J, M);
          Inc(FBufLen, N);
          Dec(L, N);
        // Stop when all raw bytes are consumed, or when the codec stopped
        // although destination space was still free
        Until (J <= 0) or (L > 0);
        // I = raw bytes consumed; P now addresses the first unconsumed byte
        I := FRawSize - J;
      end
    else
      begin
        // read raw 16-bit unicode
        I := FRawSize div Sizeof(WideChar);
        L := Length(FBuffer) - FBufLen;
        if L < I then
          begin
            L := I;
            SetLength(FBuffer, FBufLen + L);
          end;
        Q := Pointer(FBuffer);
        Inc(Q, FBufLen);
        Inc(FBufLen, I);
        I := I * Sizeof(WideChar);
        Move(FRawBuf^, Q^, I);
        // Point P at the first unconsumed byte (the odd trailing byte, if any).
        // Without this P would still address the position the read was
        // appended at, and the wrong byte would be carried over below.
        P := FRawBuf;
        Inc(P, I);
      end;
    // Move undecoded raw data to front of buffer
    if I < FRawSize then
      begin
        Move(P^, FRawBuf^, FRawSize - I);
        Dec(FRawSize, I);
      end
    else
      FRawSize := 0;
    // Check if enough characters have been buffered
    Result := FBufLen - FBufPos;
  Until Result >= Count;
end;

// Returns True when at least Count unread characters are buffered, reading
// more data through BufferChars only if the buffer is currently short.
function TUnicodeReader.GetBuffer(const Count: Integer): Boolean;
begin
  if FBufLen - FBufPos >= Count then
    Result := True
  else
    Result := BufferChars(Count) >= Count;
end;

// Copies up to Len characters into Buf and consumes them.
// Returns the number of characters copied (0 when Len <= 0, fewer than Len
// at eof).
function TUnicodeReader.ReadWide(const Buf: PWideChar;
    const Len: Integer): Integer;
var Src: PWideChar;
    Avail: Integer;
begin
  Result := 0;
  if Len <= 0 then
    exit;
  // make sure as many of the requested characters as possible are buffered
  Avail := FBufLen - FBufPos;
  if Avail < Len then
    Avail := BufferChars(Len);
  // never hand out more than was asked for
  if Avail > Len then
    Result := Len
  else
    Result := Avail;
  // copy and consume
  Src := Pointer(FBuffer);
  Inc(Src, FBufPos);
  Move(Src^, Buf^, Result * Sizeof(WideChar));
  Inc(FBufPos, Result);
end;

// Reads and consumes up to Len characters and returns them as a WideString.
// Returns '' when Len <= 0; the result is shorter than Len at eof.
function TUnicodeReader.ReadWideStr(const Len: Integer): WideString;
var Src: PWideChar;
    Cnt: Integer;
begin
  if Len <= 0 then
    begin
      Result := '';
      exit;
    end;
  // buffer the requested characters, then clamp to the request
  Cnt := FBufLen - FBufPos;
  if Cnt < Len then
    Cnt := BufferChars(Len);
  if Cnt > Len then
    Cnt := Len;
  // copy out of the buffer and consume
  SetLength(Result, Cnt);
  Src := Pointer(FBuffer);
  Inc(Src, FBufPos);
  Move(Src^, Pointer(Result)^, Cnt * Sizeof(WideChar));
  Inc(FBufPos, Cnt);
end;

// Reads and consumes up to Len characters and returns them converted with
// WideBufToUTF8String. Returns '' when Len <= 0.
function TUnicodeReader.ReadUTF8Str(const Len: Integer): String;
var Src: PWideChar;
    Cnt: Integer;
begin
  if Len <= 0 then
    begin
      Result := '';
      exit;
    end;
  // buffer the requested characters, then clamp to the request
  Cnt := FBufLen - FBufPos;
  if Cnt < Len then
    Cnt := BufferChars(Len);
  if Cnt > Len then
    Cnt := Len;
  // convert straight from the buffer and consume
  Src := Pointer(FBuffer);
  Inc(Src, FBufPos);
  Result := WideBufToUTF8String(Src, Cnt);
  Inc(FBufPos, Cnt);
end;

// Consumes Count characters. Does nothing when Count <= 0; raises a read
// error when fewer than Count characters are available.
procedure TUnicodeReader.Skip(const Count: Integer);
begin
  if Count > 0 then
    begin
      // GetBuffer returns immediately when enough characters are buffered
      if not GetBuffer(Count) then
        ReadError;
      Inc(FBufPos, Count);
    end;
end;

// Consumes characters for as long as CharMatchFunc accepts them.
// Returns the number of characters consumed.
function TUnicodeReader.SkipAll(const CharMatchFunc: WideCharMatchFunction): Integer;
var Cur: PWideChar;
    Avail, Idx: Integer;
begin
  Result := 0;
  Avail := FBufLen - FBufPos;
  if Avail <= 0 then
    Avail := BufferChars(1);
  // Avail <= 0 here means eof
  while Avail > 0 do
    begin
      // walk the buffered characters
      Cur := Pointer(FBuffer);
      Inc(Cur, FBufPos);
      for Idx := 1 to Avail do
        begin
          if not CharMatchFunc(Cur^) then
            exit;
          Inc(Result);
          Inc(FBufPos);
          Inc(Cur);
        end;
      // everything buffered matched; fetch more
      Avail := BufferChars(1);
    end;
end;

// Tests the next character against CharMatchFunc. Returns False at eof.
// When Skip is True a matching character is consumed.
function TUnicodeReader.MatchChar(const CharMatchFunc: WideCharMatchFunction;
    const Skip: Boolean): Boolean;
var Cur: PWideChar;
begin
  Result := False;
  // need one character; give up at eof
  if FBufPos >= FBufLen then
    if BufferChars(1) <= 0 then
      exit;
  Cur := Pointer(FBuffer);
  Inc(Cur, FBufPos);
  Result := CharMatchFunc(Cur^);
  if Result and Skip then
    Inc(FBufPos);
end;

// Tests whether the next character equals Ch. Returns False at eof.
// When Skip is True a matching character is consumed.
function TUnicodeReader.MatchWideChar(const Ch: WideChar;
    const Skip: Boolean): Boolean;
var Cur: PWideChar;
begin
  Result := False;
  // need one character; give up at eof
  if FBufPos >= FBufLen then
    if BufferChars(1) <= 0 then
      exit;
  Cur := Pointer(FBuffer);
  Inc(Cur, FBufPos);
  Result := Cur^ = Ch;
  if Result and Skip then
    Inc(FBufPos);
end;

// Tests whether the upcoming characters match S (compared by
// WidePMatchAnsiStr). Returns False for an empty S or when fewer than
// Length(S) characters remain. When Skip is True a match is consumed.
function TUnicodeReader.MatchAnsiStr(const S: String;
    const CaseSensitive: Boolean; const Skip: Boolean): Boolean;
var SLen: Integer;
    Cur: PWideChar;
begin
  Result := False;
  SLen := Length(S);
  if SLen = 0 then
    exit;
  // need Length(S) characters; give up at eof
  if FBufLen - FBufPos < SLen then
    if BufferChars(SLen) < SLen then
      exit;
  Cur := Pointer(FBuffer);
  Inc(Cur, FBufPos);
  Result := WidePMatchAnsiStr(S, Cur, CaseSensitive);
  if Result and Skip then
    Inc(FBufPos, SLen);
end;

// Tests whether the upcoming characters match S and the character directly
// after S satisfies Delimiter. Returns False when fewer than Length(S) + 1
// characters remain. When Skip is True a match consumes S, but not the
// delimiter.
function TUnicodeReader.MatchAnsiStrDelimited(const S: String;
    const CaseSensitive: Boolean; const Delimiter: WideCharMatchFunction;
    const Skip: Boolean): Boolean;
var SLen, Need: Integer;
    Cur: PWideChar;
begin
  Result := False;
  SLen := Length(S);
  Need := SLen + 1; // the string plus one delimiter character
  if FBufLen - FBufPos < Need then
    if BufferChars(Need) < Need then // eof
      exit;
  // the string itself
  Cur := Pointer(FBuffer);
  Inc(Cur, FBufPos);
  if not WidePMatchAnsiStr(S, Cur, CaseSensitive) then
    exit;
  // the character that follows it
  Inc(Cur, SLen);
  Result := Delimiter(Cur^);
  if Result and Skip then
    Inc(FBufPos, SLen);
end;

// Counts how many upcoming characters are accepted by CharMatchFunc, without
// consuming any of them. Counting stops at the first rejected character or
// at eof.
function TUnicodeReader.MatchChars(const CharMatchFunc: WideCharMatchFunction): Integer;
var Cur: PWideChar;
    Avail, Idx: Integer;
begin
  Result := 0;
  Avail := FBufLen - FBufPos;
  if Avail <= 0 then
    Avail := BufferChars(1);
  // continue while there are buffered characters beyond those already counted
  while Avail > Result do
    begin
      // resume after the characters counted so far (the buffer may have moved)
      Cur := Pointer(FBuffer);
      Inc(Cur, FBufPos + Result);
      for Idx := Result + 1 to Avail do
        begin
          if not CharMatchFunc(Cur^) then
            exit;
          Inc(Result);
          Inc(Cur);
        end;
      // all buffered characters matched; ask for one more
      Avail := BufferChars(Result + 1);
    end;
end;

// Counts how many upcoming characters have a code <= $FF and are members of
// C, without consuming any of them. Counting stops at the first character
// that is not, or at eof.
function TUnicodeReader.MatchAnsiChars(const C: CharSet): Integer;
var Cur: PWideChar;
    Avail, Idx: Integer;
begin
  Result := 0;
  Avail := FBufLen - FBufPos;
  if Avail <= 0 then
    Avail := BufferChars(1);
  // continue while there are buffered characters beyond those already counted
  while Avail > Result do
    begin
      // resume after the characters counted so far (the buffer may have moved)
      Cur := Pointer(FBuffer);
      Inc(Cur, FBufPos + Result);
      for Idx := Result + 1 to Avail do
        if (Ord(Cur^) <= $FF) and (Char(Byte(Cur^)) in C) then
          begin
            Inc(Result);
            Inc(Cur);
          end
        else
          exit;
      // all buffered characters matched; ask for one more
      Avail := BufferChars(Result + 1);
    end;
end;

// Finds the first upcoming character with a code <= $FF that is a member of
// C. Returns its offset from the current position; nothing is consumed.
// When eof is reached first the result is -1, or, if Optional is True, the
// number of unread characters buffered.
function TUnicodeReader.LocateAnsiChar(const C: CharSet;
    const Optional: Boolean): Integer;
var Cur: PWideChar;
    Avail, Idx: Integer;
    Code: Word;
begin
  Result := 0;
  Avail := FBufLen - FBufPos;
  if Avail <= 0 then
    Avail := BufferChars(1);
  // continue while there are buffered characters that were not examined yet
  while Avail > Result do
    begin
      // resume after the characters examined so far (the buffer may have moved)
      Cur := Pointer(FBuffer);
      Inc(Cur, FBufPos + Result);
      for Idx := Result + 1 to Avail do
        begin
          Code := Ord(Cur^);
          if (Code <= $FF) and (Char(Code) in C) then
            exit; // found; Result is its offset
          Inc(Result);
          Inc(Cur);
        end;
      // not in the buffered data; ask for one more character
      Avail := BufferChars(Result + 1);
    end;
  // eof without a match
  if Optional then
    Result := Avail
  else
    Result := -1;
end;

// Finds the first upcoming occurrence of S (compared by WidePMatchAnsiStr).
// Returns its offset from the current position; nothing is consumed.
// An empty S yields 0. When eof is reached first the result is -1, or, if
// Optional is True, the number of unread characters buffered.
function TUnicodeReader.LocateAnsiStr(const S: String;
    const CaseSensitive: Boolean;
    const Optional: Boolean): Integer;
var Cur: PWideChar;
    SLen, Avail, Idx: Integer;
begin
  Result := 0;
  SLen := Length(S);
  if SLen = 0 then
    exit;
  Avail := FBufLen - FBufPos;
  if Avail < SLen then
    Avail := BufferChars(SLen);
  // continue while a whole S still fits after the offsets already examined
  while Avail >= Result + SLen do
    begin
      // resume after the offsets examined so far (the buffer may have moved)
      Cur := Pointer(FBuffer);
      Inc(Cur, FBufPos + Result);
      for Idx := Result + 1 to Avail - SLen + 1 do
        begin
          if WidePMatchAnsiStr(S, Cur, CaseSensitive) then
            exit; // found; Result is its offset
          Inc(Result);
          Inc(Cur);
        end;
      // buffer more characters
      Avail := BufferChars(Result + SLen);
    end;
  // eof without a match
  if Optional then
    Result := Avail
  else
    Result := -1;
end;

// Returns the next character without consuming it.
// Raises a read error when no character is available.
function TUnicodeReader.PeekChar: WideChar;
var Cur: PWideChar;
begin
  // GetBuffer returns immediately when a character is already buffered
  if not GetBuffer(1) then
    ReadError;
  Cur := Pointer(FBuffer);
  Inc(Cur, FBufPos);
  Result := Cur^;
end;

// Gives direct access to the unread part of the internal buffer after trying
// to buffer Len characters.
// The result is the number of wide characters Buffer points to; Buffer is
// nil when that number is 0. The pointer is only valid until the next call
// to the reader.
function TUnicodeReader.GetPeekBuffer(const Len: Integer;
    var Buffer: PWideChar): Integer;
begin
  Result := BufferChars(Len);
  Buffer := nil;
  if Result <> 0 then
    begin
      Buffer := Pointer(FBuffer);
      Inc(Buffer, FBufPos);
    end;
end;

// Returns the next character and consumes it.
// Raises a read error when no character is available.
function TUnicodeReader.ReadChar: WideChar;
var Cur: PWideChar;
begin
  // GetBuffer returns immediately when a character is already buffered;
  // otherwise it may move the buffer, so FBufPos is read afterwards
  if not GetBuffer(1) then
    ReadError;
  Cur := Pointer(FBuffer);
  Inc(Cur, FBufPos);
  Result := Cur^;
  Inc(FBufPos);
end;

// Consumes one character (when one is available) and then peeks at the
// character after it.
// Returns True and sets Ch when a following character exists; otherwise
// returns False and sets Ch to #0.
function TUnicodeReader.SkipAndPeek(var Ch: WideChar): Boolean;
var Cur: PWideChar;
begin
  // Two characters are needed: the one to skip and the one to peek at.
  // GetBuffer returns immediately when both are already buffered.
  Result := GetBuffer(2);
  // Skip (also when only a single character was left)
  if FBufPos < FBufLen then
    Inc(FBufPos);
  // Peek
  if not Result then
    Ch := WideChar(#0)
  else
    begin
      Cur := Pointer(FBuffer);
      Inc(Cur, FBufPos);
      Ch := Cur^;
    end;
end;

// Reads and consumes the leading run of characters accepted by
// CharMatchFunc. Returns '' when the next character is not accepted.
function TUnicodeReader.ReadChars(const CharMatchFunc: WideCharMatchFunction): WideString;
var Src: PWideChar;
    Cnt: Integer;
begin
  Result := '';
  // length of the matching run; MatchChars leaves it buffered
  Cnt := MatchChars(CharMatchFunc);
  if Cnt = 0 then
    exit;
  // copy the run out of the buffer and consume it
  SetLength(Result, Cnt);
  Src := Pointer(FBuffer);
  Inc(Src, FBufPos);
  Move(Src^, Pointer(Result)^, Cnt * Sizeof(WideChar));
  Inc(FBufPos, Cnt);
end;

// Reads and consumes the leading run of characters that are members of C
// (as counted by MatchAnsiChars) and returns them converted with
// WideToLongString. Returns '' when the next character is not in C.
function TUnicodeReader.ReadAnsiChars(const C: CharSet): String;
var P : PWideChar;
    L : Integer;
begin
  // calculate length
  L := MatchAnsiChars(C);
  if L = 0 then
    Result := '' else
    begin
      // read
      // (no SetLength here: the assignment below replaces Result outright, so
      // pre-allocating it would only create a string that is thrown away)
      P := Pointer(FBuffer);
      Inc(P, FBufPos);
      Result := WideToLongString(P, L);
      Inc(FBufPos, L);
    end;
end;

// Consumes characters up to the first character in C and returns the number
// consumed (not counting the delimiter). When no delimiter is found before
// eof, every remaining character is consumed. With SkipDelimiter a found
// delimiter is consumed as well.
function TUnicodeReader.SkipToAnsiChar(const C: CharSet;
    const SkipDelimiter: Boolean): Integer;
var Offset: Integer;
begin
  Offset := LocateAnsiChar(C, False);
  // not found (-1): skip everything that is buffered
  if Offset < 0 then
    Result := FBufLen - FBufPos
  else
    Result := Offset;
  Inc(FBufPos, Result);
  // skip delimiter
  if SkipDelimiter and (Offset >= 0) then
    Inc(FBufPos);
end;

// Reads and consumes the characters up to the first character in C.
// When no delimiter is found before eof, every remaining character is
// returned. With SkipDelimiter a found delimiter is consumed as well (it is
// never part of the result).
function TUnicodeReader.ReadToAnsiChar(const C: CharSet;
    const SkipDelimiter: Boolean): WideString;
var Offset, Cnt: Integer;
begin
  Offset := LocateAnsiChar(C, False);
  // not found (-1): take everything that is buffered
  if Offset < 0 then
    Cnt := FBufLen - FBufPos
  else
    Cnt := Offset;
  // ReadWideStr yields '' for a count of 0
  Result := ReadWideStr(Cnt);
  // skip delimiter
  if SkipDelimiter and (Offset >= 0) then
    Inc(FBufPos);
end;

// Reads and consumes the characters up to the first character in C and
// returns them through ReadUTF8Str. When no delimiter is found before eof,
// every remaining character is returned. With SkipDelimiter a found
// delimiter is consumed as well (it is never part of the result).
function TUnicodeReader.ReadUTF8StrToAnsiChar(const C: CharSet;
    const SkipDelimiter: Boolean): String;
var Offset, Cnt: Integer;
begin
  Offset := LocateAnsiChar(C, False);
  // not found (-1): take everything that is buffered
  if Offset < 0 then
    Cnt := FBufLen - FBufPos
  else
    Cnt := Offset;
  // ReadUTF8Str yields '' for a count of 0
  Result := ReadUTF8Str(Cnt);
  // skip delimiter
  if SkipDelimiter and (Offset >= 0) then
    Inc(FBufPos);
end;

// Reads and consumes the characters up to the first occurrence of S.
// When S is not found before eof, every remaining character is returned.
// With SkipDelimiter a found S is consumed as well (it is never part of the
// result).
function TUnicodeReader.ReadToAnsiStr(const S: String;
    const CaseSensitive: Boolean; const SkipDelimiter: Boolean): WideString;
var Offset, Cnt: Integer;
begin
  Offset := LocateAnsiStr(S, CaseSensitive, False);
  // not found (-1): take everything that is buffered
  if Offset < 0 then
    Cnt := FBufLen - FBufPos
  else
    Cnt := Offset;
  // ReadWideStr yields '' for a count of 0
  Result := ReadWideStr(Cnt);
  // skip delimiter
  if SkipDelimiter and (Offset >= 0) then
    Inc(FBufPos, Length(S));
end;

// Reads and consumes the characters up to the first occurrence of S and
// returns them through ReadUTF8Str. When S is not found before eof, every
// remaining character is returned. With SkipDelimiter a found S is consumed
// as well (it is never part of the result).
// NOTE(review): the result type is WideString while ReadUTF8Str returns a
// String, so the value is converted on assignment - confirm that this is the
// intended return type (ReadUTF8StrToAnsiChar returns String).
function TUnicodeReader.ReadUTF8StrToAnsiStr(const S: String;
    const CaseSensitive: Boolean; const SkipDelimiter: Boolean): WideString;
var Offset, Cnt: Integer;
begin
  Offset := LocateAnsiStr(S, CaseSensitive, False);
  // not found (-1): take everything that is buffered
  if Offset < 0 then
    Cnt := FBufLen - FBufPos
  else
    Cnt := Offset;
  // ReadUTF8Str yields '' for a count of 0
  Result := ReadUTF8Str(Cnt);
  // skip delimiter
  if SkipDelimiter and (Offset >= 0) then
    Inc(FBufPos, Length(S));
end;



{                                                                              }
{ TUnicodeMemoryReader                                                         }
{                                                                              }
// Creates a unicode reader over the Size bytes at Data. The TMemoryReader
// wrapping the block is owned (and freed) by the unicode reader.
constructor TUnicodeMemoryReader.Create(const Data: Pointer; const Size: Integer;
    const Codec: TCustomUnicodeCodec; const CodecOwner: Boolean);
var Source: TMemoryReader;
begin
  Source := TMemoryReader.Create(Data, Size);
  inherited Create(Source, True, Codec, CodecOwner);
end;



{                                                                              }
{ TUnicodeFileReader                                                           }
{                                                                              }
// Creates a unicode reader over the file FileName. The TFileReader opened
// for it is owned (and freed) by the unicode reader.
constructor TUnicodeFileReader.Create(const FileName: String;
    const Codec: TCustomUnicodeCodec; const CodecOwner: Boolean);
var Source: TFileReader;
begin
  Source := TFileReader.Create(FileName);
  inherited Create(Source, True, Codec, CodecOwner);
end;



end.