summaryrefslogtreecommitdiff
path: root/rtl/unix
diff options
context:
space:
mode:
authorjonas <jonas@3ad0048d-3df7-0310-abae-a5850022a9f2>2013-08-19 22:04:25 +0000
committerjonas <jonas@3ad0048d-3df7-0310-abae-a5850022a9f2>2013-08-19 22:04:25 +0000
commitc414f428ef26e4ece2bf5f12de114b2657162447 (patch)
tree395b288502b96a1304bb678f37c2c99caff67d58 /rtl/unix
parent641ecbe408fb0fb32c25d0426cc958cfa46ad061 (diff)
downloadfpc-c414f428ef26e4ece2bf5f12de114b2657162447.tar.gz
* rawbytestring/unicodestring versions of findfirst/findnext/findclose
o these routines are now generic wrappers in filutil.inc, and call the platform-dependent internalfindfirst/next/close routines o on unix, the fnmatch routine got proper support for UTF-8 matching (e.g., it won't match a partial UTF-8 code point to a "?" wildcard) o for NativeNT, a similar (untested) UTF-16 version has been added + test for the above * rawbytestring/unicodestring versions of fileage git-svn-id: http://svn.freepascal.org/svn/fpc/branches/cpstrrtl@25302 3ad0048d-3df7-0310-abae-a5850022a9f2
Diffstat (limited to 'rtl/unix')
-rw-r--r--rtl/unix/sysutils.pp400
1 files changed, 288 insertions, 112 deletions
diff --git a/rtl/unix/sysutils.pp b/rtl/unix/sysutils.pp
index d9b9196ad1..c867af42da 100644
--- a/rtl/unix/sysutils.pp
+++ b/rtl/unix/sysutils.pp
@@ -548,13 +548,13 @@ begin
end;
end;
-
-Function FileAge (Const FileName : String): Longint;
-
-Var Info : Stat;
-
+Function FileAge (Const FileName : RawByteString): Longint;
+Var
+ Info : Stat;
+ SystemFileName: RawByteString;
begin
- If (fpstat (pointer(FileName),Info)<0) or fpS_ISDIR(info.st_mode) then
+ SystemFileName:=ToSingleByteFileSystemEncodedFileName(FileName);
+ If (fpstat(pchar(SystemFileName),Info)<0) or fpS_ISDIR(info.st_mode) then
exit(-1)
else
Result:=info.st_mtime;
@@ -611,75 +611,253 @@ Function FNMatch(const Pattern,Name:string):Boolean;
Var
LenPat,LenName : longint;
+ { assumes that pattern and name have the same code page }
+ function NameUtf8CodePointLen(index: longint): longint;
+ var
+ bytes: longint;
+ firstzerobit: byte;
+ begin
+ { see https://en.wikipedia.org/wiki/UTF-8#Description for details }
+ Result:=1;
+ { multiple byte UTF-8 code point? }
+ if Name[index]>#127 then
+ begin
+ { bsr searches for the leftmost 1 bit. We are interested in the
+ leftmost 0 bit, so first invert the value
+ }
+ firstzerobit:=BsrByte(not(byte(Name[index])));
+ { if there is no zero bit or the first zero bit is the rightmost bit
+ (bit 0), this is an invalid UTF-8 byte ($ff cannot appear in an
+ UTF-8-encoded string, and in the worst case bit 1 has to be zero)
+ }
+ if (firstzerobit=0) or (firstzerobit=255) then
+ exit;
+ { the number of bytes belonging to this code point is
+ 7-(pos first 0-bit). Subtract 1 since we're already at the first
+ byte. All subsequent bytes of the same sequence must have their
+ highest bit set and the next one unset. We stop when we detect an
+ invalid sequence.
+ }
+ bytes:=6-firstzerobit;
+ while (index+Result<=LenName) and
+ (bytes>0) and
+ ((ord(Name[index+Result]) and %10000000) = %10000000) do
+ begin
+ inc(Result);
+ dec(bytes);
+ end;
+ { stopped because of invalid sequence -> exit }
+ if bytes<>0 then
+ exit;
+ end;
+ { combining diacritics?
+ 1) U+0300 - U+036F in UTF-8 = %11001100 10000000 - %11001101 10101111
+ 2) U+1DC0 - U+1DFF in UTF-8 = %11100001 10110111 10000000 - %11100001 10110111 10111111
+ 3) U+20D0 - U+20FF in UTF-8 = %11100010 10000011 10010000 - %11100010 10000011 10111111
+ 4) U+FE20 - U+FE2F in UTF-8 = %11101111 10111000 10100000 - %11101111 10111000 10101111
+ }
+ repeat
+ bytes:=Result;
+ if (index+Result+1<=LenName) then
+ begin
+ { case 1) }
+ if ((ord(Name[index+Result]) and %11001100 = %11001100)) and
+ (ord(Name[index+Result+1]) >= %10000000) and
+ (ord(Name[index+Result+1]) <= %10101111) then
+ inc(Result,2)
+ { case 2), 3), 4) }
+ else if (index+Result+2<=LenName) and
+ (ord(Name[index+Result])>=%11100001) then
+ begin
+ { case 2) }
+ if ((ord(Name[index+Result])=%11100001) and
+ (ord(Name[index+Result+1])=%10110111) and
+ (ord(Name[index+Result+2])>=%10000000)) or
+ { case 3) }
+ ((ord(Name[index+Result])=%11100010) and
+ (ord(Name[index+Result+1])=%10000011) and
+ (ord(Name[index+Result+2])>=%10010000)) or
+ { case 4) }
+ ((ord(Name[index+Result])=%11101111) and
+ (ord(Name[index+Result+1])=%10111000) and
+ (ord(Name[index+Result+2])>=%10100000) and
+ (ord(Name[index+Result+2])<=%10101111)) then
+ inc(Result,3);
+ end;
+ end;
+ until bytes=Result;
+ end;
+
+ procedure GoToLastByteOfUtf8CodePoint(var j: longint);
+ begin
+ { Take one less, because we have to stop at the last byte of the sequence.
+ }
+ inc(j,NameUtf8CodePointLen(j)-1);
+ end;
+
+ { input:
+ i: current position in pattern (start of utf-8 code point)
+ j: current position in name (start of utf-8 code point)
+ update_i_j: should i and j be changed by the routine or not
+
+ output:
+ i: if update_i_j, then position of last matching part of code point in
+ pattern, or first non-matching code point in pattern. Otherwise the
+ same value as on input.
+ j: if update_i_j, then position of last matching part of code point in
+ name, or first non-matching code point in name. Otherwise the
+ same value as on input.
+ result: true if match, false if no match
+ }
+ function CompareUtf8CodePoint(var i,j: longint; update_i_j: boolean): Boolean;
+ var
+ bytes,
+ new_i,
+ new_j: longint;
+ begin
+ bytes:=NameUtf8CodePointLen(j);
+ new_i:=i;
+ new_j:=j;
+ { ensure that a part of an UTF-8 codepoint isn't interpreted
+ as '*' or '?' }
+ repeat
+ dec(bytes);
+ Result:=
+ (new_j<=LenName) and
+ (new_i<=LenPat) and
+ (Pattern[new_i]=Name[new_j]);
+ inc(new_i);
+ inc(new_j);
+ until not(Result) or
+ (bytes=0);
+ if update_i_j then
+ begin
+ i:=new_i;
+ j:=new_j;
+ end;
+ end;
+
+
Function DoFNMatch(i,j:longint):Boolean;
Var
- Found : boolean;
+ UTF8, Found : boolean;
Begin
- Found:=true;
- While Found and (i<=LenPat) Do
- Begin
- Case Pattern[i] of
- '?' : Found:=(j<=LenName);
- '*' : Begin
- {find the next character in pattern, different of ? and *}
- while Found do
- begin
- inc(i);
- if i>LenPat then Break;
- case Pattern[i] of
- '*' : ;
- '?' : begin
- if j>LenName then begin DoFNMatch:=false; Exit; end;
- inc(j);
- end;
- else
- Found:=false;
- end;
- end;
- Assert((i>LenPat) or ( (Pattern[i]<>'*') and (Pattern[i]<>'?') ));
- {Now, find in name the character which i points to, if the * or ?
- wasn't the last character in the pattern, else, use up all the
- chars in name}
- Found:=false;
- if (i<=LenPat) then
- begin
- repeat
- {find a letter (not only first !) which maches pattern[i]}
- while (j<=LenName) and (name[j]<>pattern[i]) do
- inc (j);
- if (j<LenName) then
+ Found:=true;
+ { ensure that we don't skip partial characters in UTF-8-encoded strings }
+ UTF8:=StringCodePage(Name)=CP_UTF8;
+ While Found and (i<=LenPat) Do
+ Begin
+ Case Pattern[i] of
+ '?' :
+ begin
+ Found:=(j<=LenName);
+ if UTF8 then
+ GoToLastByteOfUtf8CodePoint(j);
+ end;
+ '*' : Begin
+ {find the next character in pattern, different of ? and *}
+ while Found do
begin
- if DoFnMatch(i+1,j+1) then
- begin
- i:=LenPat;
- j:=LenName;{we can stop}
- Found:=true;
+ inc(i);
+ if i>LenPat then
Break;
- end else
- inc(j);{We didn't find one, need to look further}
- end else
- if j=LenName then
+ case Pattern[i] of
+ '*' : ;
+ '?' : begin
+ if j>LenName then
+ begin
+ DoFNMatch:=false;
+ Exit;
+ end;
+ if UTF8 then
+ GoToLastByteOfUtf8CodePoint(j);
+ inc(j);
+ end;
+ else
+ Found:=false;
+ end;
+ end;
+ Assert((i>LenPat) or ( (Pattern[i]<>'*') and (Pattern[i]<>'?') ));
+ { Now, find in name the character which i points to, if the * or
+ ? wasn't the last character in the pattern, else, use up all
+ the chars in name }
+ Found:=false;
+ if (i<=LenPat) then
begin
+ repeat
+ {find a letter (not only first !) which maches pattern[i]}
+ if UTF8 then
+ begin
+ while (j<=LenName) and
+ ((name[j]<>pattern[i]) or
+ not CompareUtf8CodePoint(i,j,false)) do
+ begin
+ GoToLastByteOfUtf8CodePoint(j);
+ inc(j);
+ end;
+ end
+ else
+ begin
+ while (j<=LenName) and (name[j]<>pattern[i]) do
+ inc (j);
+ end;
+ if (j<LenName) then
+ begin
+ { while positions i/j have already been checked, in
+ case of UTF-8 we have to ensure that we don't split
+ a code point. Otherwise we can skip over comparing
+ the same characters twice }
+ if DoFnMatch(i+ord(not UTF8),j+ord(not UTF8)) then
+ begin
+ i:=LenPat;
+ j:=LenName;{we can stop}
+ Found:=true;
+ Break;
+ end
+ { We didn't find one, need to look further }
+ else
+ begin
+ if UTF8 then
+ GoToLastByteOfUtf8CodePoint(j);
+ inc(j);
+ end;
+ end
+ else if j=LenName then
+ begin
+ Found:=true;
+ Break;
+ end;
+ { This 'until' condition must be j>LenName, not j>=LenName.
+ That's because when we 'need to look further' and
+ j = LenName then loop must not terminate. }
+ until (j>LenName);
+ end
+ else
+ begin
+ j:=LenName;{we can stop}
Found:=true;
- Break;
end;
- { This 'until' condition must be j>LenName, not j>=LenName.
- That's because when we 'need to look further' and
- j = LenName then loop must not terminate. }
- until (j>LenName);
- end else
- begin
- j:=LenName;{we can stop}
- Found:=true;
end;
- end;
- else {not a wildcard character in pattern}
- Found:=(j<=LenName) and (pattern[i]=name[j]);
+ #128..#255:
+ begin
+ Found:=(j<=LenName) and (pattern[i]=name[j]);
+ if Found and UTF8 then
+ begin
+ { ensure that a part of an UTF-8 codepoint isn't matched with
+ '*' or '?' }
+ Found:=CompareUtf8CodePoint(i,j,true);
+ { at this point, either Found is false (and we'll stop), or
+ both pattern[i] and name[j] are the end of the current code
+ point and equal }
+ end
+ end
+ else {not a wildcard character in pattern}
+ Found:=(j<=LenName) and (pattern[i]=name[j]);
+ end;
+ inc(i);
+ inc(j);
end;
- inc(i);
- inc(j);
- end;
- DoFnMatch:=Found and (j>LenName);
+ DoFnMatch:=Found and (j>LenName);
end;
Begin {start FNMatch}
@@ -693,78 +871,73 @@ Type
TUnixFindData = Record
NamePos : LongInt; {to track which search this is}
DirPtr : Pointer; {directory pointer for reading directory}
- SearchSpec : String;
+ SearchSpec : RawbyteString;
SearchType : Byte; {0=normal, 1=open will close, 2=only 1 file}
SearchAttr : Byte; {attribute we are searching for}
End;
PUnixFindData = ^TUnixFindData;
-Procedure FindClose(Var f: TSearchRec);
+Procedure InternalFindClose(var Handle: Pointer);
var
- UnixFindData : PUnixFindData;
-Begin
- UnixFindData:=PUnixFindData(f.FindHandle);
- If (UnixFindData=Nil) then
+ D: PUnixFindData absolute Handle;
+begin
+ If D=Nil then
Exit;
- if UnixFindData^.SearchType=0 then
+ if D^.SearchType=0 then
begin
- if UnixFindData^.dirptr<>nil then
- fpclosedir(pdir(UnixFindData^.dirptr)^);
+ if D^.dirptr<>nil then
+ fpclosedir(pdir(D^.dirptr)^);
end;
- Dispose(UnixFindData);
- f.FindHandle:=nil;
-End;
+ Dispose(D);
+ D:=nil;
+end;
-Function FindGetFileInfo(const s:string;var f:TSearchRec):boolean;
-var
- st : baseunix.stat;
- WinAttr : longint;
-
+Function FindGetFileInfo(const s: RawByteString; var f: TAbstractSearchRec; var Name: RawByteString):boolean;
+Var
+ st : baseunix.stat;
+ WinAttr : longint;
begin
- FindGetFileInfo:=false;
- If Assigned(F.FindHandle) and ((((PUnixFindData(f.FindHandle)^.searchattr)) and faSymlink) > 0) then
+ if Assigned(f.FindHandle) and ( (PUnixFindData(F.FindHandle)^.searchattr and faSymlink) > 0) then
FindGetFileInfo:=(fplstat(pointer(s),st)=0)
else
FindGetFileInfo:=(fpstat(pointer(s),st)=0);
- If not FindGetFileInfo then
+ if not FindGetFileInfo then
exit;
WinAttr:=LinuxToWinAttr(s,st);
- If ((WinAttr and Not(PUnixFindData(f.FindHandle)^.searchattr))=0) Then
- Begin
- f.Name:=ExtractFileName(s);
- f.Attr:=WinAttr;
- f.Size:=st.st_Size;
- f.Mode:=st.st_mode;
- f.Time:=st.st_mtime;
- FindGetFileInfo:=true;
- End
- else
- FindGetFileInfo:=false;
+ FindGetFileInfo:=(WinAttr and Not(PUnixFindData(f.FindHandle)^.searchattr))=0;
+
+ if FindGetFileInfo then
+ begin
+ Name:=ExtractFileName(s);
+ f.Attr:=WinAttr;
+ f.Size:=st.st_Size;
+ f.Mode:=st.st_mode;
+ f.Time:=st.st_mtime;
+ FindGetFileInfo:=true;
+ end;
end;
-Function FindNext (Var Rslt : TSearchRec) : Longint;
-{
- re-opens dir if not already in array and calls FindGetFileInfo
-}
+// Returns the FOUND filename. Error code <> 0 if no file found
+Function InternalFindNext (var Rslt : TAbstractSearchRec; var Name : RawByteString) : Longint;
+
Var
- DirName : String;
+ DirName : RawByteString;
FName,
- SName : string;
+ SName : RawBytestring;
Found,
Finished : boolean;
p : pdirent;
UnixFindData : PUnixFindData;
+
Begin
Result:=-1;
UnixFindData:=PUnixFindData(Rslt.FindHandle);
{ SearchSpec='' means that there were no wild cards, so only one file to
find.
}
- If (UnixFindData=Nil) then
- exit;
- if UnixFindData^.SearchSpec='' then
+ If (UnixFindData=Nil) or (UnixFindData^.SearchSpec='') then
exit;
if (UnixFindData^.SearchType=0) and
(UnixFindData^.Dirptr=nil) then
@@ -773,7 +946,7 @@ Begin
DirName:='./'
Else
DirName:=Copy(UnixFindData^.SearchSpec,1,UnixFindData^.NamePos);
- UnixFindData^.DirPtr := fpopendir(Pchar(pointer(DirName)));
+ UnixFindData^.DirPtr := fpopendir(Pchar(DirName));
end;
SName:=Copy(UnixFindData^.SearchSpec,UnixFindData^.NamePos+1,Length(UnixFindData^.SearchSpec));
Found:=False;
@@ -789,9 +962,10 @@ Begin
Finished:=True
Else
Begin
+ SetCodePage(FName,DefaultFileSystemCodePage,false);
If FNMatch(SName,FName) Then
Begin
- Found:=FindGetFileInfo(Copy(UnixFindData^.SearchSpec,1,UnixFindData^.NamePos)+FName,Rslt);
+ Found:=FindGetFileInfo(Copy(UnixFindData^.SearchSpec,1,UnixFindData^.NamePos)+FName,Rslt,Name);
if Found then
begin
Result:=0;
@@ -803,7 +977,7 @@ Begin
End;
-Function FindFirst (Const Path : String; Attr : Longint; out Rslt : TSearchRec) : Longint;
+Function InternalFindFirst (Const Path : RawByteString; Attr : Longint; out Rslt : TAbstractSearchRec; var Name: RawByteString) : Longint;
{
opens dir and calls FindNext if needed.
}
@@ -811,6 +985,8 @@ var
UnixFindData : PUnixFindData;
Begin
Result:=-1;
+ { this is safe even though Rslt actually contains a refcounted field, because
+ it is declared as "out" and hence has already been initialised }
fillchar(Rslt,sizeof(Rslt),0);
if Path='' then
exit;
@@ -823,20 +999,20 @@ Begin
{Wildcards?}
if (Pos('?',Path)=0) and (Pos('*',Path)=0) then
begin
- if FindGetFileInfo(Path,Rslt) then
+ if FindGetFileInfo(ToSingleByteFileSystemEncodedFileName(Path),Rslt,Name) then
Result:=0;
end
else
begin
{Create Info}
- UnixFindData^.SearchSpec := Path;
+ UnixFindData^.SearchSpec := ToSingleByteFileSystemEncodedFileName(Path);
UnixFindData^.NamePos := Length(UnixFindData^.SearchSpec);
while (UnixFindData^.NamePos>0) and (UnixFindData^.SearchSpec[UnixFindData^.NamePos]<>'/') do
dec(UnixFindData^.NamePos);
- Result:=FindNext(Rslt);
+ Result:=InternalFindNext(Rslt,Name);
end;
If (Result<>0) then
- FindClose(Rslt);
+ InternalFindClose(Rslt.FindHandle);
End;