summaryrefslogtreecommitdiff
path: root/lib/Basic/SourceManager.cpp
diff options
context:
space:
mode:
authorChris Lattner <sabre@nondot.org>2010-04-20 18:14:03 +0000
committerChris Lattner <sabre@nondot.org>2010-04-20 18:14:03 +0000
commit38caec48bc1c9816ca59b8d164a64447ee208c2e (patch)
tree05cc5f0627b5ea5c3dd380bb12354b3fd0a58376 /lib/Basic/SourceManager.cpp
parent3e79c30807c516e8d32e4ed08408b30605df5997 (diff)
downloadclang-38caec48bc1c9816ca59b8d164a64447ee208c2e.tar.gz
enhance sourcemgr to detect various UTF BOMs and emit a fatal error
about them instead of producing tons of garbage from the lexer. It would be even better for sourcemgr to dynamically transcode (e.g. from UTF16 -> UTF8). git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@101924 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/Basic/SourceManager.cpp')
-rw-r--r--lib/Basic/SourceManager.cpp35
1 files changed, 35 insertions, 0 deletions
diff --git a/lib/Basic/SourceManager.cpp b/lib/Basic/SourceManager.cpp
index 053cfe333d..c76624139a 100644
--- a/lib/Basic/SourceManager.cpp
+++ b/lib/Basic/SourceManager.cpp
@@ -119,6 +119,41 @@ const llvm::MemoryBuffer *ContentCache::getBuffer(Diagnostic &Diag,
Buffer.setInt(true);
#endif
}
+
+ // If the buffer is valid, check to see if it starts with a UTF Byte Order
+ // Mark (BOM). We only support UTF-8 without a BOM right now. See
+ // http://en.wikipedia.org/wiki/Byte_order_mark for more information.
+ if (!Buffer.getInt()) {
+ llvm::StringRef BufStr = Buffer.getPointer()->getBuffer();
+ const char *BOM = 0;
+ if (BufStr.startswith("\xEF\xBB\xBF"))
+ BOM = "UTF-8";
+ // UTF-32 must be tested before UTF-16: FF FE 00 00 also begins with FF FE.
+ else if (BufStr.startswith(llvm::StringRef("\x00\x00\xFE\xFF", 4)))
+ BOM = "UTF-32 (BE)";
+ else if (BufStr.startswith(llvm::StringRef("\xFF\xFE\x00\x00", 4)))
+ BOM = "UTF-32 (LE)";
+ else if (BufStr.startswith("\xFE\xFF"))
+ BOM = "UTF-16 (BE)";
+ else if (BufStr.startswith("\xFF\xFE"))
+ BOM = "UTF-16 (LE)";
+ else if (BufStr.startswith("\x2B\x2F\x76"))
+ BOM = "UTF-7";
+ else if (BufStr.startswith("\xF7\x64\x4C"))
+ BOM = "UTF-1";
+ else if (BufStr.startswith("\xDD\x73\x66\x73"))
+ BOM = "UTF-EBCDIC";
+ else if (BufStr.startswith("\x0E\xFE\xFF"))
+ BOM = "SCSU";
+ else if (BufStr.startswith("\xFB\xEE\x28"))
+ BOM = "BOCU-1";
+ else if (BufStr.startswith("\x84\x31\x95\x33"))
+ BOM = "GB-18030";
+ if (BOM) {
+ Diag.Report(diag::err_unsupported_bom) << BOM << Entry->getName();
+ Buffer.setInt(true);
+ }
+ }
}
if (Invalid)