[Cmake-commits] CMake branch, next, updated. v3.0.0-4727-g4c83a4e
Clinton Stimpson
clinton at elemtech.com
Mon Aug 4 10:26:35 EDT 2014
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "CMake".
The branch, next has been updated
via 4c83a4e74d4e13307504ff681269776ef0878246 (commit)
via 8157cb2ea2247ffb5aef206e149da17dfad9a6a7 (commit)
from af5b8256b4d3ca1b65f5f97424918e241e360213 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
http://cmake.org/gitweb?p=cmake.git;a=commitdiff;h=4c83a4e74d4e13307504ff681269776ef0878246
commit 4c83a4e74d4e13307504ff681269776ef0878246
Merge: af5b825 8157cb2
Author: Clinton Stimpson <clinton at elemtech.com>
AuthorDate: Mon Aug 4 10:26:34 2014 -0400
Commit: CMake Topic Stage <kwrobot at kitware.com>
CommitDate: Mon Aug 4 10:26:34 2014 -0400
Merge topic 'file-strings-encoding' into next
8157cb2e cmFileCommand: Add ENCODING option to file(STRINGS ...)
http://cmake.org/gitweb?p=cmake.git;a=commitdiff;h=8157cb2ea2247ffb5aef206e149da17dfad9a6a7
commit 8157cb2ea2247ffb5aef206e149da17dfad9a6a7
Author: Clinton Stimpson <clinton at elemtech.com>
AuthorDate: Mon Aug 4 08:20:19 2014 -0600
Commit: Clinton Stimpson <clinton at elemtech.com>
CommitDate: Mon Aug 4 08:21:42 2014 -0600
cmFileCommand: Add ENCODING option to file(STRINGS ...)
For now, UTF8 encoding support is added.
This addresses issue #10519.
diff --git a/Help/command/file.rst b/Help/command/file.rst
index 58e3a26..5f93686 100644
--- a/Help/command/file.rst
+++ b/Help/command/file.rst
@@ -64,6 +64,9 @@ Parse a list of ASCII strings from ``<filename>`` and store it in
``REGEX <regex>``
Consider only strings that match the given regular expression.
+``ENCODING <encoding-type>``
+ Consider strings of a given encoding. "UTF8" is currently supported.
+
For example, the code
.. code-block:: cmake
diff --git a/Source/cmFileCommand.cxx b/Source/cmFileCommand.cxx
index e47365a..34c0d73 100644
--- a/Source/cmFileCommand.cxx
+++ b/Source/cmFileCommand.cxx
@@ -428,7 +428,8 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
arg_length_minimum,
arg_length_maximum,
arg__maximum,
- arg_regex };
+ arg_regex,
+ arg_encoding };
unsigned int minlen = 0;
unsigned int maxlen = 0;
int limit_input = -1;
@@ -438,6 +439,7 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
bool have_regex = false;
bool newline_consume = false;
bool hex_conversion_enabled = true;
+ bool utf8_encoding = false;
int arg_mode = arg_none;
for(unsigned int i=3; i < args.size(); ++i)
{
@@ -475,6 +477,10 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
hex_conversion_enabled = false;
arg_mode = arg_none;
}
+ else if(args[i] == "ENCODING")
+ {
+ arg_mode = arg_encoding;
+ }
else if(arg_mode == arg_limit_input)
{
if(sscanf(args[i].c_str(), "%d", &limit_input) != 1 ||
@@ -556,6 +562,22 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
have_regex = true;
arg_mode = arg_none;
}
+ else if(arg_mode == arg_encoding)
+ {
+ if(args[i] == "UTF8")
+ {
+ utf8_encoding = true;
+ }
+ else
+ {
+ cmOStringStream e;
+ e << "STRINGS option ENCODING \""
+ << args[i] << "\" not recognized.";
+ this->SetError(e.str());
+ return false;
+ }
+ arg_mode = arg_none;
+ }
else
{
cmOStringStream e;
@@ -596,11 +618,75 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
int output_size = 0;
std::vector<std::string> strings;
std::string s;
- int c;
while((!limit_count || strings.size() < limit_count) &&
(limit_input < 0 || static_cast<int>(fin.tellg()) < limit_input) &&
- (c = fin.get(), fin))
+ fin)
{
+ std::string current_str;
+
+ int c = fin.get();
+
+ if(c == '\r')
+ {
+ // Ignore CR character to make output always have UNIX newlines.
+ continue;
+ }
+
+ else if((c >= 0x20 && c < 0x7F) || c == '\t' ||
+ (c == '\n' && newline_consume))
+ {
+ // This is an ASCII character that may be part of a string.
+ // Cast added to avoid compiler warning. Cast is ok because
+ // c is guaranteed to fit in char by the above if...
+ current_str += static_cast<char>(c);
+ }
+ else if(utf8_encoding)
+ {
+ // Check for UTF-8 encoded string (up to 4 octets)
+ static const unsigned char utf8_check_table[3][2] =
+ {
+ {0xE0, 0xC0},
+ {0xF0, 0xE0},
+ {0xF8, 0xF0},
+ };
+
+ // how many octets are there?
+ unsigned int num_utf8_bytes = 0;
+ for(unsigned int j=0; num_utf8_bytes == 0 && j<3; j++)
+ {
+ if((c & utf8_check_table[j][0]) == utf8_check_table[j][1])
+ num_utf8_bytes = j+2;
+ }
+
+ // get subsequent octets and check that they are valid
+ for(unsigned int j=0; j<num_utf8_bytes; j++)
+ {
+ if(j != 0)
+ {
+ c = fin.get();
+ if(!fin || (c & 0xC0) != 0x80)
+ {
+ fin.putback(c);
+ break;
+ }
+ }
+ current_str += static_cast<char>(c);
+ }
+
+ // if this was an invalid utf8 sequence, discard the data, and put
+ // back subsequent characters
+ if((current_str.length() != num_utf8_bytes))
+ {
+ for(unsigned int j=0; j<current_str.size()-1; j++)
+ {
+ c = current_str[current_str.size() - 1 - j];
+ fin.putback(c);
+ }
+ current_str.clear();
+ }
+ }
+
+
if(c == '\n' && !newline_consume)
{
// The current line has been terminated. Check if the current
@@ -621,26 +707,13 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
// Reset the string to empty.
s = "";
}
- else if(c == '\r')
- {
- // Ignore CR character to make output always have UNIX newlines.
- }
- else if((c >= 0x20 && c < 0x7F) || c == '\t' ||
- (c == '\n' && newline_consume))
+ else if(current_str.empty())
{
- // This is an ASCII character that may be part of a string.
- // Cast added to avoid compiler warning. Cast is ok because
- // c is guaranteed to fit in char by the above if...
- s += static_cast<char>(c);
- }
- else
- {
- // TODO: Support ENCODING option. See issue #10519.
// A non-string character has been found. Check if the current
// string matches the requirements. We require that the length
// be at least one no matter what the user specified.
if(s.length() >= minlen && s.length() >= 1 &&
- (!have_regex || regex.find(s.c_str())))
+ (!have_regex || regex.find(s.c_str())))
{
output_size += static_cast<int>(s.size()) + 1;
if(limit_output >= 0 && output_size >= limit_output)
@@ -654,10 +727,15 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
// Reset the string to empty.
s = "";
}
+ else
+ {
+ s += current_str;
+ }
+
- // Terminate a string if the maximum length is reached.
if(maxlen > 0 && s.size() == maxlen)
{
+ // Terminate a string if the maximum length is reached.
if(s.length() >= minlen &&
(!have_regex || regex.find(s.c_str())))
{
diff --git a/Tests/StringFileTest/CMakeLists.txt b/Tests/StringFileTest/CMakeLists.txt
index 4fa5a86..c46bef6 100644
--- a/Tests/StringFileTest/CMakeLists.txt
+++ b/Tests/StringFileTest/CMakeLists.txt
@@ -55,6 +55,16 @@ else()
"file(STRINGS) incorrectly read from srec file [${infile_strings}]")
endif()
+#this file has utf-8 content
+FILE(STRINGS test.utf8 infile_strings ENCODING UTF8)
+list(LENGTH infile_strings content_len)
+if(content_len MATCHES "3")
+ message("file(STRINGS) correctly read from utf8 file [${infile_strings}]")
+else()
+ message(SEND_ERROR
+ "file(STRINGS) incorrectly read from utf8 file [${infile_strings}]")
+endif()
+
# String test
string(REGEX MATCH "[cC][mM][aA][kK][eE]" rmvar "CMake is great")
string(REGEX MATCHALL "[cC][mM][aA][kK][eE]" rmallvar "CMake is better than cmake or CMake")
diff --git a/Tests/StringFileTest/test.utf8 b/Tests/StringFileTest/test.utf8
new file mode 100644
index 0000000..6c29170
--- /dev/null
+++ b/Tests/StringFileTest/test.utf8
@@ -0,0 +1,3 @@
+The value of Ï (pi) is 3.141593
+Line mixed with binary partially matches valid utf8: Ï is à93.1593
+à
\ No newline at end of file
-----------------------------------------------------------------------
Summary of changes:
Help/command/file.rst | 3 +
Source/cmFileCommand.cxx | 116 +++++++++++++++++++++++++++++------
Tests/StringFileTest/CMakeLists.txt | 10 +++
Tests/StringFileTest/test.utf8 | 3 +
4 files changed, 113 insertions(+), 19 deletions(-)
create mode 100644 Tests/StringFileTest/test.utf8
hooks/post-receive
--
CMake
More information about the Cmake-commits
mailing list