[Cmake-commits] CMake branch, next, updated. v3.0.1-4775-gcd68050
Brad King
brad.king at kitware.com
Wed Aug 6 09:24:10 EDT 2014
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "CMake".
The branch, next has been updated
via cd680503072b7627fc5f79ceacac62ada9f6d6d2 (commit)
via 5b30ec28f9610b0e6d12b017d83fc362b0ef9ecf (commit)
via ffa373e71114727dd70f1a051414de573debb767 (commit)
from 37ed2fdab4745c0e3d9c139ee6a1938935111327 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
http://cmake.org/gitweb?p=cmake.git;a=commitdiff;h=cd680503072b7627fc5f79ceacac62ada9f6d6d2
commit cd680503072b7627fc5f79ceacac62ada9f6d6d2
Merge: 37ed2fd 5b30ec2
Author: Brad King <brad.king at kitware.com>
AuthorDate: Wed Aug 6 09:24:09 2014 -0400
Commit: CMake Topic Stage <kwrobot at kitware.com>
CommitDate: Wed Aug 6 09:24:09 2014 -0400
Merge topic 'file-strings-encoding' into next
5b30ec28 file: Add ENCODING option to file(STRINGS) command (#10519)
ffa373e7 file: Refactor internal implementation of file(STRINGS)
http://cmake.org/gitweb?p=cmake.git;a=commitdiff;h=5b30ec28f9610b0e6d12b017d83fc362b0ef9ecf
commit 5b30ec28f9610b0e6d12b017d83fc362b0ef9ecf
Author: Clinton Stimpson <clinton at elemtech.com>
AuthorDate: Mon Aug 4 10:47:22 2014 -0600
Commit: Brad King <brad.king at kitware.com>
CommitDate: Wed Aug 6 09:23:47 2014 -0400
file: Add ENCODING option to file(STRINGS) command (#10519)
Support extraction of UTF-8 strings.
diff --git a/Help/command/file.rst b/Help/command/file.rst
index 58e3a26..dbc4149 100644
--- a/Help/command/file.rst
+++ b/Help/command/file.rst
@@ -64,6 +64,9 @@ Parse a list of ASCII strings from ``<filename>`` and store it in
``REGEX <regex>``
Consider only strings that match the given regular expression.
+``ENCODING <encoding-type>``
+ Consider strings of a given encoding. "UTF-8" is currently supported.
+
For example, the code
.. code-block:: cmake
diff --git a/Help/release/dev/file-strings-encoding.rst b/Help/release/dev/file-strings-encoding.rst
new file mode 100644
index 0000000..9da3e47
--- /dev/null
+++ b/Help/release/dev/file-strings-encoding.rst
@@ -0,0 +1,5 @@
+file-strings-encoding
+---------------------
+
+* The :command:`file(STRINGS)` command gained a new ``ENCODING``
+ option to enable extraction of ``UTF-8`` strings.
diff --git a/Source/cmFileCommand.cxx b/Source/cmFileCommand.cxx
index b99b1c7..1325cec 100644
--- a/Source/cmFileCommand.cxx
+++ b/Source/cmFileCommand.cxx
@@ -428,7 +428,8 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
arg_length_minimum,
arg_length_maximum,
arg__maximum,
- arg_regex };
+ arg_regex,
+ arg_encoding };
unsigned int minlen = 0;
unsigned int maxlen = 0;
int limit_input = -1;
@@ -438,6 +439,7 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
bool have_regex = false;
bool newline_consume = false;
bool hex_conversion_enabled = true;
+ bool utf8_encoding = false;
int arg_mode = arg_none;
for(unsigned int i=3; i < args.size(); ++i)
{
@@ -475,6 +477,10 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
hex_conversion_enabled = false;
arg_mode = arg_none;
}
+ else if(args[i] == "ENCODING")
+ {
+ arg_mode = arg_encoding;
+ }
else if(arg_mode == arg_limit_input)
{
if(sscanf(args[i].c_str(), "%d", &limit_input) != 1 ||
@@ -556,6 +562,22 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
have_regex = true;
arg_mode = arg_none;
}
+ else if(arg_mode == arg_encoding)
+ {
+ if(args[i] == "UTF-8")
+ {
+ utf8_encoding = true;
+ }
+ else
+ {
+ cmOStringStream e;
+ e << "STRINGS option ENCODING \""
+ << args[i] << "\" not recognized.";
+ this->SetError(e.str());
+ return false;
+ }
+ arg_mode = arg_none;
+ }
else
{
cmOStringStream e;
@@ -618,6 +640,52 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
// c is guaranteed to fit in char by the above if...
current_str += static_cast<char>(c);
}
+ else if(utf8_encoding)
+ {
+ // Check for UTF-8 encoded string (up to 4 octets)
+ static const unsigned char utf8_check_table[3][2] =
+ {
+ {0xE0, 0xC0},
+ {0xF0, 0xE0},
+ {0xF8, 0xF0},
+ };
+
+ // how many octets are there?
+ unsigned int num_utf8_bytes = 0;
+ for(unsigned int j=0; num_utf8_bytes == 0 && j<3; j++)
+ {
+ if((c & utf8_check_table[j][0]) == utf8_check_table[j][1])
+ num_utf8_bytes = j+2;
+ }
+
+ // get subsequent octets and check that they are valid
+ for(unsigned int j=0; j<num_utf8_bytes; j++)
+ {
+ if(j != 0)
+ {
+ c = fin.get();
+ if(!fin || (c & 0xC0) != 0x80)
+ {
+ fin.putback(static_cast<char>(c));
+ break;
+ }
+ }
+ current_str += static_cast<char>(c);
+ }
+
+ // if this was an invalid utf8 sequence, discard the data, and put
+ // back subsequent characters
+ if((current_str.length() != num_utf8_bytes))
+ {
+ for(unsigned int j=0; j<current_str.size()-1; j++)
+ {
+ c = current_str[current_str.size() - 1 - j];
+ fin.putback(static_cast<char>(c));
+ }
+ current_str = "";
+ }
+ }
+
if(c == '\n' && !newline_consume)
{
diff --git a/Tests/StringFileTest/CMakeLists.txt b/Tests/StringFileTest/CMakeLists.txt
index 4fa5a86..683f969 100644
--- a/Tests/StringFileTest/CMakeLists.txt
+++ b/Tests/StringFileTest/CMakeLists.txt
@@ -55,6 +55,16 @@ else()
"file(STRINGS) incorrectly read from srec file [${infile_strings}]")
endif()
+#this file has utf-8 content
+file(STRINGS test.utf8 infile_strings ENCODING UTF-8)
+list(LENGTH infile_strings content_len)
+if(content_len MATCHES "3")
+ message("file(STRINGS) correctly read from utf8 file [${infile_strings}]")
+else()
+ message(SEND_ERROR
+ "file(STRINGS) incorrectly read from utf8 file [${infile_strings}]")
+endif()
+
# String test
string(REGEX MATCH "[cC][mM][aA][kK][eE]" rmvar "CMake is great")
string(REGEX MATCHALL "[cC][mM][aA][kK][eE]" rmallvar "CMake is better than cmake or CMake")
diff --git a/Tests/StringFileTest/test.utf8 b/Tests/StringFileTest/test.utf8
new file mode 100644
index 0000000..6c29170
--- /dev/null
+++ b/Tests/StringFileTest/test.utf8
@@ -0,0 +1,3 @@
+The value of π (pi) is 3.141593
+Line mixed with binary partially matches valid utf8: Ï is à93.1593
+à
\ No newline at end of file
http://cmake.org/gitweb?p=cmake.git;a=commitdiff;h=ffa373e71114727dd70f1a051414de573debb767
commit ffa373e71114727dd70f1a051414de573debb767
Author: Clinton Stimpson <clinton at elemtech.com>
AuthorDate: Mon Aug 4 10:45:13 2014 -0600
Commit: Brad King <brad.king at kitware.com>
CommitDate: Wed Aug 6 09:23:15 2014 -0400
file: Refactor internal implementation of file(STRINGS)
Make room for encoding support.
diff --git a/Source/cmFileCommand.cxx b/Source/cmFileCommand.cxx
index e47365a..b99b1c7 100644
--- a/Source/cmFileCommand.cxx
+++ b/Source/cmFileCommand.cxx
@@ -596,11 +596,29 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
int output_size = 0;
std::vector<std::string> strings;
std::string s;
- int c;
while((!limit_count || strings.size() < limit_count) &&
(limit_input < 0 || static_cast<int>(fin.tellg()) < limit_input) &&
- (c = fin.get(), fin))
+ fin)
{
+ std::string current_str;
+
+ int c = fin.get();
+
+ if(c == '\r')
+ {
+ // Ignore CR character to make output always have UNIX newlines.
+ continue;
+ }
+
+ else if((c >= 0x20 && c < 0x7F) || c == '\t' ||
+ (c == '\n' && newline_consume))
+ {
+ // This is an ASCII character that may be part of a string.
+ // Cast added to avoid compiler warning. Cast is ok because
+ // c is guaranteed to fit in char by the above if...
+ current_str += static_cast<char>(c);
+ }
+
if(c == '\n' && !newline_consume)
{
// The current line has been terminated. Check if the current
@@ -621,26 +639,13 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
// Reset the string to empty.
s = "";
}
- else if(c == '\r')
- {
- // Ignore CR character to make output always have UNIX newlines.
- }
- else if((c >= 0x20 && c < 0x7F) || c == '\t' ||
- (c == '\n' && newline_consume))
+ else if(current_str.empty())
{
- // This is an ASCII character that may be part of a string.
- // Cast added to avoid compiler warning. Cast is ok because
- // c is guaranteed to fit in char by the above if...
- s += static_cast<char>(c);
- }
- else
- {
- // TODO: Support ENCODING option. See issue #10519.
// A non-string character has been found. Check if the current
// string matches the requirements. We require that the length
// be at least one no matter what the user specified.
if(s.length() >= minlen && s.length() >= 1 &&
- (!have_regex || regex.find(s.c_str())))
+ (!have_regex || regex.find(s.c_str())))
{
output_size += static_cast<int>(s.size()) + 1;
if(limit_output >= 0 && output_size >= limit_output)
@@ -654,10 +659,15 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
// Reset the string to empty.
s = "";
}
+ else
+ {
+ s += current_str;
+ }
+
- // Terminate a string if the maximum length is reached.
if(maxlen > 0 && s.size() == maxlen)
{
+ // Terminate a string if the maximum length is reached.
if(s.length() >= minlen &&
(!have_regex || regex.find(s.c_str())))
{
-----------------------------------------------------------------------
Summary of changes:
hooks/post-receive
--
CMake
More information about the Cmake-commits
mailing list