summaryrefslogtreecommitdiff
path: root/gettext-tools/src/sentence.c
diff options
context:
space:
mode:
Diffstat (limited to 'gettext-tools/src/sentence.c')
-rw-r--r--gettext-tools/src/sentence.c194
1 files changed, 194 insertions, 0 deletions
diff --git a/gettext-tools/src/sentence.c b/gettext-tools/src/sentence.c
new file mode 100644
index 0000000..0a4883e
--- /dev/null
+++ b/gettext-tools/src/sentence.c
@@ -0,0 +1,194 @@
+/* Sentence handling.
+ Copyright (C) 2015 Free Software Foundation, Inc.
+ Written by Daiki Ueno <ueno@gnu.org>, 2015.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+/* Specification. */
+#include "sentence.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include "unistr.h"
+
+
+/* The minimal number of white spaces which should follow after the
+ end of sentence. */
+int sentence_end_required_spaces = 1;
+
+/* This function works in a similar way to 'forward-sentence' in
+ Emacs, which basically does a regular expression matching of:
+
+ [.?!\u2026]
+ []"'\u201d)}]*
+ \($\|[ \u00a0]$\|\t\|[ \u00a0]\{REQUIRED_SPACES\}\)
+
+ Since we are lacking a regular expression routine capable of
+ Unicode (though gnulib-lib/lib/regex.c provides a locale-dependent
+ version, we would rather avoid depending on it), apply a manually
+ constructed DFA, which consists of 8 states where 4 of them are a
+ terminal. */
+const char *
+sentence_end (const char *string, ucs4_t *ending_charp)
+{
+ const char *str = string;
+ const char *str_limit = string + strlen (str);
+ /* States of the DFA, 0 to 7, where 3, 5, 6, and 7 are a terminal. */
+ int state = 0;
+ /* Previous character before an end marker. */
+ ucs4_t ending_char = 0xfffd;
+ /* Possible starting position of the match, and the next starting
+ position if the current match fails. */
+ const char *match_start = NULL, *match_next = NULL;
+ /* Number of spaces. */
+ int spaces = 0;
+
+ while (str <= str_limit)
+ {
+ ucs4_t uc;
+ size_t length;
+
+ length = u8_mbtouc (&uc, (const unsigned char *) str, str_limit - str);
+
+ if (state == 0)
+ {
+ switch (uc)
+ {
+ case '.': case '?': case '!': case 0x2026:
+ state = 1;
+ match_start = str;
+ match_next = str + length;
+ ending_char = uc;
+ spaces = 0;
+ break;
+
+ default:
+ break;
+ }
+
+ str += length;
+ continue;
+ }
+
+ if (state == 1)
+ {
+ switch (uc)
+ {
+ case ']': case '"': case '\'': case ')': case '}': case 0x201d:
+ state = 2;
+ break;
+
+ case '\0': case '\n':
+ /* State 3. */
+ *ending_charp = ending_char;
+ return match_start;
+
+ case ' ': case 0x00a0:
+ if (++spaces == sentence_end_required_spaces)
+ {
+ /* State 7. */
+ *ending_charp = ending_char;
+ return match_start;
+ }
+ state = 4;
+ break;
+
+ case '\t':
+ /* State 5. */
+ *ending_charp = ending_char;
+ return match_start;
+
+ default:
+ str = match_next;
+ state = 0;
+ continue;
+ }
+
+ str += length;
+ continue;
+ }
+
+ if (state == 2)
+ {
+ switch (uc)
+ {
+ case ']': case '"': case '\'': case ')': case '}': case 0x201d:
+ break;
+
+ case '\0': case '\n':
+ /* State 3. */
+ *ending_charp = ending_char;
+ return match_start;
+
+ case ' ': case 0x00a0:
+ if (++spaces == sentence_end_required_spaces)
+ {
+ /* State 7. */
+ *ending_charp = ending_char;
+ return match_start;
+ }
+ state = 4;
+ break;
+
+ case '\t':
+ /* State 5. */
+ *ending_charp = ending_char;
+ return match_start;
+
+ default:
+ state = 0;
+ str = match_next;
+ continue;
+ }
+
+ str += length;
+ continue;
+ }
+
+ if (state == 4)
+ {
+ switch (uc)
+ {
+ case '\0': case '\n':
+ /* State 6. */
+ *ending_charp = ending_char;
+ return match_start;
+
+ case ' ': case 0x00a0:
+ if (++spaces == sentence_end_required_spaces)
+ {
+ /* State 7. */
+ *ending_charp = ending_char;
+ return match_start;
+ }
+ break;
+
+ default:
+ state = 0;
+ str = match_next;
+ continue;
+ }
+
+ str += length;
+ continue;
+ }
+ }
+
+ *ending_charp = 0xfffd;
+ return str_limit;
+}