summaryrefslogtreecommitdiff
path: root/splitter/splitter.pl
diff options
context:
space:
mode:
Diffstat (limited to 'splitter/splitter.pl')
-rwxr-xr-xsplitter/splitter.pl344
1 files changed, 344 insertions, 0 deletions
diff --git a/splitter/splitter.pl b/splitter/splitter.pl
new file mode 100755
index 0000000..67b1f08
--- /dev/null
+++ b/splitter/splitter.pl
@@ -0,0 +1,344 @@
+#!/usr/bin/perl
+#
+#***************************************************************************************************
+# SENTENCE SPLITTER
+# Author: Paul Clough {cloughie@dcs.shef.ac.uk}
+#
+
+# This program is originally based on the sentence splitter program
+# published by Paul Clough. Version 1.0, available form
+# http://ir.shef.ac.uk/cloughie/software.html (splitter.zip)
+# The original program without a license.
+
+# Modifications to the original by Daniel M German and Y. Manabe,
+# which are under the following license:
+#
+# This patch is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This patch is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this patch. If not, see <http://www.gnu.org/licenses/>.
+
+
+
+use strict;
+
+# This program is originally based on the sentence splitter program
+# published by Paul Clough. Version 1.0, but then it was mostly rewritten
+# His ideas, however, linger in here (and his dictionary of abbreviations)
+
+
+my $dictionary = "splitter.dict";
+my $abbrv_file = "splitter.abv";
+my $len = 0;
+my %COMMON_TERMS = ();
+my %ABBREVIATIONS = ();
+my $output_file = $ARGV[0];
+
+# where are we running the splitter from?
+my $path = $0;
+$path =~ s/[^\/]+$//;
+if ($path eq "") {
+ $path = "./";
+}
+$dictionary = $path . $dictionary;
+$abbrv_file = $path . $abbrv_file;
+
+die "Usage $0 <filename>.comments" unless $ARGV[0] =~ /\.comments$/;
+
+
+die "Input file name should end in '.comments' [$output_file]" unless $output_file =~ s/\.comments$/.sentences/;
+
+open(OUT, ">$output_file") or die("Unable to create output file [$output_file]");
+
+
+#print length($opt_o);
+
+# Load in the dictionary and find the common words.
+# Here, we assume the words in upper case are simply names and one
+# word per line - i.e. in same form as /usr/dict/words
+&loadDictionary;
+
+# Same assumptions as for dictionary
+&loadAbbreviations;
+
+my $text;
+# open(FILE, $opt_f) or die "Can't open $opt_f for reading\n";
+
+my $line;
+while (defined ($line = <>)) {
+ $text.= $line;
+}
+
+# append a newline just in case
+$text.="\n";
+
+# - is used to create lines
+# = is used to create lines
+$text =~ s@\+?\-{3,1000}\+?@ @gmx;
+$text =~ s@={3,1000}@ @gmx;
+$text =~ s@:{3,1000}@ @gmx;
+$text =~ s@\*{3,1000}@ @gmx;
+
+# some characters are used for prettyprinting but never appear in sentences
+$text =~ s@\|+@ @gmx;
+$text =~ s@\\+@ @gmx;
+
+# let us deal with /* before we do anything
+$text =~ s@^[ \t]*/\*@@gmx;
+$text =~ s/\*\/[ \t]*$//gmx;
+$text =~ s@([^:])// @$1@gmx;
+
+
+# Replace /\r\n/ with \n only
+$text =~ s/\r\n/\n/g;
+
+
+# now, try to replace the leading/ending character of each line #/-, at most 3 heading characters
+# and each repeated as many times as necessaary
+$text =~ s/^[ \t]{0,3}[\*\#\/\;]+//gmx;
+$text =~ s/^[ \t]{0,3}[\-]+//gmx;
+
+$text =~ s/[\*\#\/]+[ \t]{0,3}$//gmx;
+$text =~ s/[\-]+[ \t]{0,3}$//gmx;
+
+# now, try to replace the ending character of each line if it is * or #
+$text =~ s/[\*\#]+//gmx;
+
+# at this point we have lines with nothing but spaces, let us get rid of them
+$text =~ s/^[ \t]+$/\n/gm;
+
+# let us try the following trick
+# We first get rid of \t and replace it with ' '
+# we then use \t as a "single line separator" and \n as multiple line.
+# so we can match each with a single character.
+$text =~ tr/\t/ /;
+
+$text =~ s/\n(?!\n)/\t/g;
+$text =~ s/\n\n+/\n/g;
+$text .= "\n";
+
+while ($text =~ /^([^\n]*)\n/gsm ) {
+ my $curr = $1;
+# print "<<$curr\n<<\n";
+
+ # let us count the number of alphabetic chars to check if we are skipping anything we should not
+ my $count = 0;
+ for my $i (0..length($curr)-1) {
+ my $c = substr($curr,$i,1);
+ $count++ if ($c ge 'A' && $c le 'z');
+ }
+
+ my @sentences = Split_Text($curr);
+
+ my $count2 = 0;
+
+ foreach my $s (@sentences) {
+ for my $i (0..length($s)-1) {
+ my $c = substr($s,$i,1);
+ $count2++ if ($c ge 'A' && $c le 'z');
+ }
+ print OUT Clean_Sentence($s) , "\n";
+ }
+ if ($count != $count2) {
+ print STDERR "-------------------------------------\n";
+ print STDERR "[$curr]\n";
+ foreach my $s (@sentences) {
+ print STDERR Clean_Sentence($s) , "\n";
+ }
+ die "Number of printable chars does not match! [$count][$count2]";
+ }
+}
+close OUT;
+#print "$text\n";
+
+exit;
+
+
+
+
+#***************************************************************************************************
+
+#***************************************************************************************************
+# procedures
+#***************************************************************************************************
+
+sub Clean_Sentence
+{
+ ($_) = @_;
+ # check for trailing bullets of different types
+
+ s/^o //;
+ s/^\s*[0-9]+\s*[\-\)]//;
+ s/^[ \t]+//;
+ s/[ \t]+$//;
+ # remove a trailing -
+ s/^[ \t]*[\-\.\s*] +//;
+
+ # replace quotes
+ s/\s+/ /g;
+
+ s/['"`]+/<quotes>/g;
+
+
+ s/:/<colon>/g;
+
+ s/\.+$/./;
+
+ die if /\n/m;
+
+ return $_;
+
+}
+
+
+sub Split_Text
+{
+ my ($text) = @_;
+ my $len = 0;
+ my $next_word;
+ my $last_word;
+ my $stuff_after_period;
+ my $puctuation;
+ my @result;
+ my $after;
+ my $currentSentence = "";
+ # this breaks the sentence into
+ # 1. Any text before a separator
+ # 2. The separator [.!?:\n]
+ # 3.
+ while ($text =~ /^
+ ([^\.\!\?\:\n]*) #
+ ([\.\!\?\:\n])
+ (?=(.?))
+ /xsm) { #/(?:(?=([([{\"\'`)}\]<]*[ ]+)[([{\"\'`)}\] ]*([A-Z0-9][a-z]*))|(?=([()\"\'`)}\<\] ]+)\s))/sm ) {
+ $text = $'; #';
+ my $sentenceMatch = $1;
+ my $sentence = $1 . $2;
+ my $punctuation = $2;
+ $after = $3;
+
+ # if next character is not a space, then we are not in a sentence"
+ if ($after ne " " && $after ne "\t") {
+ $currentSentence .= $sentence;
+ next;
+ }
+ #at this point we know that there is a space after
+ if ($punctuation eq ":" || $punctuation eq "?" || $punctuation eq "!") {
+ # let us consider this right here a beginning of a sentence
+ push @result, $currentSentence . $sentence;
+ $currentSentence = "";
+ next;
+ }
+ if ($punctuation eq ".") {
+ # we have a bunch of alternatives
+ # for the time being just consider a new sentence
+
+ # TODO
+ # simple heuristic... let us check that the next words are not the beginning of a sentence
+ # in our library
+ # ENDTODO
+
+ # is the last word an abbreviation? For this the period has to follow the word
+ # this expression might have to be updated to take care of special characters in names :(
+ if ($sentenceMatch =~ /(.?)([^[:punct:]\s]+)$/) {
+ my $before = $1;
+ my $lastWord = $2;
+ #is it an abbreviation
+
+ if (length($lastWord) == 1 ) {
+ # single character abbreviations are special...
+ # we will assume they never split the sentence if they are capitalized.
+ if (($lastWord ge "A") and
+ ($lastWord le "Z")) {
+ $currentSentence .= $sentence;
+ next;
+ }
+ print "last word an abbrev $sentenceMatch lastword [$lastWord] before [$before]\n";
+
+ # but some are lowercase!
+ if (($lastWord eq "e") or
+ ($lastWord eq "i")) {
+ $currentSentence .= $sentence;
+ next;
+ }
+ print "2 last word an abbrev $sentenceMatch lastword [$lastWord] before [$before]\n";
+ } else {
+
+ $lastWord = lc $lastWord;
+
+ # only accept abbreviations if the previous char to the abbrev is space or
+ # is empty (beginning of line). This avoids things like .c
+ if (length($before) > 0 and $before eq ' ' and $ABBREVIATIONS{$lastWord}) {
+
+ $currentSentence .= $sentence;
+ next;
+ } else {
+ # just keep going, we handle this case below
+ }
+ }
+
+ }
+
+ push @result, $currentSentence . $sentence;
+ $currentSentence = "";
+ next;
+ }
+ die "We have not dealt with this case";
+ }
+ push @result, $currentSentence . $text;
+
+ #Print_Non_Sentence($text,"\n","");
+ return @result;
+
+}
+
+sub loadDictionary {
+
+ # Initialise var
+ my $common_term = "";
+
+ if (open(DICT, $dictionary)) {
+
+ while (defined ($line = <DICT>)) {
+ chomp($line);
+ if ($line !~ /^[A-Z]/) {
+ $COMMON_TERMS{$line} = 1;
+ }
+
+ }
+
+ close(DICT);
+ } else {
+ die "cannot open dictionary file $dictionary: $!";
+ }
+}
+
+sub loadAbbreviations
+{
+
+ # Initialise var
+ my $abbrv_term = "";
+
+ if (open(ABBRV, $abbrv_file)) {
+
+ while (defined ($line = <ABBRV>)) {
+ chomp($line);
+ $ABBREVIATIONS{$line} = $line;
+ }
+
+ close(ABBRV);
+ } else {
+ die "cannot open dictionary file $abbrv_file: $!";
+ }
+}
+
+
+#***************************************************************************************************