;;;; Source: module/texinfo/string-utils.scm
;;;; (blob 42074d334a4a7ed2df28bb30d7d5a87703026c7e)
;;;; (texinfo string-utils) -- text filling and wrapping 
;;;;
;;;;    Copyright (C) 2009, 2013  Free Software Foundation, Inc.
;;;;    Copyright (C) 2003  Richard Todd
;;;; 
;;;; This library is free software; you can redistribute it and/or
;;;; modify it under the terms of the GNU Lesser General Public
;;;; License as published by the Free Software Foundation; either
;;;; version 3 of the License, or (at your option) any later version.
;;;; 
;;;; This library is distributed in the hope that it will be useful,
;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;;;; Lesser General Public License for more details.
;;;; 
;;;; You should have received a copy of the GNU Lesser General Public
;;;; License along with this library; if not, write to the Free Software
;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;;;; 

;;; Commentary:
;; Module @samp{(texinfo string-utils)} provides various string-related
;; functions useful to Guile's texinfo support.
;;; Code:

(define-module (texinfo string-utils)
  #:use-module (srfi srfi-13)
  #:use-module (srfi srfi-14)
  #:export (escape-special-chars
            transform-string
            expand-tabs
            center-string
            left-justify-string
            right-justify-string
            collapse-repeated-chars
            make-text-wrapper
            fill-string
            string->wrapped-lines))

(define* (transform-string str match? replace #:optional (start #f) (end #f))
  "Return a copy of @var{str} in which every character accepted by
@var{match?} has been replaced according to @var{replace}.

@var{match?} may be a procedure, a character, a string, or a boolean.
If @var{match?} is a procedure, it is called with each character and
should return true for a match.  If @var{match?} is a character, it is
compared to each string character using @code{char=?}.  If
@var{match?} is a string, any character found in that string is a
match.  @code{#t} makes every character a match and @code{#f} makes
none match.  Any other value throws @code{bad-type}.

If @var{replace} is a procedure, it is called with the matched
character and the value it returns is sent to the output string via
@samp{display}.  If @var{replace} is anything else, it is itself sent
to the output string via @samp{display}.

Note that the replacement for a matched character does not need to be
a single character.  That is what differentiates this procedure from
@samp{string-map}, and what makes it useful for applications such as
converting @samp{#\\&} to @samp{\"&amp;\"} in web page text.  Some
other procedures in this module are just wrappers around common uses
of @samp{transform-string}.  Transformations not possible with this
procedure should probably be done with regular expressions.

If @var{start} and @var{end} are given, they control which portion of
the string undergoes transformation.  The entire input string is
still output, though.  So, if @var{start} is @samp{5}, the first five
characters of @var{str} appear unchanged in the returned string.

@lisp
; these two are equivalent...
 (transform-string str #\\space #\\-) ; change all spaces to -'s
 (transform-string str (lambda (c) (char=? #\\space c)) #\\-)
@end lisp"
  ;; Normalise MATCH? into a one-argument predicate.  The four accepted
  ;; types are disjoint, so the order of the clauses does not matter.
  (define matches?
    (cond ((procedure? match?) match?)
          ((char? match?) (lambda (ch) (char=? match? ch)))
          ((string? match?) (lambda (ch) (string-index match? ch)))
          ((boolean? match?) (lambda (ch) match?))
          (else (throw 'bad-type "expected #t, char, string, or procedure"))))

  ;; Write the replacement for CH to PORT: either the result of calling
  ;; REPLACE on it, or REPLACE itself when it is not a procedure.
  (define (emit-replacement ch port)
    (display (if (procedure? replace) (replace ch) replace) port))

  ;; An output port is used rather than string-fold, which had proven
  ;; slower for this job.
  (call-with-output-string
    (lambda (port)
      ;; Anything before START is copied through untouched.
      (when (and start (<= start (string-length str)))
        (display (substring str 0 start) port))

      ;; The selected region is rewritten one character at a time.
      (string-for-each
       (lambda (ch)
         (if (matches? ch)
             (emit-replacement ch port)
             (write-char ch port)))
       str
       (or start 0)
       (or end (string-length str)))

      ;; Anything from END onwards is copied through untouched.
      (when (and end (< end (string-length str)))
        (display (substring str end) port)))))

(define* (expand-tabs str #:optional (tab-size 8))
  "Return a copy of @var{str} in which every tab character has been
replaced by @var{tab-size} spaces.  @var{tab-size} defaults to 8.
Each tab always becomes the same number of spaces, whatever column it
appears in.

With a tab size of 8 this is equivalent to: @lisp
 (transform-string str #\\tab \"        \")
@end lisp"
  (let ((spaces (make-string tab-size #\space)))
    (transform-string str #\tab spaces)))

(define (escape-special-chars str special-chars escape-char)
  "Return a copy of @var{str} in which every special character is
preceded by @var{escape-char}.

@var{special-chars} is either a single character or a string holding
all of the special characters.

@lisp
;; make a string regexp-safe...
 (escape-special-chars \"***(Example String)***\"
                      \"[]()/*.\"
                      #\\\\)
=> \"\\\\*\\\\*\\\\*\\\\(Example String\\\\)\\\\*\\\\*\\\\*\"

;; a single char can be escaped as well...
 (escape-special-chars \"richardt@@vzavenue.net\"
                      #\\@@
                      #\\@@)
=> \"richardt@@@@vzavenue.net\"
@end lisp"
  ;; A lone character is compared with char=?; anything else is
  ;; treated as a string and searched for the character.
  (define special?
    (if (char? special-chars)
        (lambda (ch) (char=? ch special-chars))
        (lambda (ch) (string-index special-chars ch))))

  ;; A match is replaced by the escape character followed by itself.
  (define (escaped ch)
    (string escape-char ch))

  (transform-string str special? escaped))

(define* (center-string str #:optional (width 80) (chr #\space) (rchr #f))
  "Return @var{str} centered in a field of @var{width} characters.
Padding on the left is done with @var{chr}; padding on the right is
done with @var{rchr} if it is provided, and with @var{chr} otherwise.
When the padding cannot be split evenly, the extra character goes on
the right.  The default @var{width} is 80 and the default @var{chr} is
@samp{#\\space}.  The string is never truncated: a string that is
already at least @var{width} characters long is returned unchanged.
@lisp
 (center-string \"Richard Todd\" 24)
=> \"      Richard Todd      \"

 (center-string \" Richard Todd \" 24 #\\=)
=> \"===== Richard Todd =====\"

 (center-string \" Richard Todd \" 24 #\\< #\\>)
=> \"<<<<< Richard Todd >>>>>\"
@end lisp"
  (let* ((gap (- width (string-length str)))
         ;; each side gets half of the gap, never a negative amount
         (half (max (quotient gap 2) 0))
         (left-fill (make-string half chr))
         ;; the right side falls back to CHR when RCHR was not supplied
         (right-char (or rchr chr))
         (right-fill (if (char=? right-char chr)
                         left-fill
                         (make-string half right-char))))
    (cond ((<= gap 0) str)
          ;; an odd gap leaves one column over; it goes on the right
          ((odd? gap)
           (string-append left-fill str right-fill (string right-char)))
          (else
           (string-append left-fill str right-fill)))))

(define* (left-justify-string str #:optional (width 80) (chr #\space))
  "Return @var{str} padded on the right with @var{chr} so that it is
left justified in a field of @var{width} characters.  The default
@var{width} is 80.  The default @var{chr} is @samp{#\\space}.  Unlike
@samp{string-pad-right} from srfi-13, the string is never truncated: a
string that is already at least @var{width} characters long is
returned unchanged."
  (let* ((shortfall (- width (string-length str)))
         (filler (make-string (max shortfall 0) chr)))
    (cond ((<= shortfall 0) str)
          (else (string-append str filler)))))

(define* (right-justify-string str #:optional (width 80) (chr #\space))
  "Return @var{str} padded on the left with @var{chr} so that it is
right justified in a field of @var{width} characters.  The default
@var{width} is 80.  The default @var{chr} is @samp{#\\space}.  Unlike
@samp{string-pad} from srfi-13, the string is never truncated: a
string that is already at least @var{width} characters long is
returned unchanged."
  (let* ((shortfall (- width (string-length str)))
         (filler (make-string (max shortfall 0) chr)))
    (cond ((<= shortfall 0) str)
          (else (string-append filler str)))))

 (define* (collapse-repeated-chars str #:optional (chr #\space) (num 1))
   "Return a copy of @var{str} in which each run of consecutive
@var{chr} characters has been cut down to at most @var{num}
characters.  The default value for @var{chr} is @samp{#\\space}, and
the default value for @var{num} is 1.

@lisp
 (collapse-repeated-chars \"H  e  l  l  o\")
=> \"H e l l o\"
 (collapse-repeated-chars \"H--e--l--l--o\" #\\-)
=> \"H-e-l-l-o\"
 (collapse-repeated-chars \"H-e--l---l----o\" #\\- 2)
=> \"H-e--l--l--o\"
@end lisp"
   ;; The matcher handed to transform-string is stateful: it remembers
   ;; the previous character and how many repeats of CHR have followed
   ;; the first one in the current run.
   ;; LAST-SEEN starts out as a character that differs from CHR, so the
   ;; first character of the string can never count as a repeat.
   (define last-seen (if (char=? chr #\space) #\A #\space))
   (define extras 0)

   (define (surplus-repeat? c)
     (cond ((and (char=? c last-seen)
                 (char=? last-seen chr))
            ;; another CHR in the same run; it is surplus once the
            ;; number of repeats has reached NUM
            (set! extras (+ extras 1))
            (>= extras num))
           (else
            ;; the run (if any) is over; start again from this character
            (set! extras 0)
            (set! last-seen c)
            #f)))

   ;; surplus characters are deleted by replacing them with nothing
   (transform-string str surplus-repeat? ""))

;; Break STR into a list of chunks, each made of any leading whitespace
;; followed by one run of non-whitespace characters:
;;   <ws non-ws>  <ws non-ws>  etc.
;; Whitespace after the last word is not included in any chunk.
(define (split-by-single-words str)
  (let next-chunk ((from 0)
                   (chunks '()))
    (let* (;; where the next word begins, or #f if only whitespace is left
           (word-start (string-skip str char-set:whitespace from))
           ;; where that word ends, or #f if it runs to the end of STR
           (word-end (and word-start
                          (string-index str char-set:whitespace word-start))))
      (cond ((not word-start)
             (reverse chunks))
            ((not word-end)
             (reverse (cons (substring str from) chunks)))
            (else
             (next-chunk word-end
                         (cons (substring str from word-end) chunks)))))))

(define (end-of-sentence? str)
  "Return #t when STR likely denotes the end of a sentence: it is at
least two characters long and ends in a single period (not in two or
more consecutive periods)."
  (let ((final (- (string-length str) 1)))
    (and (> final 0)
         (char=? (string-ref str final) #\.)
         (not (char=? (string-ref str (- final 1)) #\.)))))

(define* (make-text-wrapper #:key
                            (line-width 80)
                            (expand-tabs? #t)
                            (tab-width 8)
                            (collapse-whitespace? #t)
                            (subsequent-indent "")
                            (initial-indent "")
                            (break-long-words? #t))
  "Returns a procedure that will split a string into lines according to the
given parameters.

@table @code
@item #:line-width
This is the target length used when deciding where to wrap lines.
Default is 80.

@item #:expand-tabs?
Boolean describing whether tabs in the input should be expanded. Default
is #t.

@item #:tab-width
If tabs are expanded, this will be the number of spaces to which they
expand. Default is 8.

@item #:collapse-whitespace?
Boolean describing whether the whitespace inside the existing text
should be removed or not.  Default is #t.

If text is already well-formatted, and is just being wrapped to fit in a
different width, then set this to @samp{#f}. This way, many common text
conventions (such as two spaces between sentences) can be preserved if
in the original text. If the input text spacing cannot be trusted, then
leave this setting at the default, and all repeated whitespace will be
collapsed down to a single space.  In that case an extra space is
inserted after a word ending in a single period, and that space counts
towards @code{#:line-width}.

@item #:initial-indent
Defines a string that will be put in front of the first line of wrapped
text. Default is the empty string, ``''.

@item #:subsequent-indent
Defines a string that will be put in front of all lines of wrapped
text, except the first one.  Default is the empty string, ``''.

@item #:break-long-words?
If a single word is too big to fit on a line, this setting tells the
wrapper what to do.  Defaults to #t, which will break up long words.
When set to #f, the line will be allowed, even though it is longer
than the defined @code{#:line-width}.  A long word is also left
unbroken when the indentation leaves no room on any line to break it.
@end table

The return value is a procedure of one argument, the input string, which
returns a list of strings, where each element of the list is one line."
  (lambda (str)
    ;; replace newlines with spaces
    (set! str (transform-string str #\newline #\space))

    ;; expand tabs if they wanted us to...
    (if expand-tabs?
        (set! str (expand-tabs str tab-width)))

    ;; collapse whitespace if they wanted us to...
    (if collapse-whitespace?
        (set! str (collapse-repeated-chars str)))

    ;; drop any whitespace from the front...
    (set! str (string-trim str))

    ;; now start breaking the text into lines...
    ;;   ans   -- finished lines, most recent first
    ;;   words -- chunks still to place, each with its leading whitespace
    ;;   line  -- the line being built, starting from its indent
    ;;   count -- number of chunks already placed on LINE
    (let loop ((ans '())
               (words (split-by-single-words str))
               (line initial-indent)
               (count 0))
      (if (null? words)
          ;; out of words? ...done!
          (reverse (if (> count 0)
                       (cons line ans)
                       ans))

          ;; not out of words...keep going...
          (let* ((length-left (- line-width
                                 (string-length line)))
                 ;; the first word on a line loses its leading whitespace
                 (next-word (if (= count 0)
                                (string-trim (car words))
                                (car words)))
                 ;; The text that is really appended when the word is
                 ;; accepted.  The extra space after a period is part of
                 ;; it, so that it is counted by the fit test below.
                 (addition (if (and collapse-whitespace?
                                    (end-of-sentence? line))
                               (string-append " " next-word)
                               next-word))
                 ;; Number of characters of a long word that can stay on
                 ;; this line; never negative, even when the indent alone
                 ;; is wider than the line.
                 (break-at (max length-left 0)))
            (cond
             ;; does the next entry fit?
             ((<= (string-length addition)
                  length-left)
              (loop ans
                    (cdr words)
                    (string-append line addition)
                    (+ count 1)))

             ;; ok, it didn't fit...is there already at least one word on the line?
             ((> count 0)
              ;; try to use it for the next line, then...
              (loop (cons line ans)
                    words
                    subsequent-indent
                    0))

             ;; ok, it didn't fit...and it's the first word.
             ;; were we told to break up long words?  Breaking only helps
             ;; if it makes progress: either part of the word stays on
             ;; this line, or the following lines have room for text.
             ((and break-long-words?
                   (or (> break-at 0)
                       (< (string-length subsequent-indent) line-width)))
              ;; break the line at the limit, since the user wants us to...
              (let ((leftover (substring next-word break-at)))
                (loop (cons (string-append line (substring next-word 0 break-at))
                            ans)
                      ;; nothing is left over when the word itself fit
                      ;; and only the sentence space did not
                      (if (string-null? leftover)
                          (cdr words)
                          (cons leftover (cdr words)))
                      subsequent-indent
                      0)))

             ;; it is the first word and we can't or shouldn't break it:
             ;; let the line run over the limit...
             (else
              (loop (cons (string-append line next-word)
                          ans)
                    (cdr words)
                    subsequent-indent
                    0))))))))

(define (string->wrapped-lines str . kwargs)
  "Wrap the text in string @var{str} according to the keyword
arguments given in @var{kwargs}, using the default for any setting
that is not given.  Returns a list of strings, one per formatted line.
The valid keyword arguments are those of @code{make-text-wrapper}."
  (let ((wrap (apply make-text-wrapper kwargs)))
    (wrap str)))

(define (fill-string str . kwargs)
  "Wrap the text in string @var{str} according to the keyword
arguments given in @var{kwargs}, using the default for any setting
that is not given.  Returns a single string in which the wrapped lines
are separated by newlines.  The valid keyword arguments are those of
@code{make-text-wrapper}."
  (let ((lines (apply string->wrapped-lines str kwargs)))
    (string-join lines "\n" 'infix)))