summaryrefslogtreecommitdiff
path: root/strings/ctype-uca-scanner_next.inl
blob: 79d25487b429f05caf20594d70ab8e6dd5710fa9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
/* Copyright (c) 2004, 2013, Oracle and/or its affiliates.
   Copyright (c) 2009, 2021, MariaDB   

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Library General Public
   License as published by the Free Software Foundation; version 2
   of the License.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with this library; if not, write to the Free
   Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
   MA 02110-1335  USA */


#ifdef SCANNER_NEXT_NCHARS

/*
  Return helpers for the "with nchars" variant of the scanner function.
  The result is a weight_and_nchars_t initialized positionally with the
  weight (_w) followed by the character count (_n).
  Macro parameters are parenthesized, so that any expression can be
  passed safely.
*/
#define SCANNER_NEXT_RETURN(_w,_n) \
  do { weight_and_nchars_t rc= {(_w), (_n)}; return rc; } while(0)

/*
  Return the first weight of the contraction _cnt.
  The character count is _ignorable_nchars plus
  my_contraction_char_length(_cnt).
*/
#define SCANNER_NEXT_RETURN_CONTRACTION(_cnt,_ignorable_nchars) \
  do { \
    weight_and_nchars_t rc= { (_cnt)->weight[0], \
                              (_ignorable_nchars) + \
                              my_contraction_char_length(_cnt) }; \
    return rc; \
  } while(0)

#else

/*
  Return helpers for the plain variant: only the weight is returned.
  The character count argument is intentionally never expanded here:
  the scanner function passes expressions that mention "ignorable_nchars",
  which is declared only when SCANNER_NEXT_NCHARS is defined.
*/
#define SCANNER_NEXT_RETURN(_w,_n) do { return (_w); } while (0)

#define SCANNER_NEXT_RETURN_CONTRACTION(_cnt,_ignorable_nchars) \
  do { return (_cnt)->weight[0]; } while(0)

#endif

/*
  Get the next weight from the string being scanned.

  Depending on SCANNER_NEXT_NCHARS, this template expands either to
  scanner_next(), which returns just the weight, or to
  scanner_next_with_nchars(), which returns the weight together with
  a character count.

  @param scanner  Scanner state. Its sbeg, wbeg, page and code members
                  are updated as characters and weights are consumed.
  @param nchars   (only with SCANNER_NEXT_NCHARS) Passed to
                  my_uca_context_weight_find() as the maximum contraction
                  length. Without SCANNER_NEXT_NCHARS the limit is
                  MY_UCA_MAX_CONTRACTION.

  @return  The weight, which is one of:
           - the next weight left over from the previous character
             (reported with the character count 0)
           - the first weight of the next non-ignorable character
             or contraction
           - an implicit weight, if the character has no weight page
           - 0xFFFD for a character greater than the level's maxchar
           - 0xFFFF for an incomplete or a bad byte sequence
           - -1 if the end of the string was reached
           With SCANNER_NEXT_NCHARS the character count reported with a
           newly scanned weight also includes the ignorable characters
           that were skipped before it.
*/
static inline
#ifdef SCANNER_NEXT_NCHARS
weight_and_nchars_t
MY_FUNCTION_NAME(scanner_next_with_nchars)(my_uca_scanner *scanner,
                                           size_t nchars)
#else
int
MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
#endif
{
#ifdef SCANNER_NEXT_NCHARS
  /* Number of ignorable characters skipped by the loop below */
  uint ignorable_nchars;
#define LOCAL_MAX_CONTRACTION_LENGTH nchars
#else
  /*
    "ignorable_nchars" is not declared in this variant. It is mentioned
    below only in arguments that the SCANNER_NEXT_RETURN* macros discard.
  */
#define LOCAL_MAX_CONTRACTION_LENGTH MY_UCA_MAX_CONTRACTION
#endif
  /*
    Check if the weights for the previous character have been
    already fully scanned. If yes, then get the next character and
    initialize wbeg to its weight string.
  */

  if (scanner->wbeg[0])
  {
    /*
      More weights left from the previous step.
      Return the next weight from the current expansion.
      Return "0" as "nchars". The real nchars was set on a previous
      iteration.
    */
    SCANNER_NEXT_RETURN(*scanner->wbeg++, 0);
  }

  /* Every iteration that does not return has skipped one ignorable character */
#ifdef SCANNER_NEXT_NCHARS
  for (ignorable_nchars= 0 ; ; ignorable_nchars++)
#else
  for ( ; ; )
#endif
  {
    const uint16 *wpage;
    my_wc_t wc[MY_UCA_MAX_CONTRACTION];
    int mblen;

    /* Get next character */
#if MY_UCA_ASCII_OPTIMIZE
    /* Get next ASCII character: a single byte, no MY_MB_WC() call needed */
    if (scanner->sbeg < scanner->send && scanner->sbeg[0] < 0x80)
    {
      wc[0]= scanner->sbeg[0];
      scanner->sbeg+= 1;

#if MY_UCA_COMPILE_CONTRACTIONS
      if (my_uca_needs_context_handling(scanner->level, wc[0]))
      {
        const MY_CONTRACTION *cnt= my_uca_context_weight_find(scanner, wc,
                                                  LOCAL_MAX_CONTRACTION_LENGTH);
        if (cnt)
          SCANNER_NEXT_RETURN_CONTRACTION(cnt, ignorable_nchars);
      }
#endif

      /* Characters below 0x80 always belong to weight page 0 */
      scanner->page= 0;
      scanner->code= (int) wc[0];
      scanner->wbeg= scanner->level->weights[0] + scanner->code * scanner->level->lengths[0];
      if (scanner->wbeg[0])
        SCANNER_NEXT_RETURN(*scanner->wbeg++, ignorable_nchars + 1);
      /* Skip ignorable character and continue the loop */
      continue;
    }
    else
#endif
    /* Get next MB character */
    if (((mblen= MY_MB_WC(scanner, wc, scanner->sbeg,
                                       scanner->send)) <= 0))
    {
      if (scanner->sbeg >= scanner->send)
      {
        /* No more bytes, end of line reached */
        SCANNER_NEXT_RETURN(-1, ignorable_nchars);
      }
      /*
        There are some more bytes left. Non-positive mblen means that
        we got an incomplete or a bad byte sequence. Consume mbminlen bytes,
        but don't go beyond the string range. The remaining length is
        checked before advancing, so that a pointer beyond "send" is
        never formed. Here sbeg < send, so the difference is positive.
      */
      if ((size_t) (scanner->send - scanner->sbeg) < scanner->cs->mbminlen)
        scanner->sbeg= scanner->send;
      else
        scanner->sbeg+= scanner->cs->mbminlen;
      /*
        Treat every complete or incomplete mbminlen unit as a weight which is
        greater than weight for any possible normal character.
        0xFFFF is greater than any possible weight in the UCA weight table.
      */
      SCANNER_NEXT_RETURN(0xFFFF, ignorable_nchars + 1);
    }

    scanner->sbeg+= mblen;
    if (wc[0] > scanner->level->maxchar)
    {
      /*
        Return 0xFFFD as weight for all characters greater than
        the maximum character supported by this level.
      */
      scanner->wbeg= nochar;
      SCANNER_NEXT_RETURN(0xFFFD, ignorable_nchars + 1);
    }

#if MY_UCA_COMPILE_CONTRACTIONS
    if (my_uca_needs_context_handling(scanner->level, wc[0]))
    {
      const MY_CONTRACTION *cnt= my_uca_context_weight_find(scanner, wc,
                                                LOCAL_MAX_CONTRACTION_LENGTH);
      if (cnt)
        SCANNER_NEXT_RETURN_CONTRACTION(cnt, ignorable_nchars);
    }
#endif

    /* Process single character */
    scanner->page= wc[0] >> 8;
    scanner->code= wc[0] & 0xFF;

    /*
      If weight page for wc[0] does not exist, then calculate
      the weight algorithmically
    */
    if (!(wpage= scanner->level->weights[scanner->page]))
      SCANNER_NEXT_RETURN(my_uca_scanner_next_implicit(scanner),
                          ignorable_nchars + 1);

    /* Calculate pointer to wc[0]'s weight, using page and offset */
    scanner->wbeg= wpage +
                   scanner->code * scanner->level->lengths[scanner->page];
    if (scanner->wbeg[0])
      break;
    /* Skip ignorable character and continue the loop */
  }

  /* A non-ignorable character was found: return its first weight */
  SCANNER_NEXT_RETURN(*scanner->wbeg++, ignorable_nchars + 1);
}

/*
  Clean up after this inclusion: undefine the SCANNER_NEXT_NCHARS switch
  (tested above, but not defined in this file) and the helper macros
  defined above, presumably so that the file can be included again
  with a different configuration.
*/
#undef SCANNER_NEXT_NCHARS
#undef SCANNER_NEXT_RETURN
#undef SCANNER_NEXT_RETURN_CONTRACTION
#undef LOCAL_MAX_CONTRACTION_LENGTH