summaryrefslogtreecommitdiff
path: root/scss/selector.py
blob: c187f01c5b1dce2fe09491ae6d2c8a50655b76fb (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
from __future__ import print_function

import re
from warnings import warn

from scss.cssdefs import CSS2_PSEUDO_ELEMENTS

# Super dumb little selector parser.

# Yes, yes, this is a regex tokenizer.  The actual meaning of the
# selector doesn't matter; the parts are just important for matching up
# during @extend.

# Selectors have three levels: simple, combinator, comma-delimited.
# Each combinator can only appear once as a delimiter between simple
# selectors, so it can be thought of as a prefix.
# So this:
#     a.b + c, d#e
# parses into two Selectors with these structures:
#     [[' ', 'a', '.b'], ['+', 'c']]
#     [[' ', 'd', '#e']]
# Note that the first simple selector has an implied descendant
# combinator -- i.e., it is a descendant of the root element.
# TODO `*html` is incorrectly parsed as a single selector
# TODO this oughta be touched up for css4 selectors
# Consumed with SELECTOR_TOKENIZER.match(selector, pos) in
# Selector.parse_many; alternatives are tried in the order written, so the
# catch-all (\S+) branch only fires when nothing else matches.
SELECTOR_TOKENIZER = re.compile(r'''
    # Colons introduce pseudo-selectors, sometimes with parens
    # TODO doesn't handle quoted )
    [:]+ [-\w]+ (?: [(] .+? [)] )?

    # These guys are combinators -- note that a single space counts too
    | \s* [ +>~,] \s*

    # Square brackets are attribute tests
    # TODO: this doesn't handle ] within a string
    | \[ .+? \]

    # Dot and pound start class/id selectors.  Percent starts a Sass
    # extend-target faux selector.
    | [.#%] [-\w]+

    # Percentages are used for @keyframes
    | [-.\d]+ [%]

    # Plain identifiers, or single asterisks, are element names
    | [-\w]+
    | [*]

    # & is the sass replacement token
    | [&]

    # And as a last-ditch effort, just eat up to whitespace
    | (\S+)
''', re.VERBOSE | re.MULTILINE)


# Set of starting squiggles that are known to mean "this is a general
# commutative token", i.e., not an element.  Used by
# SimpleSelector.merge_into to bucket merged tokens into
# element / body / pseudo-element order.
BODY_TOKEN_SIGILS = frozenset('#.[:%')


def _is_combinator_subset_of(specific, general, is_first=True):
    """Return whether `specific` matches a non-strict subset of what `general`
    matches.
    """
    if is_first and general == ' ':
        # First selector always has a space to mean "descendent of root", which
        # still holds if any other selector appears above it
        return True

    if specific == general:
        return True

    if specific == '>' and general == ' ':
        return True

    if specific == '+' and general == '~':
        return True

    return False


class SimpleSelector(object):
    """A simple selector, by CSS 2.1 terminology: a combination of element
    name, class selectors, id selectors, and other criteria that all apply to a
    single element.

    Note that CSS 3 considers EACH of those parts to be a "simple selector",
    and calls a group of them a "sequence of simple selectors".  That's a
    terrible class name, so we're going with 2.1 here.

    For lack of a better name, each of the individual parts is merely called a
    "token".

    Note that it's possible to have zero tokens.  This isn't legal CSS, but
    it's perfectly legal Sass, since you might nest blocks like so:

        body > {
            div {
                ...
            }
        }
    """
    def __init__(self, combinator, tokens):
        # `combinator` is one of ' ', '>', '+', '~'; ' ' doubles as "no
        # explicit combinator" (descendant).
        self.combinator = combinator
        # TODO enforce that only one element name (including *) appears in a
        # selector.  only one pseudo, too.
        # TODO remove duplicates?
        self.tokens = tuple(tokens)

    def __repr__(self):
        return "<%s: %r>" % (type(self).__name__, self.render())

    def __hash__(self):
        return hash((self.combinator, self.tokens))

    def __eq__(self, other):
        if not isinstance(other, SimpleSelector):
            return NotImplemented

        return (
            self.combinator == other.combinator and
            self.tokens == other.tokens)

    @property
    def has_parent_reference(self):
        # 'self' is the legacy xCSS spelling of '&'
        return '&' in self.tokens or 'self' in self.tokens

    @property
    def has_placeholder(self):
        # Placeholders are Sass extend-target selectors like '%foo'
        for token in self.tokens:
            if token.startswith('%'):
                return True
        return False

    def is_superset_of(self, other, soft_combinator=False):
        """Return True iff this selector matches the same elements as `other`,
        and perhaps others.

        That is, ``.foo`` is a superset of ``.foo.bar``, because the latter is
        more specific.

        Set `soft_combinator` true to ignore the specific case of this selector
        having a descendent combinator and `other` having anything else.  This
        is for superset checking for ``@extend``, where a space combinator
        really means "none".
        """
        # Combinators must match, OR be compatible -- space is a superset of >,
        # ~ is a superset of +
        if soft_combinator and self.combinator == ' ':
            combinator_superset = True
        else:
            combinator_superset = (
                self.combinator == other.combinator or
                (self.combinator == ' ' and other.combinator == '>') or
                (self.combinator == '~' and other.combinator == '+'))

        # Fewer tokens means less specific, i.e. matches more elements
        return (
            combinator_superset and
            set(self.tokens) <= set(other.tokens))

    def replace_parent(self, parent_simples):
        """If ``&`` (or the legacy xCSS equivalent ``self``) appears in this
        selector, replace it with the given iterable of parent selectors.

        Returns a tuple of simple selectors.
        """
        assert parent_simples

        ancestors = parent_simples[:-1]
        parent = parent_simples[-1]

        # Only the FIRST parent reference is replaced; any later ones are
        # passed through as literal tokens
        did_replace = False
        new_tokens = []
        for token in self.tokens:
            if not did_replace and token in ('&', 'self'):
                did_replace = True
                new_tokens.extend(parent.tokens)
                if token == 'self':
                    warn(FutureWarning(
                        "The xCSS 'self' selector is deprecated and will be "
                        "removed in 2.0.  Use & instead.  ({0!r})"
                        .format(self)
                    ))
            else:
                new_tokens.append(token)

        if not did_replace:
            # This simple selector doesn't contain a parent reference so just
            # stick it on the end
            return parent_simples + (self,)

        # This simple selector was merged into the direct parent.
        merged_self = type(self)(parent.combinator, new_tokens)
        selector = ancestors + (merged_self,)
        # Our combinator goes on the first ancestor, i.e., substituting "foo
        # bar baz" into "+ &.quux" produces "+ foo bar baz.quux".  This means a
        # potential conflict with the first ancestor's combinator!
        root = selector[0]
        if not _is_combinator_subset_of(self.combinator, root.combinator):
            raise ValueError(
                "Can't sub parent {0!r} into {1!r}: "
                "combinators {2!r} and {3!r} conflict!"
                .format(
                    parent_simples, self, self.combinator, root.combinator))

        root = type(self)(self.combinator, root.tokens)
        selector = (root,) + selector[1:]
        return tuple(selector)

    def merge_into(self, other):
        """Merge two simple selectors together.  This is expected to be the
        selector being injected into `other` -- that is, `other` is the
        selector for a block using ``@extend``, and `self` is a selector being
        extended.

        Element tokens must come first, and pseudo-element tokens must come
        last, and there can only be one of each.  The final selector thus looks
        something like::

            [element] [misc self tokens] [misc other tokens] [pseudo-element]

        This method does not check for duplicate tokens; those are assumed to
        have been removed earlier, during the search for a hinge.
        """
        # TODO it shouldn't be possible to merge two elements or two pseudo
        # elements, /but/ it shouldn't just be a fatal error here -- it
        # shouldn't even be considered a candidate for extending!
        # TODO this is slightly inconsistent with ruby, which treats a trailing
        # set of self tokens like ':before.foo' as a single unit to be stuck at
        # the end.  but that's completely bogus anyway.
        element = []
        middle = []
        pseudo = []
        for token in self.tokens + other.tokens:
            if token in CSS2_PSEUDO_ELEMENTS or token.startswith('::'):
                pseudo.append(token)
            elif token[0] in BODY_TOKEN_SIGILS:
                middle.append(token)
            else:
                element.append(token)
        new_tokens = element + middle + pseudo

        # A space combinator means "none", so the other combinator wins;
        # genuinely conflicting combinators can't be reconciled
        if self.combinator == ' ' or self.combinator == other.combinator:
            combinator = other.combinator
        elif other.combinator == ' ':
            combinator = self.combinator
        else:
            raise ValueError(
                "Don't know how to merge conflicting combinators: "
                "{0!r} and {1!r}"
                .format(self, other))
        return type(self)(combinator, new_tokens)

    def difference(self, other):
        """Return a copy of this simple selector with all of `other`'s tokens
        removed.  The combinator is preserved.
        """
        # Hoisted out of the loop: previously this set was rebuilt once per
        # token, making the method quadratic
        other_tokens = set(other.tokens)
        new_tokens = tuple(
            token for token in self.tokens if token not in other_tokens)
        return type(self)(self.combinator, new_tokens)

    def render(self):
        """Return this simple selector as CSS text, e.g. ``'> a.b'``."""
        # TODO fail if there are no tokens, or if one is a placeholder?
        rendered = ''.join(self.tokens)
        if self.combinator == ' ':
            return rendered
        elif rendered:
            return self.combinator + ' ' + rendered
        else:
            return self.combinator


class Selector(object):
    """A single CSS selector: a chain of `SimpleSelector`s joined by
    combinators."""

    def __init__(self, simples):
        """Return a selector containing a sequence of `SimpleSelector`s.

        You probably want to use `parse_many` or `parse_one` instead.
        """
        # TODO enforce uniqueness
        self.simple_selectors = tuple(simples)

    @classmethod
    def parse_many(cls, selector):
        """Parse a comma-delimited selector string into a list of `Selector`s.

        Raises SyntaxError if the string can't be tokenized.
        """
        selector = selector.strip()
        ret = []

        # Parser state: the simple selectors finished so far, plus the
        # combinator and tokens of the simple selector currently being read.
        # (A dict so the closures below can mutate it.)
        pending = dict(
            simples=[],
            combinator=' ',
            tokens=[],
        )

        def promote_simple():
            # Finish off the current simple selector, if any
            if pending['tokens']:
                pending['simples'].append(
                    SimpleSelector(pending['combinator'], pending['tokens']))
                pending['combinator'] = ' '
                pending['tokens'] = []

        def promote_selector():
            # Finish off the current selector, if any.  A trailing bare
            # combinator (legal Sass, see SimpleSelector docstring) becomes a
            # token-less simple selector.
            promote_simple()
            if pending['combinator'] != ' ':
                pending['simples'].append(
                    SimpleSelector(pending['combinator'], []))
                pending['combinator'] = ' '
            if pending['simples']:
                ret.append(cls(pending['simples']))
            pending['simples'] = []

        pos = 0
        while pos < len(selector):
            # TODO i don't think this deals with " + " correctly.  anywhere.
            # TODO this used to turn "1.5%" into empty string; why does error
            # not work?
            m = SELECTOR_TOKENIZER.match(selector, pos)
            if not m:
                # TODO prettify me
                raise SyntaxError("Couldn't parse selector: %r" % (selector,))

            token = m.group(0)
            pos += len(token)

            # Kill any extraneous space, BUT make sure not to turn a lone space
            # into an empty string
            token = token.strip() or ' '

            if token == ',':
                # End current selector
                promote_selector()
            elif token in ' +>~':
                # End current simple selector
                promote_simple()
                pending['combinator'] = token
            else:
                # Add to pending simple selector
                pending['tokens'].append(token)

        # Deal with any remaining pending bits
        promote_selector()

        return ret

    @classmethod
    def parse_one(cls, selector_string):
        """Parse a string containing exactly one selector.

        Raises ValueError if it contains zero or several comma-delimited
        selectors.
        """
        selectors = cls.parse_many(selector_string)
        if len(selectors) != 1:
            raise ValueError(
                "Expected exactly one selector, got %d in %r"
                % (len(selectors), selector_string))

        return selectors[0]

    def __repr__(self):
        return "<%s: %r>" % (type(self).__name__, self.render())

    def __hash__(self):
        return hash(self.simple_selectors)

    def __eq__(self, other):
        if not isinstance(other, Selector):
            return NotImplemented

        return self.simple_selectors == other.simple_selectors

    @property
    def has_parent_reference(self):
        # True if any simple selector contains '&' (or legacy 'self')
        for simple in self.simple_selectors:
            if simple.has_parent_reference:
                return True
        return False

    @property
    def has_placeholder(self):
        # True if any simple selector contains a '%placeholder' token
        for simple in self.simple_selectors:
            if simple.has_placeholder:
                return True
        return False

    def with_parent(self, parent):
        """Return a new Selector with the parent selector substituted in:
        either spliced in at each parent reference (``&``), or, if there is
        none, prepended.
        """
        saw_parent_ref = False

        new_simples = []
        for simple in self.simple_selectors:
            if simple.has_parent_reference:
                new_simples.extend(simple.replace_parent(parent.simple_selectors))
                saw_parent_ref = True
            else:
                new_simples.append(simple)

        if not saw_parent_ref:
            new_simples = parent.simple_selectors + tuple(new_simples)

        return type(self)(new_simples)

    def lookup_key(self):
        """Build a key from the "important" parts of a selector: elements,
        classes, ids.  Pseudo-selectors and attribute tests are skipped.
        """
        parts = set()
        for node in self.simple_selectors:
            for token in node.tokens:
                if token[0] not in ':[':
                    parts.add(token)

        if not parts:
            # Should always have at least ONE key; selectors with no elements,
            # no classes, and no ids can be indexed as None to avoid a scan of
            # every selector in the entire document
            parts.add(None)

        return frozenset(parts)

    def is_superset_of(self, other):
        """Return True iff every simple selector in `other` is matched, in
        order, by a (superset) simple selector of ours.
        """
        assert isinstance(other, Selector)

        idx = 0
        for other_node in other.simple_selectors:
            # Scan forward for one of our simple selectors that's a superset
            # of `other_node`
            while idx < len(self.simple_selectors):
                node = self.simple_selectors[idx]
                idx += 1

                if node.is_superset_of(other_node):
                    break
            else:
                # Ran out of our own simple selectors without finding a match
                # for `other_node` -- so we can't be a superset.  (This
                # previously fell through and wrongly returned True when the
                # unmatched node was the last one.)
                return False

        return True

    def substitute(self, target, replacement):
        """Return a list of selectors obtained by replacing the `target`
        selector with `replacement`.

        Herein lie the guts of the Sass @extend directive.

        In general, for a selector ``a X b Y c``, a target ``X Y``, and a
        replacement ``q Z``, return the selectors ``a q X b Z c`` and ``q a X b
        Z c``.  Note in particular that no more than two selectors will be
        returned, and the permutation of ancestors will never insert new simple
        selectors "inside" the target selector.
        """
        # Find the target in the parent selector, and split it into
        # before/after
        p_before, p_extras, p_after = self.break_around(target.simple_selectors)

        # The replacement has no hinge; it only has the most specific simple
        # selector (which is the part that replaces "self" in the parent) and
        # whatever preceding simple selectors there may be
        r_trail = replacement.simple_selectors[:-1]
        r_extras = replacement.simple_selectors[-1]

        # TODO what if the prefix doesn't match?  who wins?  should we even get
        # this far?
        focal_nodes = (p_extras.merge_into(r_extras),)

        befores = _merge_selectors(p_before, r_trail)

        cls = type(self)
        return [
            cls(before + focal_nodes + p_after)
            for before in befores]

    def break_around(self, hinge):
        """Given a simple selector node contained within this one (a "hinge"),
        break it in half and return a parent selector, extra specifiers for the
        hinge, and a child selector.

        That is, given a hinge X, break the selector A + X.y B into A, + .y,
        and B.
        """
        hinge_start = hinge[0]
        for i, node in enumerate(self.simple_selectors):
            # In this particular case, a ' ' combinator actually means "no" (or
            # any) combinator, so it should be ignored
            if hinge_start.is_superset_of(node, soft_combinator=True):
                start_idx = i
                break
        else:
            raise ValueError(
                "Couldn't find hinge %r in compound selector %r" %
                (hinge_start, self.simple_selectors))

        for i, hinge_node in enumerate(hinge):
            if i == 0:
                # We just did this
                continue

            self_node = self.simple_selectors[start_idx + i]
            if hinge_node.is_superset_of(self_node):
                continue

            # TODO this isn't true; consider finding `a b` in `a c a b`
            raise ValueError(
                "Couldn't find hinge %r in compound selector %r" %
                (hinge_node, self.simple_selectors))

        end_idx = start_idx + len(hinge) - 1

        # The hinge's last node may be less specific than ours; whatever is
        # left over ("extras") must be carried into the replacement
        focal_node = self.simple_selectors[end_idx]
        extras = focal_node.difference(hinge[-1])

        return (
            self.simple_selectors[:start_idx],
            extras,
            self.simple_selectors[end_idx + 1:])

    def render(self):
        """Return this selector as CSS text."""
        return ' '.join(simple.render() for simple in self.simple_selectors)


def _merge_selectors(left, right):
    """Given two selector chains (lists of simple selectors), return an
    iterable of selector chains representing elements matched by both of them.

    This operation is not exact, and involves some degree of fudging -- the
    wackier and more divergent the input, the more fudging.  It's meant to be
    what a human might expect rather than a precise covering of all possible
    cases.  Most notably, when the two input chains have absolutely nothing in
    common, the output is merely ``left + right`` and ``right + left`` rather
    than all possible interleavings.
    """
    # An empty side can't conflict with anything; the other side is the sole
    # candidate.  (Must still be wrapped in a list of chains!)
    if not left or not right:
        return [left or right]

    # The shared "skeleton" of both chains, as (left_idx, right_idx, merged)
    common = longest_common_subsequence(left, right, _merge_simple_selectors)

    # Seed with one empty chain so the weaving below has something to extend
    candidates = [()]

    left_pos = 0
    right_pos = 0
    for left_idx, right_idx, merged in common:
        # Weave together everything between the previous shared node and this
        # one, then tack the shared node on the end
        candidates = _weave_conflicting_selectors(
            candidates,
            left[left_pos:left_idx],
            right[right_pos:right_idx],
            (merged,))

        left_pos = left_idx + 1
        right_pos = right_idx + 1

    # Finally, weave in whatever trails the last shared node
    candidates = _weave_conflicting_selectors(
        candidates,
        left[left_pos:],
        right[right_pos:])

    return candidates


def _weave_conflicting_selectors(prefixes, a, b, suffix=()):
    """Part of the selector merge algorithm above.  Not useful on its own.  Pay
    no attention to the man behind the curtain.
    """
    # OK, what this actually does: given a list of selector chains, two
    # "conflicting" selector chains, and an optional suffix, return a new list
    # of chains like this:
    #   prefix[0] + a + b + suffix,
    #   prefix[0] + b + a + suffix,
    #   prefix[1] + a + b + suffix,
    #   ...
    # In other words, this just appends a new chain to each of a list of given
    # chains, except that the new chain might be the superposition of two
    # other incompatible chains.
    both = a and b
    for prefix in prefixes:
        yield prefix + a + b + suffix
        if both:
            # Only use both orderings if there's an actual conflict!
            yield prefix + b + a + suffix


def _merge_simple_selectors(a, b):
    """Merge two simple selectors, for the purposes of the LCS algorithm below.

    In practice this returns the more specific selector if one is a subset of
    the other, else it returns None.
    """
    # TODO what about combinators
    if a.is_superset_of(b):
        return b
    elif b.is_superset_of(a):
        return a
    else:
        return None


def longest_common_subsequence(a, b, mergefunc=None):
    """Find the longest common subsequence between two iterables.

    The longest common subsequence is the core of any diff algorithm: it's the
    longest sequence of elements that appears in both parent sequences in the
    same order, but NOT necessarily consecutively.

    Original algorithm borrowed from Wikipedia:
    http://en.wikipedia.org/wiki/Longest_common_subsequence_problem#Code_for_the_dynamic_programming_solution

    This function is used only to implement @extend, largely because that's
    what the Ruby implementation does.  Thus it's been extended slightly from
    the simple diff-friendly algorithm given above.

    What @extend wants to know is whether two simple selectors are compatible,
    not just equal.  To that end, you must pass in a "merge" function to
    compare a pair of elements manually.  It should return `None` if they are
    incompatible, and a MERGED element if they are compatible -- in the case of
    selectors, this is whichever one is more specific.

    Because of this fuzzier notion of equality, the return value is a list of
    ``(a_index, b_index, value)`` tuples rather than items alone.
    """
    if mergefunc is None:
        # Fall back to plain equality: merge to the element itself
        def mergefunc(x, y):
            return x if x == y else None

    # Precalculate the pairwise merges, since mergefunc can be a tad expensive
    # and every pair is compared at least once
    merged_pairs = {
        (ai, bi): mergefunc(aval, bval)
        for ai, aval in enumerate(a)
        for bi, bval in enumerate(b)
    }

    # Build the "length" matrix, which provides the length of the LCS for
    # arbitrary-length prefixes.  The -1 row/column exists only to support the
    # empty-prefix base case
    lengths = {}
    for ai in range(-1, len(a)):
        for bi in range(-1, len(b)):
            if ai == -1 or bi == -1:
                lengths[ai, bi] = 0
            elif merged_pairs[ai, bi]:
                lengths[ai, bi] = lengths[ai - 1, bi - 1] + 1
            else:
                lengths[ai, bi] = max(
                    lengths[ai, bi - 1], lengths[ai - 1, bi])

    # The interesting part.  The key insight is that the bottom-right value in
    # the length matrix must be the length of the LCS because of how the matrix
    # is defined, so all that's left to do is backtrack from the ends of both
    # sequences in whatever way keeps the LCS as long as possible, and keep
    # track of the equal pairs of elements we see along the way.
    # Wikipedia does this with recursion, but the algorithm is trivial to
    # rewrite as a loop, as below.
    ai = len(a) - 1
    bi = len(b) - 1
    matches = []
    while ai >= 0 and bi >= 0:
        merged = merged_pairs[ai, bi]
        if merged is not None:
            # Compatible pair: record it and step diagonally
            matches.append((ai, bi, merged))
            ai -= 1
            bi -= 1
        elif lengths[ai, bi - 1] > lengths[ai - 1, bi]:
            bi -= 1
        else:
            ai -= 1

    # Backtracking produced the latest items first, which is backwards
    matches.reverse()
    return matches