summaryrefslogtreecommitdiff
path: root/app/assets/javascripts/content_editor/services/hast_to_prosemirror_converter.js
blob: 28a50adca6b944adf243e565904ba11edb0c47db (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
/**
 * This module implements a function that converts a Hast Abstract
 * Syntax Tree (AST) to a ProseMirror document.
 *
 * It is based on the prosemirror-markdown’s from_markdown module
 * https://github.com/ProseMirror/prosemirror-markdown/blob/master/src/from_markdown.js.
 *
 * It deviates significantly from the original because
 * prosemirror-markdown supports converting a markdown-it AST instead of a
 * HAST one. It also adds sourcemap attributes automatically to every
 * ProseMirror node and mark created during the conversion process.
 *
 * We recommend becoming familiar with HAST and ProseMirror documents to
 * facilitate the understanding of the behavior implemented in this module.
 *
 * Unist syntax tree documentation: https://github.com/syntax-tree/unist
 * Hast tree documentation: https://github.com/syntax-tree/hast
 * ProseMirror document documentation: https://prosemirror.net/docs/ref/#model.Document_Structure
 * visit-parents documentation: https://github.com/syntax-tree/unist-util-visit-parents
 */

import { Mark } from 'prosemirror-model';
import { visitParents, SKIP } from 'unist-util-visit-parents';
import { isFunction, isString, noop, mapValues } from 'lodash';

// Shared empty attributes object; it is passed as the attributes of the root node when that node is opened.
const NO_ATTRIBUTES = {};

/**
 * Attempts to combine two ProseMirror text nodes into a single one.
 *
 * The merge only happens when both arguments are text nodes and
 * both carry exactly the same set of marks.
 *
 * @param {ProseMirror.Node} a first ProseMirror node
 * @param {ProseMirror.Node} b second ProseMirror node
 * @returns {model.Node|null} A new text node whose text is the text of `a`
 * followed by the text of `b`, or null when the nodes cannot be merged.
 */
function maybeMerge(a, b) {
  const bothAreText = Boolean(a?.isText && b?.isText);

  if (!bothAreText || !Mark.sameSet(a.marks, b.marks)) {
    return null;
  }

  return a.withText(a.text + b.text);
}

/**
 * Creates an object that contains sourcemap position information
 * included in a Hast Abstract Syntax Tree. The Content
 * Editor uses the sourcemap information to restore the
 * original source of a node when the user doesn’t change it.
 *
 * Unist syntax tree documentation: https://github.com/syntax-tree/unist
 * Hast node documentation: https://github.com/syntax-tree/hast
 *
 * @param {HastNode} hastNode A Hast node
 * @param {String} markdown Markdown source file
 *
 * @returns {Object} An empty object when the node’s position doesn’t provide
 * a numeric offset for both its start and its end. Otherwise, an object with
 * the following attributes:
 *
 * - sourceMapKey: A string with the shape `startOffset:endOffset` that
 * identifies the position of the hast node in the Markdown source file.
 * - sourceMarkdown: The node’s original Markdown source extracted
 * from the Markdown source file.
 */
function createSourceMapAttributes(hastNode, markdown) {
  const { start, end } = hastNode.position ?? {};
  const startOffset = start?.offset;
  const endOffset = end?.offset;

  // Both offsets are required. With a missing offset, `substring` would return
  // text that doesn’t belong to the node (the whole document when both are missing).
  if (typeof startOffset !== 'number' || typeof endOffset !== 'number') {
    return {};
  }

  return {
    sourceMapKey: `${startOffset}:${endOffset}`,
    sourceMarkdown: markdown.substring(startOffset, endOffset),
  };
}

/**
 * Builds the `getAttrs` function used to compute the attributes
 * of a ProseMirror node or mark from a hast node.
 *
 * @param {Object} params Parameters
 * @param {String} params.markdown Markdown source from which the AST was generated
 * @param {Object} params.attributeTransformer An object that allows transforming
 * the attributes computed for a node.
 * @param {Function} params.attributeTransformer.transform A function invoked once
 * per attribute with the attribute name, the attribute value, and the hast node.
 * Its return value replaces the attribute value.
 * @returns A `getAttrs` function
 */
const getAttrsFactory = ({ attributeTransformer, markdown }) => {
  /**
   * Computes the attributes of a ProseMirror node from a hast node.
   *
   * The attributes come from the `getAttrs` function of the factory spec
   * (when the spec provides one); every one of them goes through the
   * attribute transformer. Sourcemap attributes are included as well and
   * an attribute of the spec with the same name takes precedence over them.
   *
   * @param {Object} proseMirrorNodeSpec ProseMirror node factory spec
   * @param {Object} hastNode A hast node
   * @param {*} hastParents Value forwarded as the second argument of the spec’s `getAttrs`
   * @returns An object that contains a ProseMirror node’s attributes
   */
  function getAttrs(proseMirrorNodeSpec, hastNode, hastParents) {
    const specGetAttrs = proseMirrorNodeSpec.getAttrs;
    const specAttributes = isFunction(specGetAttrs)
      ? specGetAttrs(hastNode, hastParents, markdown)
      : {};
    const ownAttributes = { ...specAttributes };
    const { transform } = attributeTransformer;

    const sourceMapAttributes = createSourceMapAttributes(hastNode, markdown);
    const transformedAttributes = mapValues(ownAttributes, (value, name) =>
      transform(name, value, hastNode),
    );

    return { ...sourceMapAttributes, ...transformedAttributes };
  }

  return getAttrs;
};

/**
 * Keeps track of the Hast -> ProseMirror conversion process.
 *
 * Node state
 *
 * Opened nodes and marks are stored in a stack. `openNode` pushes a node
 * entry; `closeNode` pops the last entry, creates the ProseMirror node, and
 * appends it to the content of the closest node entry left in the stack.
 *
 * For example, given a Hast tree with three levels of nodes:
 *
 * - blockquote
 *   - paragraph
 *     - text
 *
 * the stack contains these entries after both block nodes are opened:
 *
 * 2. paragraph
 * 1. blockquote
 *
 * The text is added to the content of "paragraph" (the top node). Calling
 * `closeNode` folds "paragraph" into "blockquote". A 2nd call closes
 * "blockquote".
 *
 * Mark state
 *
 * `openMark` pushes a mark entry onto the stack and adds the mark to the set
 * of active marks. Text nodes, and inline nodes, receive the marks that are
 * active when they are created. `closeMark` removes the mark from the active
 * set. Consecutive text nodes with the same set of marks are merged.
 */
class HastToProseMirrorConverterState {
  constructor() {
    this.stack = [];
    this.marks = Mark.none;
  }

  /**
   * Gets the element at the top of the stack, that is,
   * the last element pushed (undefined when the stack is empty)
   */
  get top() {
    const { stack } = this;

    return stack[stack.length - 1];
  }

  /**
   * Gets the node entry closest to the top of the stack
   * (null when the stack doesn’t contain node entries)
   */
  get topNode() {
    return this.findInStack(({ type }) => type === 'node');
  }

  /**
   * Detects if the stack is empty
   */
  get empty() {
    return this.stack.length === 0;
  }

  /**
   * Searches the stack from top to bottom.
   *
   * @param {Function} fn Predicate invoked with every stack entry. An entry
   * matches only when the predicate returns exactly `true`.
   * @returns The first matching entry or null
   */
  findInStack(fn) {
    let index = this.stack.length;

    while (index > 0) {
      index -= 1;

      const entry = this.stack[index];

      if (fn(entry) === true) {
        return entry;
      }
    }

    return null;
  }

  /**
   * Creates a text node with the active marks and adds it
   * to the content of the top node in the stack.
   *
   * If the last child of the top node is a text node with
   * the same set of marks, both text nodes are merged into
   * one instead. Empty text is ignored.
   *
   * @param {ProseMirror.Schema} schema ProseMirror schema
   * @param {String} text Text
   * @returns
   */
  addText(schema, text) {
    if (!text) {
      return;
    }

    const siblings = this.topNode?.content;
    const lastIndex = siblings.length - 1;
    const previous = siblings[lastIndex];
    const textNode = schema.text(text, this.marks);
    const combined = maybeMerge(previous, textNode);

    if (previous && combined) {
      siblings[lastIndex] = combined;
      return;
    }

    siblings.push(textNode);
  }

  /**
   * Creates a mark, pushes it onto the stack, and adds it
   * to the set of active marks.
   * @param {https://prosemirror.net/docs/ref/#model.MarkType} schemaType Mark schema type
   * @param {https://github.com/syntax-tree/hast#nodes} hastNode AST node that the mark is based on
   * @param {Object} attrs Mark attributes
   * @param {Object} factorySpec Specifications on how the mark should be created
   */
  openMark(schemaType, hastNode, attrs, factorySpec) {
    const mark = schemaType.create(attrs);
    const entry = { type: 'mark', mark, attrs, hastNode, factorySpec };

    this.stack.push(entry);
    this.marks = mark.addToSet(this.marks);
  }

  /**
   * Pops the mark at the top of the stack and removes
   * it from the set of active marks.
   */
  closeMark() {
    const closed = this.stack.pop();

    this.marks = closed.mark.removeFromSet(this.marks);
  }

  /**
   * Pushes a node entry, with empty content, onto the stack.
   *
   * @param {https://prosemirror.net/docs/ref/#model.NodeType} schemaType ProseMirror Schema for the node
   * @param {https://github.com/syntax-tree/hast#nodes} hastNode Hast node from which the ProseMirror node will be created
   * @param {*} attrs Node’s attributes
   * @param {*} factorySpec The factory spec used to create the node factory
   */
  openNode(schemaType, hastNode, attrs, factorySpec) {
    const entry = { type: 'node', schemaType, attrs, content: [], hastNode, factorySpec };

    this.stack.push(entry);
  }

  /**
   * Pops the entry at the top of the stack, creates its
   * ProseMirror node, and appends the node to the content
   * of the closest node entry left in the stack.
   *
   * @returns The node created, or null when it couldn’t be created
   */
  closeNode() {
    const { schemaType, attrs, content, factorySpec } = this.stack.pop();
    // Only inline nodes receive the active marks
    const carriesMarks = factorySpec.type === 'inline' && this.marks.length;
    let node;

    if (carriesMarks) {
      node = schemaType.createAndFill(attrs, content, this.marks);
    } else {
      node = schemaType.createAndFill(attrs, content);
    }

    if (node) {
      if (!this.empty) {
        this.topNode.content.push(node);
      }

      return node;
    }

    /*
    `createAndFill` returns null when the `content` passed as a parameter doesn’t conform
    with the document schema. Only the most likely scenario is handled here: a paragraph
    inside another paragraph.

    This scenario happens when the converter encounters a mark wrapping one or more paragraphs.
    In this case, the converter will wrap the mark in a paragraph as well because ProseMirror does
    not allow marks wrapping block nodes or being direct children of certain nodes like the root nodes
    or list items. The children are moved to the next node in the stack; in any other
    scenario the content is discarded.
    */
    const wrapsParagraphs =
      schemaType.name === 'paragraph' && content.some((child) => child.type.name === 'paragraph');

    if (wrapsParagraphs) {
      this.topNode.content.push(...content);
    }

    return null;
  }

  /**
   * Closes the entries at the top of the stack until the
   * top entry is the one created from `hastNode`.
   *
   * @param {https://github.com/syntax-tree/hast#nodes} hastNode Hast node of the entry that should stay open
   */
  closeUntil(hastNode) {
    while (this.top?.hastNode !== hastNode) {
      const { type } = this.top;

      if (type === 'node') {
        this.closeNode();
      } else {
        this.closeMark();
      }
    }
  }

  /**
   * Closes every entry left in the stack.
   *
   * @returns The result of closing the last node entry, which is
   * the document when the root node is at the bottom of the stack
   */
  buildDoc() {
    let doc;

    do {
      if (this.top.type !== 'node') {
        this.closeMark();
      } else {
        doc = this.closeNode();
      }
    } while (this.stack.length > 0);

    return doc;
  }
}

/**
 * Create ProseMirror node/mark factories based on one or more
 * factory specifications.
 *
 * A factory is a copy of its specification with an extra `handle` function
 * that updates the conversion state when a matching hast node is visited.
 * Factories for the `root` and `text` hast nodes are always included.
 *
 * Note: Read `createProseMirrorDocFromMdastTree` documentation
 * for instructions about how to define these specifications.
 *
 * @param {model.ProseMirrorSchema} schema A ProseMirror schema used to create the
 * ProseMirror nodes and marks.
 * @param {Object} proseMirrorFactorySpecs ProseMirror nodes factory specifications.
 * @param {Object} attributeTransformer Attribute transformer forwarded to `getAttrsFactory`
 * @param {String} markdown Markdown source file’s content
 *
 * @returns An object that contains ProseMirror node factories
 * @throws {RangeError} When the `type` of a specification is not
 * 'block', 'inline', 'mark', or 'ignore'
 */
const createProseMirrorNodeFactories = (
  schema,
  proseMirrorFactorySpecs,
  attributeTransformer,
  markdown,
) => {
  const getAttrs = getAttrsFactory({ attributeTransformer, markdown });
  const whitespaceOnly = /^\s+$/;

  const factories = {
    root: {
      selector: 'root',
      wrapInParagraph: true,
      handle: (state, hastNode) =>
        state.openNode(schema.topNodeType, hastNode, NO_ATTRIBUTES, factories.root),
    },
    text: {
      selector: 'text',
      handle: (state, hastNode, parent) => {
        // The lookup happens before `closeUntil` changes the stack
        const textProcessor = state.findInStack(({ factorySpec }) =>
          isFunction(factorySpec.processText),
        );
        const text = hastNode.value;

        if (whitespaceOnly.test(text)) {
          return;
        }

        state.closeUntil(parent);

        const content = textProcessor ? textProcessor.factorySpec.processText(text) : text;

        state.addText(schema, content);
      },
    },
  };

  Object.entries(proseMirrorFactorySpecs).forEach(([proseMirrorName, factorySpec]) => {
    const factory = { ...factorySpec };

    switch (factory.type) {
      case 'block': {
        factory.handle = (state, hastNode, parent) => {
          // Block node types are resolved every time a node is handled
          const nodeType = schema.nodeType(proseMirrorName);

          state.closeUntil(parent);

          const attrs = getAttrs(factory, hastNode, parent);

          state.openNode(nodeType, hastNode, attrs, factory);
        };
        break;
      }
      case 'inline': {
        const nodeType = schema.nodeType(proseMirrorName);

        factory.handle = (state, hastNode, parent) => {
          state.closeUntil(parent);

          const attrs = getAttrs(factory, hastNode, parent);

          state.openNode(nodeType, hastNode, attrs, factory);
          // Inline nodes do not have children therefore they are immediately closed
          state.closeNode();
        };
        break;
      }
      case 'mark': {
        const markType = schema.marks[proseMirrorName];

        factory.handle = (state, hastNode, parent) => {
          const attrs = getAttrs(factory, hastNode, parent);

          state.openMark(markType, hastNode, attrs, factory);
        };
        break;
      }
      case 'ignore': {
        factory.handle = noop;
        break;
      }
      default:
        throw new RangeError(
          `Unrecognized ProseMirror object type ${JSON.stringify(factorySpec.type)}`,
        );
    }

    factories[proseMirrorName] = factory;
  });

  return factories;
};

/**
 * Finds the first factory, in insertion order, whose selector matches a hast node.
 *
 * A function selector is invoked with the hast node and its ancestors. Any other
 * selector matches when it equals the `tagName` or the `type` of the hast node.
 *
 * @param {Object} hastNode A hast node
 * @param {Array} ancestors The hast node’s ancestors
 * @param {Object} factories Factories indexed by name
 * @returns The matching factory or undefined
 */
const findFactory = (hastNode, ancestors, factories) => {
  const matchesNode = ({ selector }) => {
    if (isFunction(selector)) {
      return selector(hastNode, ancestors);
    }

    return [hastNode.tagName, hastNode.type].includes(selector);
  };

  return Object.values(factories).find(matchesNode);
};

/**
 * Resolves the parent of a hast node.
 *
 * @param {Array} ancestors The hast node’s ancestors, ordered from the
 * outermost ancestor to the direct parent
 * @param {String} [parent] Tag name of the ancestor that should act as the parent
 * @returns The closest ancestor with the given tag name (undefined when there
 * is none) when `parent` is a string. Otherwise, the direct parent.
 */
const findParent = (ancestors, parent) => {
  if (isString(parent)) {
    // `Array.prototype.reverse` reverses in place. `ancestors` belongs to the caller
    // (it comes from the tree visitor), so a copy is reversed to leave it untouched.
    return [...ancestors].reverse().find((ancestor) => ancestor.tagName === parent);
  }

  return ancestors[ancestors.length - 1];
};

/**
 * Resolves the position of a hast node, computing the missing boundary
 * of text nodes that only provide the start or the end of their position.
 *
 * The node’s own `position` is returned unchanged (it can be undefined) when
 * the node is not a text node, or when its position has both or none
 * of its boundaries.
 *
 * @param {Object} textNode A hast node
 * @returns {Object|undefined} The node’s position
 */
const resolveNodePosition = (textNode) => {
  const { position, value, type } = textNode;
  // `position` is optional in unist nodes, therefore it is accessed defensively
  const hasStart = Boolean(position?.start);
  const hasEnd = Boolean(position?.end);

  if (type !== 'text' || hasStart === hasEnd) {
    return position;
  }

  // NOTE(review): unist describes `end` as the point right after the node, which
  // suggests a span of `value.length`. Confirm before changing it.
  const span = value.length - 1;

  // NOTE(review): unist points use `line` instead of `row`. Confirm if any
  // consumer depends on `row` before renaming it.
  if (hasStart) {
    const { start } = position;

    return {
      start,
      end: {
        row: start.row,
        column: start.column + span,
        offset: start.offset + span,
      },
    };
  }

  const { end } = position;

  return {
    start: {
      row: end.row,
      column: end.column - span,
      offset: end.offset - span,
    },
    end,
  };
};

/**
 * Creates a copy of a list of hast nodes without the text
 * nodes that only contain whitespace characters.
 *
 * @param {Array} nodes Hast nodes
 * @returns {Array} A new array; the array received is not modified
 */
const removeEmptyTextNodes = (nodes) =>
  nodes.filter(({ type, value }) => !(type === 'text' && /^\s+$/.test(value)));

/**
 * Wraps every run of consecutive inline hast nodes in a paragraph element.
 *
 * Text nodes and elements whose tag name is listed in `wrappableTags` are
 * considered inline. Comments and the rest of the elements are kept as they
 * are and end the current run.
 *
 * The wrappers are hast `p` elements flagged with `properties.wrapper`. The
 * position of a wrapper starts at the position resolved for its first child
 * and ends where its last child ends.
 *
 * @param {Array} nodes Hast nodes
 * @param {Array} wrappableTags Tag names of the elements that should be wrapped
 * @returns {Array} A new list of hast nodes
 */
const wrapInlineElements = (nodes, wrappableTags) => {
  const wrapped = [];

  for (const child of nodes) {
    const isInline =
      child.type !== 'comment' &&
      (child.type === 'text' || wrappableTags.includes(child.tagName));
    const previous = wrapped[wrapped.length - 1];

    if (!isInline) {
      wrapped.push(child);
    } else if (previous?.properties?.wrapper) {
      // The wrapper’s position can be the position object of its first child.
      // A new object is created, instead of assigning `end` in place, to avoid
      // changing the position of that child. Wrappers without position stay
      // that way, and a child without position leaves the end undefined.
      if (previous.position) {
        previous.position = { ...previous.position, end: child.position?.end };
      }

      previous.children.push(child);
    } else {
      wrapped.push({
        type: 'element',
        tagName: 'p',
        position: resolveNodePosition(child),
        children: [child],
        properties: { wrapper: true },
      });
    }
  }

  return wrapped;
};

/**
 * Converts a Hast AST to a ProseMirror document based on a series
 * of specifications that describe how to map all the nodes of the former
 * to ProseMirror nodes or marks.
 *
 * The specification object describes how to map a Hast node to a ProseMirror node or mark.
 * The converter skips a Hast node, together with its descendants, when it doesn’t
 * find a specification for it while traversing the AST.
 *
 * The object should have the following shape:
 *
 * {
 *   [ProseMirrorNodeOrMarkName]: {
 *     type: 'block' | 'inline' | 'mark' | 'ignore',
 *     selector: String | hastNode -> Boolean,
 *     ...configurationOptions
 *   }
 * }
 *
 * Where each property in the object represents a HAST node with a given tag name, for example:
 *
 *  {
 *    horizontalRule: {
 *      type: 'block',
 *      selector: 'hr',
 *    },
 *    heading: {
 *      type: 'block',
 *      selector: (hastNode) => ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'].includes(hastNode.tagName),
 *    },
 *    bold: {
 *      type: 'mark',
 *      selector: (hastNode) => ['b', 'strong'].includes(hastNode.tagName),
 *    },
 *    // etc
 *  }
 *
 *
 * Configuration options
 * ----------------------
 *
 * You can customize the conversion process for every node or mark
 * setting the following properties in the specification object:
 *
 * **type**
 *
 * The `type` property should have one of the following four values:
 *
 * 1. "block": A ProseMirror node that contains one or more children.
 * 2. "inline": A ProseMirror node that doesn’t contain any children although
 *    it can have inline content like an image or a mention object.
 * 3. "mark": A ProseMirror mark.
 * 4. "ignore": A hast node that should be ignored and won’t be mapped to a
 *     ProseMirror node.
 *
 * **selector**
 *
 * The `selector` property matches a HastNode to a ProseMirror node or
 * Mark. If you assign a string value to this property, the converter
 * will match the first hast node with a `tagName` or `type` property
 * that equals the string value.
 *
 * If you assign a function, the converter will invoke the function with
 * the hast node and its ancestors. The function should return `true`
 * if the hastNode matches the custom criteria implemented in the function
 *
 * **getAttrs**
 *
 * Computes a ProseMirror node or mark attributes. The converter will invoke
 * `getAttrs` with the following parameters:
 *
 * 1. hastNode: The hast node
 * 2. parent: The hast node’s parent as resolved by the converter
 *    (see the **parent** option below)
 * 3. source: Markdown source file’s content
 *
 * **wrapInParagraph**
 *
 * This property only applies to block nodes. If a block node contains inline
 * elements like text, images, links, etc, the converter will wrap those inline
 * elements in a paragraph. This is useful for ProseMirror block
 * nodes that don’t allow text directly such as list items and tables.
 *
 * **processText**
 *
 * This property only applies to block nodes. If a block node contains text,
 * it allows applying a processing function to that text. This is useful when
 * you can transform the text node, i.e trim(), substring(), etc.
 *
 * **parent**
 *
 * Specifies what is the node’s parent. This is useful when the node’s parent is not
 * its direct ancestor in Abstract Syntax Tree. For example, imagine that you want
 * to make <tr> elements a direct children of tables and skip `<thead>` and `<tbody>`
 * altogether.
 *
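 * @param {Object} params Conversion parameters
 * @param {model.Document_Schema} params.schema A ProseMirror schema that specifies the shape
 * of the ProseMirror document.
 * @param {Object} params.factorySpecs Factory specifications as described above
 * @param {Array} params.wrappableTags Tag names of the inline elements that are wrapped in a
 * paragraph when their parent’s specification sets `wrapInParagraph`
 * @param {Hast} params.tree https://github.com/syntax-tree/hast
 * @param {Object} params.attributeTransformer An object with a `transform` function that is
 * applied to the attributes computed by `getAttrs`
 * @param {String} params.markdown Markdown source from which the Hast tree was generated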
 *
 * @returns A ProseMirror document
 */
export const createProseMirrorDocFromMdastTree = ({
  schema,
  factorySpecs,
  wrappableTags,
  tree,
  attributeTransformer,
  markdown,
}) => {
  const factories = createProseMirrorNodeFactories(
    schema,
    factorySpecs,
    attributeTransformer,
    markdown,
  );
  const state = new HastToProseMirrorConverterState();

  // Visitor invoked for every hast node of the tree, along with its ancestors
  const convertHastNode = (hastNode, ancestors) => {
    const factory = findFactory(hastNode, ancestors, factories);

    // Nodes without a factory are not converted
    if (!factory) {
      return SKIP;
    }

    const parent = findParent(ancestors, factory.parent);

    if (factory.wrapInParagraph) {
      const meaningfulChildren = removeEmptyTextNodes(hastNode.children);

      /**
       * Modifying parameters is a bad practice. For performance reasons,
       * the author of the unist-util-visit-parents function recommends
       * modifying nodes in place to avoid traversing the Abstract Syntax
       * Tree more than once
       */
      // eslint-disable-next-line no-param-reassign
      hastNode.children = wrapInlineElements(meaningfulChildren, wrappableTags);
    }

    factory.handle(state, hastNode, parent);

    return true;
  };

  visitParents(tree, convertHastNode);

  return state.buildDoc();
};