Huge News!Announcing our $40M Series B led by Abstract Ventures.Learn More
Socket
Sign inDemoInstall
Socket

datocms-html-to-structured-text

Package Overview
Dependencies
Maintainers
1
Versions
46
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

datocms-html-to-structured-text - npm Package Compare versions

Comparing version 0.1.0-alpha.26 to 0.1.0-alpha.27

dist/lib/handlers.js

56

dist/lib/index.js
"use strict";
/* eslint-disable @typescript-eslint/ban-ts-comment */
// @ts-nocheck
// Object-merge helper (appears to be TypeScript compiler output — confirm against the build).
// Copies the own enumerable properties of every argument after the first onto
// the first argument and returns that first argument.
// The `(this && this.__assign) ||` guard reuses a helper already present on `this`, if any.
var __assign = (this && this.__assign) || function () {
// On first call, rebind __assign to Object.assign, or to the manual fallback
// below when Object.assign is not available; later calls skip this wrapper.
__assign = Object.assign || function(t) {
// Start at index 1: arguments[0] is the target `t`.
for (var s, i = 1, n = arguments.length; i < n; i++) {
s = arguments[i];
// The hasOwnProperty check skips properties inherited through the prototype chain.
for (var p in s) if (Object.prototype.hasOwnProperty.call(s, p))
t[p] = s[p];
}
return t;
};
// Forward the original call (same `this`, same arguments) to the chosen implementation.
return __assign.apply(this, arguments);
};
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {

@@ -44,10 +55,13 @@ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }

Object.defineProperty(exports, "__esModule", { value: true });
exports.hastToDast = exports.parse5ToDast = exports.htmlToDast = void 0;
exports.visitChildren = exports.visitNode = exports.hastToStructuredText = exports.parse5ToStructuredText = exports.htmlToStructuredText = void 0;
// @ts-ignore
var rehype_minify_whitespace_1 = __importDefault(require("rehype-minify-whitespace"));
var visit_node_1 = __importDefault(require("./lib/visit-node"));
var handlers_1 = require("./lib/handlers");
var visit_node_1 = __importDefault(require("./visit-node"));
exports.visitNode = visit_node_1.default;
var visit_children_1 = __importDefault(require("./visit-children"));
exports.visitChildren = visit_children_1.default;
var handlers_1 = require("./handlers");
var hast_util_from_parse5_1 = __importDefault(require("hast-util-from-parse5"));
var hast_util_from_dom_1 = __importDefault(require("hast-util-from-dom"));
function htmlToDast(html, settings) {
function htmlToStructuredText(html, settings) {
if (settings === void 0) { settings = {}; }

@@ -58,12 +72,12 @@ return __awaiter(this, void 0, void 0, function () {

if (typeof DOMParser === 'undefined') {
throw new Error('DOMParser is not available. Consider using `parse5ToDast` instead!');
throw new Error('DOMParser is not available. Consider using `parse5ToStructuredText` instead!');
}
document = new DOMParser().parseFromString(html, 'text/html');
tree = hast_util_from_dom_1.default(document);
return [2 /*return*/, hastToDast(tree, settings)];
return [2 /*return*/, hastToStructuredText(tree, settings)];
});
});
}
exports.htmlToDast = htmlToDast;
function parse5ToDast(document, settings) {
exports.htmlToStructuredText = htmlToStructuredText;
function parse5ToStructuredText(document, settings) {
if (settings === void 0) { settings = {}; }

@@ -74,11 +88,11 @@ return __awaiter(this, void 0, void 0, function () {

tree = hast_util_from_parse5_1.default(document);
return [2 /*return*/, hastToDast(tree, settings)];
return [2 /*return*/, hastToStructuredText(tree, settings)];
});
});
}
exports.parse5ToDast = parse5ToDast;
function hastToDast(tree, settings) {
exports.parse5ToStructuredText = parse5ToStructuredText;
function hastToStructuredText(tree, settings) {
if (settings === void 0) { settings = {}; }
return __awaiter(this, void 0, void 0, function () {
var createNode;
var createNode, rootNode;
return __generator(this, function (_a) {

@@ -96,10 +110,18 @@ switch (_a.label) {

return [4 /*yield*/, visit_node_1.default(createNode, tree, {
parentNodeType: 'root',
parentNode: null,
name: 'root',
frozenBaseUrl: null,
wrapText: true,
defaultHandlers: handlers_1.handlers,
handlers: Object.assign({}, handlers_1.handlers, settings.handlers || {}),
wrapText: true,
shared: __assign({ baseUrl: null, baseUrlFound: false }, (settings.shared || {})),
})];
case 1: return [2 /*return*/, _a.sent()];
case 1:
rootNode = _a.sent();
if (rootNode) {
return [2 /*return*/, {
schema: 'dast',
document: rootNode,
}];
}
return [2 /*return*/, null];
}

@@ -109,3 +131,3 @@ });

}
exports.hastToDast = hastToDast;
exports.hastToStructuredText = hastToStructuredText;
//# sourceMappingURL=index.js.map

@@ -1,3 +0,6 @@

import { Root, CreateNodeFunction, HastRootNode } from './lib/types';
import { Root, CreateNodeFunction, HastRootNode } from './types';
import visitNode from './visit-node';
import visitChildren from './visit-children';
import parse5 from 'parse5';
import { Document } from 'datocms-structured-text-utils';
export declare type Settings = Partial<{

@@ -8,4 +11,5 @@ newlines: boolean;

}>;
export declare function htmlToDast(html: string, settings?: Settings): Promise<Root>;
export declare function parse5ToDast(document: parse5.Document, settings?: Settings): Promise<Root>;
export declare function hastToDast(tree: HastRootNode, settings?: Settings): Promise<Root>;
export declare function htmlToStructuredText(html: string, settings?: Settings): Promise<Root | null>;
export declare function parse5ToStructuredText(document: parse5.Document, settings?: Settings): Promise<Root | null>;
export declare function hastToStructuredText(tree: HastRootNode, settings?: Settings): Promise<Document | null>;
export { visitNode, visitChildren };
{
"name": "datocms-html-to-structured-text",
"version": "0.1.0-alpha.26",
"version": "0.1.0-alpha.27",
"description": "Convert HTML (or Hast syntax tree) to a valid DatoCMS Structured Text Dast document",

@@ -34,3 +34,3 @@ "keywords": [

"dependencies": {
"datocms-structured-text-utils": "^0.1.0-alpha.26",
"datocms-structured-text-utils": "^0.1.0-alpha.27",
"extend": "^3.0.2",

@@ -49,3 +49,3 @@ "hast-util-from-dom": "^3.0.0",

},
"gitHead": "c2e311dcb0ec61dcccca96cccd881760b379209b"
"gitHead": "9138f478aedb5acdfe26f9575009250b313f4389"
}
# `html-to-structured-text`
> TODO: description
Convert HTML (or [Hast](https://github.com/syntax-tree/hast) syntax tree) to a valid DatoCMS Structured Text Dast document.
Dast stands for Dato Abstract Syntax Tree.
## Usage
The main utility is `htmlToStructuredText` which takes a string of HTML and transforms it into a valid Dast.
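`htmlToStructuredText` returns a `Promise` that resolves with a Structured Text document of the form `{ schema: 'dast', document }`, or with `null` if the conversion produces no root node.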
```js
import { htmlToStructuredText } from 'html-to-structured-text';
const html = `
<article>
<h1>DatoCMS</h1>
<p>The most complete, user-friendly and performant Headless CMS.</p>
</article>
`;
htmlToStructuredText(html).then((structuredText) => {
console.log(structuredText);
});
```
const { toDast } = require('html-to-structured-text');
// TODO: DEMONSTRATE API
`htmlToStructuredText` is meant to be used in a browser environment.
In Node.js you can use the `parse5ToStructuredText` helper which instead takes a document generated with `parse5`.
```js
import parse5 from 'parse5';
import { parse5ToStructuredText } from 'html-to-structured-text';
parse5ToStructuredText(
parse5.parse(html, {
sourceCodeLocationInfo: true,
}),
).then((structuredText) => {
console.log(structuredText);
});
```
Internally, both utilities work on a Hast. Should you already have a Hast, you can use a third utility called `hastToStructuredText`.
## Valid Dast
Dast is a strict format that is compliant with DatoCMS' Structured Text records. As such the resulting document is generally a simplified, content-centric version of the input HTML.
When possible, the library relies on semantic HTML to generate a valid Dast document.
The `datocms-structured-text-utils` package provides a `validate` utility to validate a value to make sure that the resulting tree is compatible with DatoCMS' Structured Text field.
```js
import { validate } from 'datocms-structured-text-utils';
// ...
htmlToStructuredText(html).then((structuredText) => {
const { valid, message } = validate(structuredText);
if (!valid) {
throw new Error(message);
}
});
```
We recommend validating every Dast to avoid errors later when creating records.
## Advanced Usage
### Transforming Nodes
This library traverses a Hast tree and transforms supported nodes to Dast nodes. The transformation is done by working on a Hast node with a handler (async) function.
Handlers are associated with Hast nodes by `tagName`, or by `type` when `node.type !== 'element'`, and look as follows:
```js
import { visitChildren } from 'html-to-structured-text';
// Handler for the <p> tag.
async function p(createDastNode, hastNode, context) {
return createDastNode('paragraph', {
children: await visitChildren(createDastNode, hastNode, context),
});
}
```
Handlers can return a promise that resolves to a Dast node, to an array of Dast nodes, or to `undefined` to skip the current node.
To ensure that a valid Dast is generated the default handlers also check that the current `hastNode` is a valid Dast node for its parent and, if not, they ignore the current node and continue visiting its children.
Information about the parent Dast node type is available in `context.parentNodeType`.
Please take a look at the [default handlers implementation](./handlers.ts) for examples.
The default handlers are available on `context.defaultHandlers`.
### context
Every handler receives a `context` object that includes the following information:
```js
export interface GlobalContext {
// Whether the library has found a <base> tag or should not look further.
// See https://developer.mozilla.org/en-US/docs/Web/HTML/Element/base
baseUrlFound?: boolean;
// <base> tag url. This is used for resolving relative URLs.
baseUrl?: string;
}
export interface Context {
// The current parent Dast node type.
parentNodeType: NodeType;
// The parent Hast node.
parentNode: HastNode;
// A reference to the default handlers record (map).
defaultHandlers: Record<string, Handler<unknown>>;
// A reference to the current handlers - merged default + user handlers.
handlers: Record<string, Handler<unknown>>;
wrapText: boolean;
// Marks for span nodes.
marks?: Mark[];
// Prefix for language detection in code blocks.
// Detection is done on a class name eg class="language-html"
// Default is `language-`
codePrefix?: string;
// Properties in this object are available to every handler, as Context
// is not deeply cloned.
global: GlobalContext;
}
```
### Custom Handlers
It is possible to register custom handlers and override the default behavior via settings:
```js
import { paragraphHandler } from './customHandlers';
htmlToStructuredText(html, {
handlers: {
p: paragraphHandler,
},
}).then((structuredText) => {
console.log(structuredText);
});
```
It is **highly encouraged** to validate the Dast when using custom handlers as handlers are responsible for dictating valid parent-children relationships and therefore generating a tree that is compliant with DatoCMS' Structured Text.
## preprocessing
Because of the strictness of the Dast spec, it is possible that some semantics or elements might be lost during the transformation.
To improve the final result, you might want to modify the Hast before it is transformed to Dast with the `preprocess` hook.
```js
import { findAll } from 'unist-utils-core';
const html = `
<p>convert this to an h1</p>
`;
htmlToStructuredText(html, {
preprocess: (tree) => {
// Transform <p> to <h1>
findAll(tree, (node) => {
if (node.type === 'element' && node.tagName === 'p') {
node.tagName = 'h1';
}
});
},
}).then((structuredText) => {
console.log(structuredText);
});
```
### Examples
<details>
<summary>Split a node that contains an image.</summary>
In Dast, images can be presented as `Block` nodes, but these are not allowed inside of `ListItem` nodes (ul/ol lists). In this example we will split the list into 3 pieces and lift up the image.
The same approach can be used to split other types of branches and lift up nodes to become root nodes.
```js
import { findAll } from 'unist-utils-core';
const html = `
<ul>
<li>item 1</li>
<li><div><img src="./img.png" alt></div></li>
<li>item 2</li>
</ul>
`;
const dast = await htmlToStructuredText(html, {
preprocess: (tree) => {
const liftedImages = new WeakSet();
const body = find(tree, (node) => node.tagName === 'body');
visit(body, (node, index, parents) => {
if (
!node ||
node.tagName !== 'img' ||
liftedImages.has(node) ||
parents.length === 1 // is a top level img
) {
return;
}
// remove image
const imgParent = parents[parents.length - 1];
imgParent.children.splice(index, 1);
let i = parents.length;
let splitChildrenIndex = index;
let childrenAfterSplitPoint = [];
while (--i > 0) {
// Example: i == 2
// [ 'body', 'div', 'h1' ]
const /* h1 */ parent = parents[i];
const /* div */ parentsParent = parents[i - 1];
// Delete the siblings after the image and save them in a variable
childrenAfterSplitPoint /* [ 'h1.2' ] */ = parent.children.splice(
splitChildrenIndex,
);
// parent.children is now == [ 'h1.1' ]
// parentsParent.children = [ 'h1' ]
splitChildrenIndex = parentsParent.children.indexOf(parent);
// splitChildrenIndex = 0
// If we reached the 'div' add the image's node
if (i === 1) {
splitChildrenIndex += 1;
parentsParent.children.splice(splitChildrenIndex, 0, node);
liftedImages.add(node);
}
splitChildrenIndex += 1;
// Create a new branch with childrenAfterSplitPoint if we have any i.e.
// <h1>h1.2</h1>
if (childrenAfterSplitPoint.length > 0) {
parentsParent.children.splice(splitChildrenIndex, 0, {
...parent,
children: childrenAfterSplitPoint,
});
}
// Remove the parent if empty
if (parent.children.length === 0) {
splitChildrenIndex -= 1;
parentsParent.children.splice(splitChildrenIndex, 1);
}
}
});
},
handlers: {
img: async (createNode, node, context) => {
// In a real scenario you would upload the image to Dato and get back an id.
const item = '123';
return createNode('block', {
item,
});
},
},
});
```
</details>
<details>
<summary>Lift up an image node</summary>
```js
const html = `
<ul>
<li>item 1</li>
<li><div><img src="./img.png" alt>item 2</div></li>
<li>item 3</li>
</ul>
`;
const dast = await htmlToStructuredText(html, {
preprocess: (tree) => {
findAll(tree, (node, index, parent) => {
if (node.tagName === 'img') {
// Add the image to the root's children.
tree.children.push(node);
// remove the image from the parent's children array.
parent.children.splice(index, 1);
return;
}
});
},
handlers: {
img: async (createNode, node, context) => {
// In a real scenario you would upload the image to Dato and get back an id.
const item = '123';
return createNode('block', {
item,
});
},
},
});
```
</details>
### Utilities
To work with Hast and Dast trees we recommend using the [unist-utils-core](https://www.npmjs.com/package/unist-utils-core) library.
## License
MIT

Sorry, the diff of this file is not supported yet

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc