Comparing version 0.1.0 to 0.2.0
@@ -148,6 +148,2 @@ (function() { | ||
// slash = if sigil is '<' then '' else '/' | ||
// return "<#{tagname}#{slash}>" if atxt is '' | ||
// return "<#{tagname}#{atxt}#{slash}>" | ||
//=========================================================================================================== | ||
@@ -154,0 +150,0 @@ // PARSING |
@@ -1,7 +0,100 @@ | ||
// Generated by CoffeeScript 2.5.0 | ||
(function() { | ||
'use strict'; | ||
var CND, alert, badge, cast, debug, echo, help, hyphenator, info, isa, log, rpr, type_of, types, urge, validate, warn, whisper; | ||
//########################################################################################################### | ||
CND = require('cnd'); | ||
rpr = CND.rpr; | ||
badge = 'INTERTEXT/HYPHENATION'; | ||
log = CND.get_logger('plain', badge); | ||
info = CND.get_logger('info', badge); | ||
whisper = CND.get_logger('whisper', badge); | ||
alert = CND.get_logger('alert', badge); | ||
debug = CND.get_logger('debug', badge); | ||
warn = CND.get_logger('warn', badge); | ||
help = CND.get_logger('help', badge); | ||
urge = CND.get_logger('urge', badge); | ||
echo = CND.echo.bind(CND); | ||
//........................................................................................................... | ||
types = require('./types'); | ||
({isa, validate, cast, type_of} = types); | ||
hyphenator = null; | ||
//----------------------------------------------------------------------------------------------------------- | ||
/* thx to https://stackoverflow.com/a/881111/7568091, https://jsperf.com/performance-of-match-vs-split */
// The soft hyphen (U+00AD), once as a one-character string and once as a global pattern.
this.soft_hyphen_chr = '\u00ad';
this.soft_hyphen_pattern = /\u00ad/g;

/**
 * Count the soft hyphens (U+00AD) contained in `text`.
 *
 * Splitting on the soft hyphen yields one more piece than there are separators, hence the `- 1`.
 *
 * @param {string} text - the text to inspect
 * @returns {number} how many times `this.soft_hyphen_chr` occurs in `text`
 */
this.count_soft_hyphens = function(text) {
  const pieces = text.split(this.soft_hyphen_chr);
  return pieces.length - 1;
};
/**
 * Make soft hyphens visible by replacing every U+00AD in `text` with `replacement`.
 *
 * @param {string} text - the text that may contain soft hyphens
 * @param {string} [replacement='-'] - what to put in place of each soft hyphen
 * @returns {string} a copy of `text` with all matches of `this.soft_hyphen_pattern` replaced
 */
this.reveal_hyphens = function(text, replacement = '-') {
  const revealed = text.replace(this.soft_hyphen_pattern, replacement);
  return revealed;
};
/**
 * Hyphenate `text` with the module-wide hyphenation function.
 *
 * The hyphenation function is created on first use via `this.new_hyphenator()` and kept in the
 * module-level variable `hyphenator` so that later calls reuse it.
 *
 * @param {string} text - the text to hyphenate
 * @returns {*} whatever the hyphenation function returns for `text`
 */
this.hyphenate = function(text) {
  if (hyphenator == null) {
    hyphenator = this.new_hyphenator();
  }
  return hyphenator(text);
};
//----------------------------------------------------------------------------------------------------------- | ||
/**
 * Build a hyphenation function for US English using Hyphenopoly
 * (https://github.com/mnater/Hyphenopoly; for the settings see docs > Setup.md over there).
 *
 * According to the original note, `config()` returns the hyphenation function itself when `require`
 * contains a single element and a map from language codes to functions otherwise; since exactly one
 * language is requested here, the result is validated to be a function before it is handed back.
 *
 * @returns {Function} the value returned by `H9Y.config()`, validated with `validate.function()`
 */
this.new_hyphenator = function() {
  const H9Y = require('hyphenopoly/hyphenopoly.module');
  // Options left disabled in the original:
  //   hyphen: '\u00ad'
  //   exceptions: { "de": "Algo-rithmus", "global": "Silben-trennung" }
  const settings = {
    sync: true,
    require: ['en-us'], // [ "de", "en-us"],
    // allow orphans
    orphanControl: 1,
    // all, auto, hyphen; `all` inserts ZWSP after existing hyphen
    compound: 'auto',
    // if true, transforms text to some kind of Unicode normal form
    normalize: false,
    mixedCase: true,
    minWordLength: 4,
    // also available as per-language setting
    leftmin: 2,
    rightmin: 2
  };
  const hyphenate_text = H9Y.config(settings);
  validate.function(hyphenate_text);
  return hyphenate_text;
};
// switch ( type = type_of _hyphenators ) | ||
// when 'function' then hyphenators = new Map(); hyphenators.set 'en-us', _hyphenators | ||
// when 'map' then null | ||
// else throw new Error "^3464^ unknown hyphenators type #{rpr type}" | ||
// return hyphenators.get 'en-us' | ||
/* | ||
collection of words that are not satisfactorily hyphenated | ||
to be added to an exceptions dictionary | ||
process | ||
su-per-cal-ifrag-ilis-tic | ||
*/ | ||
}).call(this); | ||
//# sourceMappingURL=hyphenation.js.map |
@@ -45,2 +45,9 @@ (function() { | ||
/* | ||
#........................................................................................................... | ||
_format = require 'number-format.js' | ||
format_float = ( x ) -> _format '#,##0.000', x | ||
format_integer = ( x ) -> _format '#,##0.', x | ||
format_as_percentage = ( x ) -> _format '#,##0.00', x * 100 | ||
*/ | ||
//=========================================================================================================== | ||
@@ -53,3 +60,2 @@ | ||
class Intertext extends Multimix { | ||
// @include ( require './outliner.mixin' ), { overwrite: false, } | ||
// @extend MAIN, { overwrite: false, } | ||
@@ -61,2 +67,4 @@ | ||
this.HTML = require('./html'); | ||
this.HYPH = require('./hyphenation'); | ||
this.SLABS = require('./slabs'); | ||
if (target != null) { | ||
@@ -63,0 +71,0 @@ this.export(target); |
@@ -861,2 +861,25 @@ (function() { | ||
[ | ||
"<p>here and<br>there</p>", | ||
[ | ||
{ | ||
"$key": "<p" | ||
}, | ||
{ | ||
"text": "here and", | ||
"$key": "^text" | ||
}, | ||
{ | ||
"$key": "<br" | ||
}, | ||
{ | ||
"text": "there", | ||
"$key": "^text" | ||
}, | ||
{ | ||
"$key": ">p" | ||
} | ||
], | ||
null | ||
], | ||
[ | ||
"<p>here and<br/>there</p>", | ||
@@ -1347,2 +1370,63 @@ [ | ||
//----------------------------------------------------------------------------------------------------------- | ||
// Demo: turn a sample HTML text into datoms, print every datom, print a separator line, then print the
// datoms serialized back to HTML as one string.
// `T` is not used in the body; `done` is called once all output has been written.
// NOTE(review): `HTML`, `echo` and `jr` are defined outside this block; `jr` presumably serializes a value
// to JSON — confirm against the file header.
this["demo"] = function(T, done) { | |
var d, datoms, i, len, ref, text; | |
text = `<!DOCTYPE html> | |
<h1><strong>CHAPTER VI.</strong> <name ref=hd553>Humpty Dumpty</h1> | |
<p id=p227>However, the egg only got larger and larger, and <em>more and more human</em>:<br> | |
when she had come within a few yards of it, she saw that it had eyes and a nose and mouth; and when she | |
had come close to it, she saw clearly that it was <name ref=hd556>HUMPTY DUMPTY</name> himself. ‘It can’t | |
be anybody else!’ she said to herself.<br/> | |
‘I’m as certain of it, as if his name were written all over his face.’ | |
`; | |
// Parse once; `ref` and `datoms` both hold the same return value of `html_as_datoms()`.
ref = datoms = HTML.html_as_datoms(text); | |
// Print each datom on its own.
for (i = 0, len = ref.length; i < len; i++) { | |
d = ref[i]; | |
echo(jr(d)); | |
} | |
// Separator line of 108 hyphens between the two outputs.
echo('-'.repeat(108)); | |
// Convert every datom back with `datom_as_html()` and print the results joined without a delimiter.
echo(((function() { | |
var j, len1, results; | |
results = []; | |
for (j = 0, len1 = datoms.length; j < len1; j++) { | |
d = datoms[j]; | |
results.push(HTML.datom_as_html(d)); | |
} | |
return results; | |
})()).join('')); | |
//......................................................................................................... | |
done(); | |
return null; | |
}; | |
//----------------------------------------------------------------------------------------------------------- | ||
// Demo: same round trip as the "demo" case, but the HTML is handed to `html_as_datoms()` as a `Buffer`
// instead of a string.
// `T` is not used in the body; `done` is called once all output has been written.
// NOTE(review): `HTML`, `echo`, `debug` and `jr` are defined outside this block; `jr` presumably serializes
// a value to JSON — confirm against the file header.
this["demo (buffer)"] = function(T, done) { | |
var buffer, d, datoms, i, len, ref, text; | |
text = `<!DOCTYPE html> | |
<h1><strong>CHAPTER VI.</strong> <name ref=hd553>Humpty Dumpty</h1>`; | |
// Wrap the text in a Buffer and log it before parsing.
buffer = Buffer.from(text); | |
debug('^80009^', buffer); | |
// Parse once; `ref` and `datoms` both hold the same return value of `html_as_datoms()`.
ref = datoms = HTML.html_as_datoms(buffer); | |
// Print each datom on its own.
for (i = 0, len = ref.length; i < len; i++) { | |
d = ref[i]; | |
echo(jr(d)); | |
} | |
// Separator line of 108 hyphens between the two outputs.
echo('-'.repeat(108)); | |
// Convert every datom back with `datom_as_html()` and print the results joined without a delimiter.
echo(((function() { | |
var j, len1, results; | |
results = []; | |
for (j = 0, len1 = datoms.length; j < len1; j++) { | |
d = datoms[j]; | |
results.push(HTML.datom_as_html(d)); | |
} | |
return results; | |
})()).join('')); | |
//......................................................................................................... | |
done(); | |
return null; | |
}; | |
//########################################################################################################### | ||
@@ -1352,4 +1436,6 @@ if (module === require.main) { | ||
// await @_demo() | ||
test(this); | ||
return help('ok'); | ||
// test @ | ||
help('ok'); | ||
// test @[ "demo" ] | ||
return test(this["demo (buffer)"]); | ||
})(); | ||
@@ -1356,0 +1442,0 @@ } |
{ | ||
"name": "intertext", | ||
"version": "0.1.0", | ||
"version": "0.2.0", | ||
"description": "Services for Recurrent Text-related Tasks", | ||
@@ -34,3 +34,5 @@ "main": "lib/main.js", | ||
"guy-test": "^1.4.1", | ||
"hyphenopoly": "^4.0.0", | ||
"intertype": "^3.1.0", | ||
"linebreak": "^1.0.2", | ||
"multimix": "^2.1.1", | ||
@@ -37,0 +39,0 @@ "steampipes": "^3.5.1" |
160
README.md
@@ -10,4 +10,6 @@ <!-- START doctoc generated TOC please keep comment here to allow auto update --> | ||
- [Turning Texts into "Slabs"](#turning-texts-into-slabs) | ||
- [HTML Parsing](#html-parsing) | ||
- [HTML Generation](#html-generation) | ||
- [HTML](#html) | ||
- [HTML Parsing](#html-parsing) | ||
- [HTML Generation](#html-generation) | ||
- [Example: HTML Parsing and HTML Generation](#example-html-parsing-and-html-generation) | ||
- [Codepoint Characterization](#codepoint-characterization) | ||
@@ -44,6 +46,14 @@ - [Benchmarks](#benchmarks) | ||
see jzr/benchmarks/src/hyphenation/main.coffee | ||
see jzr/benchmarks/README.md | ||
probably using `mnater/hyphenopoly` | ||
Implemented with [`mnater/hyphenopoly`](https://github.com/mnater/Hyphenopoly). | ||
* `INTERTEXT.HYPH.hyphenate = ( text ) ->`: return the text with soft hyphens (U+00ad) inserted. For | ||
languages other than US English, `INTERTEXT.HYPH.new_hyphenator = ( settings ) ->` may in a future version | ||
be used to obtain a custom hyphenation function. | ||
* `INTERTEXT.HYPH.count_soft_hyphens = ( text ) ->`: Count occurrences of U+00ad in `text`.
* `INTERTEXT.HYPH.reveal_hyphens = ( text, replacement = '-' ) ->`: Replace all soft hyphens with | ||
`replacement`. | ||
### Turning Texts into "Slabs" | ||
@@ -67,16 +77,138 @@ | ||
## HTML Parsing | ||
## HTML | ||
see jzr/benchmarks/src/streaming-html-parsers/main.coffee | ||
see jzr/benchmarks/src/streaming-html-parsers/mkts-tagparser.coffee | ||
### HTML Parsing | ||
probably using `atlassubbed/atlas-html-stream` | ||
HTML parsing uses [`atlassubbed/atlas-html-stream`](https://github.com/atlassubbed/atlas-html-stream) to | ||
turn HTML5 texts into series of [datoms](https://github.com/loveencounterflow/datom). Two HTML formats are | ||
supported: | ||
* plain HTML5, and | ||
* MKTScript, a nascent crossbreed of a kind-of MarkDown with HTMLish tags. | ||
## HTML Generation | ||
Unless you know what you're after you'll probably want to use the plain HTML5 flavor. | ||
Successor to `coffeenode-teacup` | ||
After `{ HTML, } = require 'intertext'`, use one of these methods: | ||
Serialization implemented in [Datom](https://github.com/loveencounterflow/datom) | ||
* `HTML.html_as_datoms = ( text ) ->` to turn HTML fragments or entire documents into a list of datoms, or | ||
* `HTML.mkts_html_as_datoms = ( text ) ->` to do the same with MKTScript. | ||
Both methods work pretty much the same and are the inverse operations to `HTML.datom_as_html()`: | ||
* All opening tags will be turned into datoms whose `$key` is the tagname prefixed with the left pointy | ||
bracket as sigil, and attribute name/value pairs becoming properties of the datom. | ||
* Closing tags will be turned into datoms whose `$key` is the tagname prefixed with the right pointy bracket | ||
as sigil. | ||
* For plain HTML, 'lone'/'self-closing' tags will be treated like an opening tag immediately followed by a
  closing tag.
* For MKTScript, 'lone'/'self-closing' tags will be turned into datoms whose `$key` is the tagname prefixed | ||
with the caret as sigil. | ||
* Intermittent text will be turned into datoms whose `$key` is `^text` and whose contents are stored under | ||
the `text` property. | ||
* Whitespace will be preserved. | ||
### HTML Generation | ||
<!-- Successor to `coffeenode-teacup`? --> | ||
`{ HTML, } = require 'intertext'` | ||
* `HTML.datom_as_html = ( d ) ->` | ||
* For the tagname: | ||
* `d.$key` will become the tagname | ||
* the tagname must conform to the [XML tagname restrictions](https://www.w3.org/TR/xml) | ||
* For the attributes: | ||
* all facets with value `true` (the boolean, not the text) will be turned into 'lone attributes', such | ||
that `{ $key: '<p', contenteditable: true, }` will result in `<p contenteditable>` | ||
* facet values are subject to HTML5 attribute value escaping rules as detailed in | ||
https://mathiasbynens.be/notes/unquoted-attribute-values | ||
* where permitted, values will be left unquoted ('naked'); where necessary, values will be surrounded | ||
by `'` (single quotes) | ||
* facets with an empty string are not treated specially; per attribute value escaping rules, they will | ||
result in `''` (two single quotes) | ||
* all keys that start with a `$` will be ignored | ||
* if `d.$value` is an object, its facets will be turned into HTML attributes; all other keys are ignored | ||
* Open questions: | ||
* how to treat system-level names (sigils `[`, `~`, `]`)? | ||
* ignore? | ||
* as comments? | ||
* as prefixed/namespaced tags? | ||
* how to treat datom keys that contain hyphens, underscores? | ||
* turn underscores into hyphens? | ||
### Example: HTML Parsing and HTML Generation | ||
```coffee | ||
text = """<!DOCTYPE html> | ||
<h1><strong>CHAPTER VI.</strong> <name ref=hd553>Humpty Dumpty</h1> | ||
<p id=p227>However, the egg only got larger and larger, and <em>more and more human</em>:<br> | ||
when she had come within a few yards of it, she saw that it had eyes and a nose and mouth; and when she | ||
had come close to it, she saw clearly that it was <name ref=hd556>HUMPTY DUMPTY</name> himself. ‘It can’t | ||
be anybody else!’ she said to herself.<br/> | ||
‘I’m as certain of it, as if his name were written all over his face.’ | ||
""" | ||
datoms = HTML.html_as_datoms text
for d in datoms
  log JSON.stringify d
log '-'.repeat 108 | ||
log ( HTML.datom_as_html d for d in datoms ).join '' | ||
``` | ||
... will produce: | ||
```json | ||
{ "$key": "^doctype", "$value": "html", } | ||
{ "$key": "^text", "text": "\n", } | ||
{ "$key": "<h1", } | ||
{ "$key": "<strong", } | ||
{ "$key": "^text", "text": "CHAPTER VI.", } | ||
{ "$key": ">strong", } | ||
{ "$key": "^text", "text": " ", } | ||
{ "$key": "<name", "ref": "hd553", } | ||
{ "$key": "^text", "text": "Humpty Dumpty", } | ||
{ "$key": ">h1", } | ||
{ "$key": "^text", "text": "\n\n", } | ||
{ "$key": "<p", "id": "p227", } | ||
{ "$key": "^text", "text": "However, the egg only got larger and larger, and ", } | ||
{ "$key": "<em", } | ||
{ "$key": "^text", "text": "more and more human", } | ||
{ "$key": ">em", } | ||
{ "$key": "^text", "text": ":", } | ||
{ "$key": "<br", } | ||
{ "$key": "^text", "text": "\n\nwhen she had come within ... she saw clearly that it was ", } | ||
{ "$key": "<name", "ref": "hd556", } | ||
{ "$key": "^text", "text": "HUMPTY DUMPTY", } | ||
{ "$key": ">name", } | ||
{ "$key": "^text", "text": " himself. ‘It can’t\nbe anybody else!’ she said to herself.", } | ||
{ "$key": "<br", } | ||
{ "$key": ">br", } | ||
{ "$key": "^text", "text": "\n\n‘I’m as certain ... all over his face.’\n", } | ||
``` | ||
```html | ||
<!DOCTYPE html> | ||
<h1><strong>CHAPTER VI.</strong> <name ref=hd553>Humpty Dumpty</h1> | ||
<p id=p227>However, the egg only got larger and larger, and <em>more and more human</em>:<br> | ||
when she had come within a few yards of it, she saw that it had eyes and a nose and mouth; and when she | ||
had come close to it, she saw clearly that it was <name ref=hd556>HUMPTY DUMPTY</name> himself. ‘It can’t | ||
be anybody else!’ she said to herself.<br></br> | ||
‘I’m as certain of it, as if his name were written all over his face.’ | ||
``` | ||
As can be seen, no validation will be done, and the parser will happily produce events for unclosed and | ||
unbalanced closing tags. There is a minor issue with the `<br></br>` tag pair which will get resolved in | ||
a future version. | ||
## Codepoint Characterization | ||
@@ -169,3 +301,3 @@ | ||
``` | ||
hyphenopoly hypher | ||
hyphenopoly hypher | ||
————————————————————————————————————————————————————————— | ||
@@ -183,3 +315,3 @@ thun-der-storm’s thun-der-stor-m’s | ||
``` | ||
hyphenopoly hypher | ||
hyphenopoly hypher | ||
————————————————————————————————————————————————————————— | ||
@@ -186,0 +318,0 @@ Düssel-dorf Düs-sel-dorf |
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Major refactor
Supply chain riskPackage has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package
174917
30
2598
383
0
8
4
+ Addedhyphenopoly@^4.0.0
+ Addedlinebreak@^1.0.2
+ Addedbase64-js@0.0.8(transitive)
+ Addedhyphenopoly@4.12.0(transitive)
+ Addedlinebreak@1.1.0(transitive)
+ Addedpako@0.2.9(transitive)
+ Addedtiny-inflate@1.0.3(transitive)
+ Addedunicode-trie@2.0.0(transitive)