// MpHtmlParser.js
/**
 * HTML parser
 * @tutorial https://github.com/jin-yufeng/Parser
 * @version 20200513
 * @author JinYufeng
 * @license MIT
 */
  8. var cfg = require('./config.js'),
  9. blankChar = cfg.blankChar,
  10. CssHandler = require('./CssHandler.js'),
  11. windowWidth = uni.getSystemInfoSync().windowWidth;
  12. var emoji;
  13. class MpHtmlParser {
  14. constructor(data, options = {}) {
  15. this.attrs = {};
  16. this.CssHandler = new CssHandler(options.tagStyle, windowWidth);
  17. this.data = data;
  18. this.domain = options.domain;
  19. this.DOM = [];
  20. this.i = this.start = this.audioNum = this.imgNum = this.videoNum = 0;
  21. options.prot = (this.domain || '').includes('://') ? this.domain.split('://')[0] : 'http';
  22. this.options = options;
  23. this.state = this.Text;
  24. this.STACK = [];
  25. }
  26. parse() {
  27. if (emoji) this.data = emoji.parseEmoji(this.data);
  28. for (var c; c = this.data[this.i]; this.i++)
  29. this.state(c);
  30. if (this.state == this.Text) this.setText();
  31. while (this.STACK.length) this.popNode(this.STACK.pop());
  32. if (this.DOM.length) {
  33. this.DOM[0].PoweredBy = 'Parser';
  34. if (this.title) this.DOM[0].title = this.title;
  35. }
  36. return this.DOM;
  37. }
  38. // 设置属性
  39. setAttr() {
  40. var name = this.attrName.toLowerCase();
  41. if (cfg.trustAttrs[name]) {
  42. var val = this.attrVal;
  43. if (val) {
  44. if (name == 'src') this.attrs[name] = this.getUrl(this.decode(val, 'amp'));
  45. else if (name == 'href' || name == 'style') this.attrs[name] = this.decode(val, 'amp');
  46. else this.attrs[name] = val;
  47. } else if (cfg.boolAttrs[name]) this.attrs[name] = 'T';
  48. }
  49. this.attrVal = '';
  50. while (blankChar[this.data[this.i]]) this.i++;
  51. if (this.isClose()) this.setNode();
  52. else {
  53. this.start = this.i;
  54. this.state = this.AttrName;
  55. }
  56. }
  57. // 设置文本节点
  58. setText() {
  59. var back, text = this.section();
  60. if (!text) return;
  61. text = (cfg.onText && cfg.onText(text, () => back = true)) || text;
  62. if (back) {
  63. this.data = this.data.substr(0, this.start) + text + this.data.substr(this.i);
  64. let j = this.start + text.length;
  65. for (this.i = this.start; this.i < j; this.i++) this.state(this.data[this.i]);
  66. return;
  67. }
  68. if (!this.pre) {
  69. // 合并空白符
  70. var tmp = [];
  71. for (let i = text.length, c; c = text[--i];)
  72. if (!blankChar[c] || (!blankChar[tmp[0]] && (c = ' '))) tmp.unshift(c);
  73. text = tmp.join('');
  74. }
  75. this.siblings().push({
  76. type: 'text',
  77. text: this.decode(text)
  78. });
  79. }
  80. // 设置元素节点
  81. setNode() {
  82. var node = {
  83. name: this.tagName.toLowerCase(),
  84. attrs: this.attrs
  85. },
  86. close = cfg.selfClosingTags[node.name];
  87. this.attrs = {};
  88. if (!cfg.ignoreTags[node.name]) {
  89. this.matchAttr(node);
  90. if (!close) {
  91. node.children = [];
  92. if (node.name == 'pre' && cfg.highlight) {
  93. this.remove(node);
  94. this.pre = node.pre = true;
  95. }
  96. this.siblings().push(node);
  97. this.STACK.push(node);
  98. } else if (!cfg.filter || cfg.filter(node, this) != false)
  99. this.siblings().push(node);
  100. } else {
  101. if (!close) this.remove(node);
  102. else if (node.name == 'source') {
  103. var parent = this.parent();
  104. if (parent && (parent.name == 'video' || parent.name == 'audio') && node.attrs.src)
  105. parent.attrs.source.push(node.attrs.src);
  106. } else if (node.name == 'base' && !this.domain) this.domain = node.attrs.href;
  107. }
  108. if (this.data[this.i] == '/') this.i++;
  109. this.start = this.i + 1;
  110. this.state = this.Text;
  111. }
  112. // 移除标签
  113. remove(node) {
  114. var name = node.name,
  115. j = this.i;
  116. // 处理 svg
  117. var handleSvg = () => {
  118. var src = this.data.substring(j, this.i + 1);
  119. if (!node.attrs.xmlns) src = ' xmlns="http://www.w3.org/2000/svg"' + src;
  120. var i = j;
  121. while (this.data[j] != '<') j--;
  122. src = this.data.substring(j, i) + src;
  123. var parent = this.parent();
  124. if (node.attrs.width == '100%' && parent && (parent.attrs.style || '').includes('inline'))
  125. parent.attrs.style = 'width:300px;max-width:100%;' + parent.attrs.style;
  126. this.siblings().push({
  127. name: 'img',
  128. attrs: {
  129. src: 'data:image/svg+xml;utf8,' + src.replace(/#/g, '%23'),
  130. ignore: 'T'
  131. }
  132. })
  133. }
  134. if (node.name == 'svg' && this.data[j] == '/') return handleSvg(this.i++);
  135. while (1) {
  136. if ((this.i = this.data.indexOf('</', this.i + 1)) == -1) {
  137. if (name == 'pre' || name == 'svg') this.i = j;
  138. else this.i = this.data.length;
  139. return;
  140. }
  141. this.start = (this.i += 2);
  142. while (!blankChar[this.data[this.i]] && !this.isClose()) this.i++;
  143. if (this.section().toLowerCase() == name) {
  144. // 代码块高亮
  145. if (name == 'pre') {
  146. this.data = this.data.substr(0, j + 1) + cfg.highlight(this.data.substring(j + 1, this.i - 5), node.attrs) +
  147. this.data.substr(this.i - 5);
  148. return this.i = j;
  149. } else if (name == 'style')
  150. this.CssHandler.getStyle(this.data.substring(j + 1, this.i - 7));
  151. else if (name == 'title')
  152. this.title = this.data.substring(j + 1, this.i - 7);
  153. if ((this.i = this.data.indexOf('>', this.i)) == -1) this.i = this.data.length;
  154. if (name == 'svg') handleSvg();
  155. return;
  156. }
  157. }
  158. }
  159. // 处理属性
  160. matchAttr(node) {
  161. var attrs = node.attrs,
  162. style = this.CssHandler.match(node.name, attrs, node) + (attrs.style || ''),
  163. styleObj = {};
  164. if (attrs.id) {
  165. if (this.options.compress & 1) attrs.id = void 0;
  166. else if (this.options.useAnchor) this.bubble();
  167. }
  168. if ((this.options.compress & 2) && attrs.class) attrs.class = void 0;
  169. switch (node.name) {
  170. case 'a':
  171. case 'ad':
  172. this.bubble();
  173. break;
  174. // #ifdef APP-PLUS
  175. case 'iframe':
  176. case 'embed':
  177. this.bubble();
  178. break;
  179. // #endif
  180. case 'font':
  181. if (attrs.color) {
  182. styleObj['color'] = attrs.color;
  183. attrs.color = void 0;
  184. }
  185. if (attrs.face) {
  186. styleObj['font-family'] = attrs.face;
  187. attrs.face = void 0;
  188. }
  189. if (attrs.size) {
  190. var size = parseInt(attrs.size);
  191. if (size < 1) size = 1;
  192. else if (size > 7) size = 7;
  193. var map = ['xx-small', 'x-small', 'small', 'medium', 'large', 'x-large', 'xx-large'];
  194. styleObj['font-size'] = map[size - 1];
  195. attrs.size = void 0;
  196. }
  197. break;
  198. case 'video':
  199. case 'audio':
  200. if (!attrs.id) attrs.id = node.name + (++this[`${node.name}Num`]);
  201. else this[`${node.name}Num`]++;
  202. if (node.name == 'video') {
  203. if (this.videoNum > 3)
  204. node.lazyLoad = 1;
  205. if (attrs.width) {
  206. styleObj.width = parseFloat(attrs.width) + (attrs.width.includes('%') ? '%' : 'px');
  207. attrs.width = void 0;
  208. }
  209. if (attrs.height) {
  210. styleObj.height = parseFloat(attrs.height) + (attrs.height.includes('%') ? '%' : 'px');
  211. attrs.height = void 0;
  212. }
  213. }
  214. attrs.source = [];
  215. if (attrs.src) attrs.source.push(attrs.src);
  216. if (!attrs.controls && !attrs.autoplay)
  217. console.warn(`存在没有 controls 属性的 ${node.name} 标签,可能导致无法播放`, node);
  218. this.bubble();
  219. break;
  220. case 'td':
  221. case 'th':
  222. if (attrs.colspan || attrs.rowspan)
  223. for (var k = this.STACK.length, item; item = this.STACK[--k];)
  224. if (item.name == 'table') {
  225. item.c = void 0;
  226. break;
  227. }
  228. }
  229. if (attrs.align) {
  230. styleObj['text-align'] = attrs.align;
  231. attrs.align = void 0;
  232. }
  233. // 压缩 style
  234. var styles = style.split(';');
  235. style = '';
  236. for (var i = 0, len = styles.length; i < len; i++) {
  237. var info = styles[i].split(':');
  238. if (info.length < 2) continue;
  239. let key = info[0].trim().toLowerCase(),
  240. value = info.slice(1).join(':').trim();
  241. if (value.includes('-webkit') || value.includes('-moz') || value.includes('-ms') || value.includes('-o') || value
  242. .includes(
  243. 'safe'))
  244. style += `;${key}:${value}`;
  245. else if (!styleObj[key] || value.includes('import') || !styleObj[key].includes('import'))
  246. styleObj[key] = value;
  247. }
  248. if (node.name == 'img') {
  249. if (attrs['data-src']) {
  250. attrs.src = attrs.src || attrs['data-src'];
  251. attrs['data-src'] = void 0;
  252. }
  253. if (attrs.src && !attrs.ignore) {
  254. if (this.bubble())
  255. attrs.i = (this.imgNum++).toString();
  256. else attrs.ignore = 'T';
  257. }
  258. if (attrs.ignore) styleObj['max-width'] = '100%';
  259. var width;
  260. if (styleObj.width) width = styleObj.width;
  261. else if (attrs.width) width = attrs.width.includes('%') ? attrs.width : attrs.width + 'px';
  262. if (width) {
  263. styleObj.width = width;
  264. attrs.width = '100%';
  265. if (parseInt(width) > windowWidth) {
  266. styleObj.height = '';
  267. if (attrs.height) attrs.height = void 0;
  268. }
  269. }
  270. if (styleObj.height) {
  271. attrs.height = styleObj.height;
  272. styleObj.height = '';
  273. } else if (attrs.height && !attrs.height.includes('%'))
  274. attrs.height += 'px';
  275. }
  276. for (var key in styleObj) {
  277. var value = styleObj[key];
  278. if (key.includes('flex') || key == 'order' || key == 'self-align') node.c = 1;
  279. // 填充链接
  280. if (value.includes('url')) {
  281. var j = value.indexOf('(');
  282. if (j++ != -1) {
  283. while (value[j] == '"' || value[j] == "'" || blankChar[value[j]]) j++;
  284. value = value.substr(0, j) + this.getUrl(value.substr(j));
  285. }
  286. }
  287. // 转换 rpx
  288. else if (value.includes('rpx'))
  289. value = value.replace(/[0-9.]+\s*rpx/g, $ => parseFloat($) * windowWidth / 750 + 'px');
  290. else if (key == 'white-space' && value.includes('pre'))
  291. this.pre = node.pre = true;
  292. style += `;${key}:${value}`;
  293. }
  294. style = style.substr(1);
  295. if (style) attrs.style = style;
  296. }
  297. // 节点出栈处理
  298. popNode(node) {
  299. // 空白符处理
  300. if (node.pre) {
  301. node.pre = this.pre = void 0;
  302. for (let i = this.STACK.length; i--;)
  303. if (this.STACK[i].pre)
  304. this.pre = true;
  305. }
  306. var siblings = this.siblings(),
  307. len = siblings.length,
  308. childs = node.children;
  309. if (node.name == 'head' || (cfg.filter && cfg.filter(node, this) == false))
  310. return siblings.pop();
  311. var attrs = node.attrs;
  312. // 替换一些标签名
  313. if (cfg.blockTags[node.name]) node.name = 'div';
  314. else if (!cfg.trustTags[node.name]) node.name = 'span';
  315. // 去除块标签前后空串
  316. if (node.name == 'div' || node.name == 'p' || node.name[0] == 't') {
  317. if (len > 1 && siblings[len - 2].text == ' ')
  318. siblings.splice(--len - 1, 1);
  319. if (childs.length && childs[childs.length - 1].text == ' ')
  320. childs.pop();
  321. }
  322. // 处理列表
  323. if (node.c && (node.name == 'ul' || node.name == 'ol')) {
  324. if ((node.attrs.style || '').includes('list-style:none')) {
  325. for (let i = 0, child; child = childs[i++];)
  326. if (child.name == 'li')
  327. child.name = 'div';
  328. } else if (node.name == 'ul') {
  329. var floor = 1;
  330. for (let i = this.STACK.length; i--;)
  331. if (this.STACK[i].name == 'ul') floor++;
  332. if (floor != 1)
  333. for (let i = childs.length; i--;)
  334. childs[i].floor = floor;
  335. } else {
  336. for (let i = 0, num = 1, child; child = childs[i++];)
  337. if (child.name == 'li') {
  338. child.type = 'ol';
  339. child.num = ((num, type) => {
  340. if (type == 'a') return String.fromCharCode(97 + (num - 1) % 26);
  341. if (type == 'A') return String.fromCharCode(65 + (num - 1) % 26);
  342. if (type == 'i' || type == 'I') {
  343. num = (num - 1) % 99 + 1;
  344. var one = ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX'],
  345. ten = ['X', 'XX', 'XXX', 'XL', 'L', 'LX', 'LXX', 'LXXX', 'XC'],
  346. res = (ten[Math.floor(num / 10) - 1] || '') + (one[num % 10 - 1] || '');
  347. if (type == 'i') return res.toLowerCase();
  348. return res;
  349. }
  350. return num;
  351. })(num++, attrs.type) + '.';
  352. }
  353. }
  354. }
  355. // 处理表格的边框
  356. if (node.name == 'table') {
  357. var padding = attrs.cellpadding,
  358. spacing = attrs.cellspacing,
  359. border = attrs.border;
  360. if (node.c) {
  361. this.bubble();
  362. if (!padding) padding = 2;
  363. if (!spacing) spacing = 2;
  364. }
  365. if (border) attrs.style = `border:${border}px solid gray;${attrs.style || ''}`;
  366. if (spacing) attrs.style = `border-spacing:${spacing}px;${attrs.style || ''}`;
  367. if (border || padding)
  368. (function f(ns) {
  369. for (var i = 0, n; n = ns[i]; i++) {
  370. if (n.name == 'th' || n.name == 'td') {
  371. if (border) n.attrs.style = `border:${border}px solid gray;${n.attrs.style}`;
  372. if (padding) n.attrs.style = `padding:${padding}px;${n.attrs.style}`;
  373. } else f(n.children || []);
  374. }
  375. })(childs)
  376. if (this.options.autoscroll) {
  377. var table = Object.assign({}, node);
  378. node.name = 'div';
  379. node.attrs = {
  380. style: 'overflow:scroll'
  381. }
  382. node.children = [table];
  383. }
  384. }
  385. this.CssHandler.pop && this.CssHandler.pop(node);
  386. // 自动压缩
  387. if (node.name == 'div' && !Object.keys(attrs).length && childs.length == 1 && childs[0].name == 'div')
  388. siblings[len - 1] = childs[0];
  389. }
  390. // 工具函数
  391. bubble() {
  392. for (var i = this.STACK.length, item; item = this.STACK[--i];) {
  393. if (cfg.richOnlyTags[item.name]) {
  394. if (item.name == 'table' && !Object.hasOwnProperty.call(item, 'c')) item.c = 1;
  395. return false;
  396. }
  397. item.c = 1;
  398. }
  399. return true;
  400. }
  401. decode(val, amp) {
  402. var i = -1,
  403. j, en;
  404. while (1) {
  405. if ((i = val.indexOf('&', i + 1)) == -1) break;
  406. if ((j = val.indexOf(';', i + 2)) == -1) break;
  407. if (val[i + 1] == '#') {
  408. en = parseInt((val[i + 2] == 'x' ? '0' : '') + val.substring(i + 2, j));
  409. if (!isNaN(en)) val = val.substr(0, i) + String.fromCharCode(en) + val.substr(j + 1);
  410. } else {
  411. en = val.substring(i + 1, j);
  412. if (cfg.entities[en] || en == amp)
  413. val = val.substr(0, i) + (cfg.entities[en] || '&') + val.substr(j + 1);
  414. }
  415. }
  416. return val;
  417. }
  418. getUrl(url) {
  419. if (url[0] == '/') {
  420. if (url[1] == '/') url = this.options.prot + ':' + url;
  421. else if (this.domain) url = this.domain + url;
  422. } else if (this.domain && url.indexOf('data:') != 0 && !url.includes('://'))
  423. url = this.domain + '/' + url;
  424. return url;
  425. }
  426. isClose() {
  427. return this.data[this.i] == '>' || (this.data[this.i] == '/' && this.data[this.i + 1] == '>');
  428. }
  429. section() {
  430. return this.data.substring(this.start, this.i);
  431. }
  432. parent() {
  433. return this.STACK[this.STACK.length - 1];
  434. }
  435. siblings() {
  436. return this.STACK.length ? this.parent().children : this.DOM;
  437. }
  438. // 状态机
  439. Text(c) {
  440. if (c == '<') {
  441. var next = this.data[this.i + 1],
  442. isLetter = c => (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
  443. if (isLetter(next)) {
  444. this.setText();
  445. this.start = this.i + 1;
  446. this.state = this.TagName;
  447. } else if (next == '/') {
  448. this.setText();
  449. if (isLetter(this.data[++this.i + 1])) {
  450. this.start = this.i + 1;
  451. this.state = this.EndTag;
  452. } else
  453. this.Comment();
  454. } else if (next == '!') {
  455. this.setText();
  456. this.Comment();
  457. }
  458. }
  459. }
  460. Comment() {
  461. var key;
  462. if (this.data.substring(this.i + 2, this.i + 4) == '--') key = '-->';
  463. else if (this.data.substring(this.i + 2, this.i + 9) == '[CDATA[') key = ']]>';
  464. else key = '>';
  465. if ((this.i = this.data.indexOf(key, this.i + 2)) == -1) this.i = this.data.length;
  466. else this.i += key.length - 1;
  467. this.start = this.i + 1;
  468. this.state = this.Text;
  469. }
  470. TagName(c) {
  471. if (blankChar[c]) {
  472. this.tagName = this.section();
  473. while (blankChar[this.data[this.i]]) this.i++;
  474. if (this.isClose()) this.setNode();
  475. else {
  476. this.start = this.i;
  477. this.state = this.AttrName;
  478. }
  479. } else if (this.isClose()) {
  480. this.tagName = this.section();
  481. this.setNode();
  482. }
  483. }
  484. AttrName(c) {
  485. var blank = blankChar[c];
  486. if (blank) {
  487. this.attrName = this.section();
  488. c = this.data[this.i];
  489. }
  490. if (c == '=') {
  491. if (!blank) this.attrName = this.section();
  492. while (blankChar[this.data[++this.i]]);
  493. this.start = this.i--;
  494. this.state = this.AttrValue;
  495. } else if (blank) this.setAttr();
  496. else if (this.isClose()) {
  497. this.attrName = this.section();
  498. this.setAttr();
  499. }
  500. }
  501. AttrValue(c) {
  502. if (c == '"' || c == "'") {
  503. this.start++;
  504. if ((this.i = this.data.indexOf(c, this.i + 1)) == -1) return this.i = this.data.length;
  505. this.attrVal = this.section();
  506. this.i++;
  507. } else {
  508. for (; !blankChar[this.data[this.i]] && !this.isClose(); this.i++);
  509. this.attrVal = this.section();
  510. }
  511. this.setAttr();
  512. }
  513. EndTag(c) {
  514. if (blankChar[c] || c == '>' || c == '/') {
  515. var name = this.section().toLowerCase();
  516. for (var i = this.STACK.length; i--;)
  517. if (this.STACK[i].name == name) break;
  518. if (i != -1) {
  519. var node;
  520. while ((node = this.STACK.pop()).name != name);
  521. this.popNode(node);
  522. } else if (name == 'p' || name == 'br')
  523. this.siblings().push({
  524. name,
  525. attrs: {}
  526. });
  527. this.i = this.data.indexOf('>', this.i);
  528. this.start = this.i + 1;
  529. if (this.i == -1) this.i = this.data.length;
  530. else this.state = this.Text;
  531. }
  532. }
  533. }
  534. module.exports = MpHtmlParser;