You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

580 lines
18 KiB

3 years ago
  1. /**
  2. * html 解析器
  3. * @tutorial https://github.com/jin-yufeng/Parser
  4. * @version 20201029
  5. * @author JinYufeng
  6. * @license MIT
  7. */
  8. const cfg = require('./config.js'),
  9. blankChar = cfg.blankChar,
  10. CssHandler = require('./CssHandler.js'),
  11. windowWidth = uni.getSystemInfoSync().windowWidth;
  12. var emoji;
  /**
   * Character-driven HTML parser that turns an HTML string into a node tree.
   *
   * The instance keeps a cursor (`i`), the start of the current token (`start`),
   * a stack of open elements (`STACK`) and the resulting top-level nodes (`DOM`).
   * `state` holds the handler that receives the next character.
   *
   * @param {string} data HTML source to parse.
   * @param {Object} [options] Parser options. This file reads `tagStyle`,
   *   `domain`, `nodes`, `compress`, `useAnchor` and `autoscroll`. Note that
   *   `options.prot` (the protocol used for `//host/path` URLs) is written
   *   onto the object that was passed in.
   */
  function MpHtmlParser(data, options = {}) {
    this.attrs = {};
    this.CssHandler = new CssHandler(options.tagStyle, windowWidth);
    this.data = data;
    this.domain = options.domain;
    this.DOM = [];
    this.i = this.start = this.audioNum = this.imgNum = this.videoNum = 0;
    // Protocol of the configured domain, falling back to http
    options.prot = (this.domain || '').includes('://') ? this.domain.split('://')[0] : 'http';
    this.options = options;
    this.state = this.Text;
    this.STACK = [];

    // ---- helpers ----

    /**
     * Flags every open ancestor with `c = 1`, walking from the innermost one
     * outwards. Stops and returns false as soon as an ancestor listed in
     * cfg.richOnlyTags is met; returns true otherwise.
     */
    this.bubble = () => {
      for (var i = this.STACK.length, item; item = this.STACK[--i];) {
        if (cfg.richOnlyTags[item.name]) return false;
        item.c = 1;
      }
      return true;
    }

    /**
     * Replaces character references in `val`.
     * Numeric references (`&#65;`, `&#x41;`, `&#X41;`) are always decoded;
     * named references are decoded when they are own keys of cfg.entities.
     * A reference whose name equals `amp` is replaced by '&'.
     *
     * @param {string} val Text to decode.
     * @param {string} [amp] Extra entity name to decode to '&' (callers pass 'amp').
     * @returns {string} Decoded text.
     */
    this.decode = (val, amp) => {
      var i = -1,
        j, en;
      while (1) {
        if ((i = val.indexOf('&', i + 1)) == -1) break;
        if ((j = val.indexOf(';', i + 2)) == -1) break;
        if (val[i + 1] == '#') {
          var isHex = val[i + 2] == 'x' || val[i + 2] == 'X';
          en = parseInt(val.substring(i + (isHex ? 3 : 2), j), isHex ? 16 : 10);
          // fromCodePoint keeps characters above U+FFFF intact (fromCharCode would
          // truncate them) but throws outside the Unicode range, hence the guard.
          if (!isNaN(en) && en >= 0 && en <= 0x10FFFF)
            val = val.substr(0, i) + String.fromCodePoint(en) + val.substr(j + 1);
        } else {
          en = val.substring(i + 1, j);
          // Own-property check so that names such as "constructor" never resolve
          // to something inherited from Object.prototype.
          var known = Object.prototype.hasOwnProperty.call(cfg.entities, en) && cfg.entities[en];
          if (known || en == amp)
            val = val.substr(0, i) + (known || '&') + val.substr(j + 1);
        }
      }
      return val;
    }

    /**
     * Completes a URL: `//host/x` gets the protocol from options.prot, `/x`
     * gets the domain prepended, and a relative path that is neither a data:
     * URI nor contains '://' is joined to the domain with '/'.
     */
    this.getUrl = url => {
      if (url[0] == '/') {
        if (url[1] == '/') url = this.options.prot + ':' + url;
        else if (this.domain) url = this.domain + url;
      } else if (this.domain && url.indexOf('data:') != 0 && !url.includes('://'))
        url = this.domain + '/' + url;
      return url;
    }

    // True when the cursor is on '>' or on the '/' of '/>'
    this.isClose = () => this.data[this.i] == '>' || (this.data[this.i] == '/' && this.data[this.i + 1] == '>');
    // Text between the token start and the cursor
    this.section = () => this.data.substring(this.start, this.i);
    // Innermost open element, or undefined when the stack is empty
    this.parent = () => this.STACK[this.STACK.length - 1];
    // Child list that newly created nodes are appended to
    this.siblings = () => this.STACK.length ? this.parent().children : this.DOM;
  }
  /**
   * Runs the parser over `this.data` and returns the resulting node list.
   * Each character is handed to the current state handler; any trailing text
   * is flushed and every element still open at the end is closed.
   *
   * @returns {Array} The top-level nodes (`this.DOM`).
   */
  MpHtmlParser.prototype.parse = function() {
    if (emoji) {
      this.data = emoji.parseEmoji(this.data);
    }
    // Handlers may move the cursor and rewrite this.data, so both are re-read on every step
    let ch = this.data[this.i];
    while (ch) {
      this.state(ch);
      this.i += 1;
      ch = this.data[this.i];
    }
    if (this.state == this.Text) {
      this.setText();
    }
    while (this.STACK.length > 0) {
      const open = this.STACK.pop();
      this.popNode(open);
    }
    return this.DOM;
  };
  /**
   * Stores the attribute that was just scanned (`attrName` / `attrVal`) on
   * `this.attrs`, then decides what comes next: finish the tag when the
   * cursor is on its end, otherwise start scanning the next attribute name.
   *
   * - names listed in cfg.boolAttrs are stored as 'T'
   * - `src` (or `data-src` when no src is set yet) is decoded and completed via getUrl
   * - `href` and `style` are decoded
   * - other `data-*` attributes are dropped, everything else is stored as is
   */
  MpHtmlParser.prototype.setAttr = function() {
    const key = this.attrName.toLowerCase();
    const raw = this.attrVal;
    if (cfg.boolAttrs[key]) {
      this.attrs[key] = 'T';
    } else if (raw) {
      const isSource = key == 'src' || (key == 'data-src' && !this.attrs.src);
      if (isSource) {
        this.attrs.src = this.getUrl(this.decode(raw, 'amp'));
      } else if (key == 'href' || key == 'style') {
        this.attrs[key] = this.decode(raw, 'amp');
      } else if (key.slice(0, 5) != 'data-') {
        this.attrs[key] = raw;
      }
    }
    this.attrVal = '';
    // Skip the blanks that separate this attribute from whatever follows
    while (blankChar[this.data[this.i]]) {
      this.i += 1;
    }
    if (this.isClose()) {
      this.setNode();
      return;
    }
    this.start = this.i;
    this.state = this.AttrName;
  };
  /**
   * Emits a text node for the characters between `start` and the cursor.
   *
   * If cfg.onText is defined it may replace the text; when it invokes the
   * callback it is given, the replacement is spliced into `this.data` and fed
   * through the state machine again instead of being emitted directly.
   *
   * Outside of `pre` mode, runs of blank characters are collapsed to a single
   * space, and a run that consists only of blanks and contains a line break is
   * dropped entirely. The text is entity-decoded before it is stored.
   */
  MpHtmlParser.prototype.setText = function() {
    var back, text = this.section();
    if (!text) return;
    text = (cfg.onText && cfg.onText(text, () => back = true)) || text;
    if (back) {
      this.data = this.data.substr(0, this.start) + text + this.data.substr(this.i);
      let j = this.start + text.length;
      for (this.i = this.start; this.i < j; this.i++) this.state(this.data[this.i]);
      return;
    }
    if (!this.pre) {
      // Collapse blanks. The text is walked backwards and collected with push()
      // followed by a single reverse(); unshift() per character would make
      // this quadratic on long text runs.
      var flag, tmp = [];
      for (let i = text.length, c; c = text[--i];) {
        if (!blankChar[c]) {
          tmp.push(c);
          if (!flag) flag = 1;
        } else {
          if (tmp[tmp.length - 1] != ' ') tmp.push(' ');
          // A line break seen before any visible character marks the run as droppable
          if (c == '\n' && flag == void 0) flag = 0;
        }
      }
      if (flag == 0) return;
      text = tmp.reverse().join('');
    }
    this.siblings().push({
      type: 'text',
      text: this.decode(text)
    });
  }
  /**
   * Builds the element node for the tag that was just scanned (`tagName` plus
   * the collected `attrs`) and attaches it to the tree.
   *
   * For tags not listed in cfg.ignoreTags this:
   *  - applies per-tag handling (links, font, embed, video/audio, table cells),
   *  - merges the style returned by CssHandler.match with the inline style,
   *    converting legacy attributes (align, width, height, font attrs) to CSS,
   *  - completes url(...) values and converts rpx lengths to px,
   *  - pushes non-self-closing nodes on the open-element stack.
   * Ignored tags are skipped via remove(), except that `source` feeds its src
   * to the enclosing video/audio and `base` supplies the domain.
   *
   * The `// #ifdef`, `// #ifndef` and `// #endif` comments are conditional
   * compilation directives and must stay exactly as written.
   */
  MpHtmlParser.prototype.setNode = function() {
    var node = {
        name: this.tagName.toLowerCase(),
        attrs: this.attrs
      },
      close = cfg.selfClosingTags[node.name];
    // options defaults to {} in the constructor, so nodes may be missing
    if (this.options.nodes && this.options.nodes.length) node.type = 'node';
    this.attrs = {};
    if (!cfg.ignoreTags[node.name]) {
      // Attribute handling
      var attrs = node.attrs,
        style = this.CssHandler.match(node.name, attrs, node) + (attrs.style || ''),
        styleObj = {};
      if (attrs.id) {
        if (this.options.compress & 1) attrs.id = void 0;
        else if (this.options.useAnchor) this.bubble();
      }
      if ((this.options.compress & 2) && attrs.class) attrs.class = void 0;
      switch (node.name) {
        case 'a':
        case 'ad': // #ifdef APP-PLUS
        case 'iframe':
          // #endif
          this.bubble();
          break;
        case 'font':
          if (attrs.color) {
            styleObj['color'] = attrs.color;
            attrs.color = void 0;
          }
          if (attrs.face) {
            styleObj['font-family'] = attrs.face;
            attrs.face = void 0;
          }
          if (attrs.size) {
            // Clamp the legacy 1-7 font size and map it to a CSS keyword
            var size = parseInt(attrs.size, 10);
            if (size < 1) size = 1;
            else if (size > 7) size = 7;
            var map = ['xx-small', 'x-small', 'small', 'medium', 'large', 'x-large', 'xx-large'];
            styleObj['font-size'] = map[size - 1];
            attrs.size = void 0;
          }
          break;
        case 'embed':
          // #ifndef APP-PLUS
          var src = node.attrs.src || '',
            type = node.attrs.type || '';
          if (type.includes('video') || src.includes('.mp4') || src.includes('.3gp') || src.includes('.m3u8'))
            node.name = 'video';
          else if (type.includes('audio') || src.includes('.m4a') || src.includes('.wav') || src.includes('.mp3') || src.includes(
              '.aac'))
            node.name = 'audio';
          else break;
          if (node.attrs.autostart)
            node.attrs.autoplay = 'T';
          node.attrs.controls = 'T';
          // #endif
          // #ifdef APP-PLUS
          this.bubble();
          break;
          // #endif
        case 'video':
        case 'audio':
          // Make sure every media node has an id and count it
          if (!attrs.id) attrs.id = node.name + (++this[`${node.name}Num`]);
          else this[`${node.name}Num`]++;
          if (node.name == 'video') {
            if (this.videoNum > 3)
              node.lazyLoad = 1;
            if (attrs.width) {
              styleObj.width = parseFloat(attrs.width) + (attrs.width.includes('%') ? '%' : 'px');
              attrs.width = void 0;
            }
            if (attrs.height) {
              styleObj.height = parseFloat(attrs.height) + (attrs.height.includes('%') ? '%' : 'px');
              attrs.height = void 0;
            }
          }
          if (!attrs.controls && !attrs.autoplay) attrs.controls = 'T';
          attrs.source = [];
          if (attrs.src) {
            attrs.source.push(attrs.src);
            attrs.src = void 0;
          }
          this.bubble();
          break;
        case 'td':
        case 'th':
          // Remember on the enclosing table that it contains spanning cells
          if (attrs.colspan || attrs.rowspan)
            for (var k = this.STACK.length, item; item = this.STACK[--k];)
              if (item.name == 'table') {
                item.flag = 1;
                break;
              }
      }
      if (attrs.align) {
        if (node.name == 'table') {
          if (attrs.align == 'center') styleObj['margin-inline-start'] = styleObj['margin-inline-end'] = 'auto';
          else styleObj['float'] = attrs.align;
        } else styleObj['text-align'] = attrs.align;
        attrs.align = void 0;
      }
      // Compress the style: later declarations win unless an earlier one is marked important
      var styles = style.split(';');
      style = '';
      for (var i = 0, len = styles.length; i < len; i++) {
        var info = styles[i].split(':');
        if (info.length < 2) continue;
        let key = info[0].trim().toLowerCase(),
          value = info.slice(1).join(':').trim();
        // Values starting with '-' or containing 'safe' are passed through unmerged
        if (value[0] == '-' || value.includes('safe'))
          style += `;${key}:${value}`;
        else if (!styleObj[key] || value.includes('import') || !styleObj[key].includes('import'))
          styleObj[key] = value;
      }
      if (node.name == 'img') {
        if (attrs.src && !attrs.ignore) {
          if (this.bubble())
            attrs.i = (this.imgNum++).toString();
          else attrs.ignore = 'T';
        }
        if (attrs.ignore) {
          style += ';-webkit-touch-callout:none';
          styleObj['max-width'] = '100%';
        }
        var width;
        if (styleObj.width) width = styleObj.width;
        else if (attrs.width) width = attrs.width.includes('%') ? attrs.width : parseFloat(attrs.width) + 'px';
        if (width) {
          styleObj.width = width;
          attrs.width = '100%';
          // Wider than the window: drop the height
          if (parseInt(width) > windowWidth) {
            styleObj.height = '';
            if (attrs.height) attrs.height = void 0;
          }
        }
        if (styleObj.height) {
          attrs.height = styleObj.height;
          styleObj.height = '';
        } else if (attrs.height && !attrs.height.includes('%'))
          attrs.height = parseFloat(attrs.height) + 'px';
      }
      for (var key in styleObj) {
        var value = styleObj[key];
        if (!value) continue;
        // Flex item/container properties flag the node. The CSS property is
        // "align-self"; the misspelt "self-align" is still accepted.
        if (key.includes('flex') || key == 'order' || key == 'align-self' || key == 'self-align') node.c = 1;
        // Complete links inside url(...)
        if (value.includes('url')) {
          var j = value.indexOf('(');
          if (j++ != -1) {
            while (value[j] == '"' || value[j] == "'" || blankChar[value[j]]) j++;
            value = value.substr(0, j) + this.getUrl(value.substr(j));
          }
        }
        // Convert rpx to px
        else if (value.includes('rpx'))
          value = value.replace(/[0-9.]+\s*rpx/g, $ => parseFloat($) * windowWidth / 750 + 'px');
        else if (key == 'white-space' && value.includes('pre') && !close)
          this.pre = node.pre = true;
        style += `;${key}:${value}`;
      }
      style = style.substr(1);
      if (style) attrs.style = style;
      if (!close) {
        node.children = [];
        if (node.name == 'pre' && cfg.highlight) {
          this.remove(node);
          this.pre = node.pre = true;
        }
        this.siblings().push(node);
        this.STACK.push(node);
      } else if (!cfg.filter || cfg.filter(node, this) != false)
        this.siblings().push(node);
    } else {
      if (!close) this.remove(node);
      else if (node.name == 'source') {
        var parent = this.parent();
        if (parent && (parent.name == 'video' || parent.name == 'audio') && node.attrs.src)
          parent.attrs.source.push(node.attrs.src);
      } else if (node.name == 'base' && !this.domain) this.domain = node.attrs.href;
    }
    if (this.data[this.i] == '/') this.i++;
    this.start = this.i + 1;
    this.state = this.Text;
  }
  /**
   * Skips the content of `node` up to and including its closing tag, moving
   * the cursor there. The cursor is expected to be on the end of the opening tag.
   *
   * Special cases:
   *  - svg: the raw markup is re-serialised and appended as an `img` node
   *    with a data: URI source;
   *  - pre: the content is replaced in `this.data` by the result of
   *    cfg.highlight and the cursor is reset so the new content gets parsed;
   *  - style: the content is handed to CssHandler.getStyle;
   *  - title: the content is stored on `this.DOM.title`.
   * When no closing tag exists, pre/svg restore the cursor and every other
   * tag consumes the rest of the input.
   *
   * @param {Object} node Node whose content is to be skipped.
   */
  MpHtmlParser.prototype.remove = function(node) {
    var name = node.name,
      j = this.i;
    // Turn the svg markup between j and the cursor into an image node
    var handleSvg = () => {
      var src = this.data.substring(j, this.i + 1);
      node.attrs.xmlns = 'http://www.w3.org/2000/svg';
      for (var key in node.attrs) {
        // attribute names were lower-cased while parsing; viewBox needs its camel case back
        if (key == 'viewbox') src = ` viewBox="${node.attrs.viewbox}"` + src;
        else if (key != 'style') src = ` ${key}="${node.attrs[key]}"` + src;
      }
      src = '<svg' + src;
      var parent = this.parent();
      if (node.attrs.width == '100%' && parent && (parent.attrs.style || '').includes('inline'))
        parent.attrs.style = 'width:300px;max-width:100%;' + parent.attrs.style;
      this.siblings().push({
        name: 'img',
        attrs: {
          src: 'data:image/svg+xml;utf8,' + src.replace(/#/g, '%23'),
          style: node.attrs.style,
          ignore: 'T'
        }
      })
    }
    // Self-closing <svg ... />
    if (node.name == 'svg' && this.data[j] == '/') return handleSvg(this.i++);
    while (1) {
      if ((this.i = this.data.indexOf('</', this.i + 1)) == -1) {
        if (name == 'pre' || name == 'svg') this.i = j;
        else this.i = this.data.length;
        return;
      }
      this.start = (this.i += 2);
      // Read the closing tag name. The length bound is required: on input that
      // ends inside the name (e.g. "</sty") neither stop condition is ever met.
      while (this.i < this.data.length && !blankChar[this.data[this.i]] && !this.isClose()) this.i++;
      if (this.section().toLowerCase() == name) {
        // Code block highlighting
        if (name == 'pre') {
          // this.i - 5 is the '<' of "</pre"
          this.data = this.data.substr(0, j + 1) + cfg.highlight(this.data.substring(j + 1, this.i - 5), node.attrs) + this.data
            .substr(this.i - 5);
          return this.i = j;
        } else if (name == 'style')
          // this.i - 7 is the '<' of "</style"
          this.CssHandler.getStyle(this.data.substring(j + 1, this.i - 7));
        else if (name == 'title')
          this.DOM.title = this.data.substring(j + 1, this.i - 7);
        if ((this.i = this.data.indexOf('>', this.i)) == -1) this.i = this.data.length;
        if (name == 'svg') handleSvg();
        return;
      }
    }
  }
  /**
   * Post-processes an element once it has been taken off the open-element stack.
   *
   * Steps, in order:
   *  - restore `pre` mode from the remaining ancestors,
   *  - drop `head` and anything rejected by cfg.filter,
   *  - rename tags: cfg.blockTags become `div`, tags not in cfg.trustTags become `span`,
   *  - for flagged (`c`) lists, compute nesting level / ordered-list markers,
   *  - for tables, apply cellpadding / cellspacing / border, converting flagged
   *    tables with spanning cells into a flat list of grid-positioned `div`s,
   *    and optionally wrap the table in a scroll container,
   *  - collapse an attribute-less `div` whose only child is a `div`.
   *
   * @param {Object} node The element that was just closed.
   */
  MpHtmlParser.prototype.popNode = function(node) {
    // Whitespace handling: leaving a pre node ends pre mode unless an ancestor is pre too
    if (node.pre) {
      node.pre = this.pre = void 0;
      for (let i = this.STACK.length; i--;)
        if (this.STACK[i].pre)
          this.pre = true;
    }
    var siblings = this.siblings(),
      len = siblings.length,
      childs = node.children;
    if (node.name == 'head' || (cfg.filter && cfg.filter(node, this) == false))
      return siblings.pop();
    var attrs = node.attrs;
    // Replace some tag names
    if (cfg.blockTags[node.name]) node.name = 'div';
    else if (!cfg.trustTags[node.name]) node.name = 'span';
    // Lists
    if (node.c && (node.name == 'ul' || node.name == 'ol')) {
      if ((node.attrs.style || '').includes('list-style:none')) {
        for (let i = 0, child; child = childs[i++];)
          if (child.name == 'li')
            child.name = 'div';
      } else if (node.name == 'ul') {
        // Nesting depth counted from the enclosing ul elements
        var floor = 1;
        for (let i = this.STACK.length; i--;)
          if (this.STACK[i].name == 'ul') floor++;
        if (floor != 1)
          for (let i = childs.length; i--;)
            childs[i].floor = floor;
      } else {
        for (let i = 0, num = 1, child; child = childs[i++];)
          if (child.name == 'li') {
            child.type = 'ol';
            // Marker text according to the list's type attribute
            child.num = ((num, type) => {
              if (type == 'a') return String.fromCharCode(97 + (num - 1) % 26);
              if (type == 'A') return String.fromCharCode(65 + (num - 1) % 26);
              if (type == 'i' || type == 'I') {
                // Roman numerals, wrapping after 99
                num = (num - 1) % 99 + 1;
                var one = ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX'],
                  ten = ['X', 'XX', 'XXX', 'XL', 'L', 'LX', 'LXX', 'LXXX', 'XC'],
                  res = (ten[Math.floor(num / 10) - 1] || '') + (one[num % 10 - 1] || '');
                if (type == 'i') return res.toLowerCase();
                return res;
              }
              return num;
            })(num++, attrs.type) + '.';
          }
      }
    }
    // Tables
    if (node.name == 'table') {
      var padding = parseFloat(attrs.cellpadding),
        spacing = parseFloat(attrs.cellspacing),
        border = parseFloat(attrs.border);
      if (node.c) {
        if (isNaN(padding)) padding = 2;
        if (isNaN(spacing)) spacing = 2;
      }
      if (border) attrs.style = `border:${border}px solid gray;${attrs.style || ''}`;
      if (node.flag && node.c) {
        // A flagged table with colspan / rowspan cells is rendered with a grid layout
        attrs.style = `${attrs.style || ''};${spacing ? `;grid-gap:${spacing}px` : ';border-left:0;border-top:0'}`;
        var row = 1,
          col = 1,
          colNum,
          trs = [],
          children = [],
          map = {};
        // Collect all rows, descending through wrappers
        (function f(ns) {
          for (var i = 0; i < ns.length; i++) {
            if (ns[i].name == 'tr') trs.push(ns[i]);
            else f(ns[i].children || []);
          }
        })(node.children)
        for (let i = 0; i < trs.length; i++) {
          for (let j = 0, td; td = trs[i].children[j]; j++) {
            if (td.name == 'td' || td.name == 'th') {
              // Skip grid slots occupied by a cell spanning down from a previous row
              while (map[row + '.' + col]) col++;
              var cell = {
                name: 'div',
                c: 1,
                attrs: {
                  style: (td.attrs.style || '') + (border ? `;border:${border}px solid gray` + (spacing ? '' :
                    ';border-right:0;border-bottom:0') : '') + (padding ? `;padding:${padding}px` : '')
                },
                children: td.children
              }
              if (td.attrs.colspan) {
                cell.attrs.style += ';grid-column-start:' + col + ';grid-column-end:' + (col + parseInt(td.attrs.colspan, 10));
                if (!td.attrs.rowspan) cell.attrs.style += ';grid-row-start:' + row + ';grid-row-end:' + (row + 1);
                // col now points at the last column covered by this cell
                col += parseInt(td.attrs.colspan, 10) - 1;
              }
              if (td.attrs.rowspan) {
                cell.attrs.style += ';grid-row-start:' + row + ';grid-row-end:' + (row + parseInt(td.attrs.rowspan, 10));
                if (!td.attrs.colspan) cell.attrs.style += ';grid-column-start:' + col + ';grid-column-end:' + (col + 1);
                // Reserve every column the cell covers in each following row it
                // spans, not only the last one, so later cells are placed past it.
                var spanCols = parseInt(td.attrs.colspan, 10) || 1;
                for (var k = 1; k < td.attrs.rowspan; k++)
                  for (var s = 0; s < spanCols; s++) map[(row + k) + '.' + (col - s)] = 1;
              }
              children.push(cell);
              col++;
            }
          }
          // The first row defines the number of grid columns
          if (!colNum) {
            colNum = col - 1;
            attrs.style += `;grid-template-columns:repeat(${colNum},auto)`
          }
          col = 1;
          row++;
        }
        node.children = children;
      } else {
        // Without a cellspacing attribute (and no default applied) spacing is NaN;
        // emitting it would produce the invalid declaration "border-spacing:NaNpx".
        if (!isNaN(spacing)) attrs.style = `border-spacing:${spacing}px;${attrs.style || ''}`;
        if (border || padding)
          (function f(ns) {
            for (var i = 0, n; n = ns[i]; i++) {
              if (n.name == 'th' || n.name == 'td') {
                if (border) n.attrs.style = `border:${border}px solid gray;${n.attrs.style || ''}`;
                if (padding) n.attrs.style = `padding:${padding}px;${n.attrs.style || ''}`;
              } else f(n.children || []);
            }
          })(childs)
      }
      if (this.options.autoscroll) {
        // Wrap the table in a scrolling div; `node` keeps its identity in the tree
        var table = Object.assign({}, node);
        node.name = 'div';
        node.attrs = {
          style: 'overflow:scroll'
        }
        node.children = [table];
      }
    }
    this.CssHandler.pop && this.CssHandler.pop(node);
    // Automatic compression of nested divs
    if (node.name == 'div' && !Object.keys(attrs).length && childs.length == 1 && childs[0].name == 'div')
      siblings[len - 1] = childs[0];
  }
  // State machine

  /**
   * Text state: waits for a '<' that opens a tag, a closing tag or a
   * comment-like construct. The pending text is flushed before switching.
   * A '<' followed by anything else is treated as ordinary text.
   *
   * @param {string} c Current character.
   */
  MpHtmlParser.prototype.Text = function(c) {
    if (c != '<') return;
    const isAlpha = ch => (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
    const lookahead = this.data[this.i + 1];
    if (isAlpha(lookahead)) {
      // "<x": opening tag
      this.setText();
      this.start = this.i + 1;
      this.state = this.TagName;
      return;
    }
    switch (lookahead) {
      case '/':
        this.setText();
        this.i += 1;
        if (isAlpha(this.data[this.i + 1])) {
          // "</x": closing tag
          this.start = this.i + 1;
          this.state = this.EndTag;
        } else {
          this.Comment();
        }
        break;
      case '!':
      case '?':
        // "<!" or "<?": comment, CDATA, doctype, processing instruction
        this.setText();
        this.Comment();
        break;
    }
  };
  /**
   * Skips a comment-like construct starting at the cursor. The terminator is
   * '-->' when '--' follows two characters after the cursor, ']]>' for
   * '[CDATA[', and '>' otherwise. The cursor ends on the last character of
   * the terminator, or at the end of the input when none is found; the
   * parser then returns to the Text state.
   */
  MpHtmlParser.prototype.Comment = function() {
    const from = this.i + 2;
    let terminator = '>';
    if (this.data.startsWith('--', from)) {
      terminator = '-->';
    } else if (this.data.startsWith('[CDATA[', from)) {
      terminator = ']]>';
    }
    const hit = this.data.indexOf(terminator, from);
    this.i = hit == -1 ? this.data.length : hit + terminator.length - 1;
    this.start = this.i + 1;
    this.state = this.Text;
  };
  /**
   * TagName state: collects the tag name until a blank character or the end
   * of the tag. After a blank, following blanks are skipped and the parser
   * either finishes the tag or starts reading attributes.
   *
   * @param {string} c Current character.
   */
  MpHtmlParser.prototype.TagName = function(c) {
    const endsOnBlank = Boolean(blankChar[c]);
    if (!endsOnBlank && !this.isClose()) return;
    this.tagName = this.section();
    if (endsOnBlank) {
      while (blankChar[this.data[this.i]]) {
        this.i += 1;
      }
      if (!this.isClose()) {
        this.start = this.i;
        this.state = this.AttrName;
        return;
      }
    }
    this.setNode();
  };
  /**
   * AttrName state: collects an attribute name until '=', a blank character
   * or the end of the tag. When an '=' follows (possibly after blanks) the
   * parser moves on to the attribute value; otherwise the attribute is
   * stored without a value.
   *
   * @param {string} c Current character.
   */
  MpHtmlParser.prototype.AttrName = function(c) {
    const nameEnded = c == '=' || blankChar[c] || this.isClose();
    if (!nameEnded) return;
    this.attrName = this.section();
    if (blankChar[c]) {
      // Move to the first non-blank character after the name
      do {
        this.i += 1;
      } while (blankChar[this.data[this.i]]);
    }
    if (this.data[this.i] != '=') {
      this.setAttr();
      return;
    }
    // Move to the first non-blank character after '='
    do {
      this.i += 1;
    } while (blankChar[this.data[this.i]]);
    this.start = this.i;
    // Step back one so the caller's increment lands on the first value character
    this.i -= 1;
    this.state = this.AttrValue;
  };
  /**
   * AttrValue state: reads an attribute value starting at the cursor and
   * stores it through setAttr().
   *
   * A quoted value runs to the matching quote; when the quote is never closed
   * the rest of the input is consumed and nothing is stored. An unquoted value
   * runs to the next blank character, the end of the tag, or the end of the input.
   *
   * @param {string} c Current character (first character of the value).
   */
  MpHtmlParser.prototype.AttrValue = function(c) {
    if (c == '"' || c == "'") {
      this.start++;
      if ((this.i = this.data.indexOf(c, this.i + 1)) == -1) return this.i = this.data.length;
      this.attrVal = this.section();
      this.i++;
    } else {
      // The length bound is required: on input that ends inside an unquoted
      // value (e.g. "<a href=abc") neither stop condition is ever met.
      for (; this.i < this.data.length && !blankChar[this.data[this.i]] && !this.isClose(); this.i++);
      this.attrVal = this.section();
    }
    this.setAttr();
  }
  /**
   * EndTag state: once the closing tag name is complete, closes the matching
   * open element together with everything opened inside it. A stray `</p>`
   * or `</br>` with no matching open element produces an empty node of that
   * name. The cursor then moves to the '>' that ends the closing tag, or to
   * the end of the input when there is none.
   *
   * @param {string} c Current character.
   */
  MpHtmlParser.prototype.EndTag = function(c) {
    const atNameEnd = blankChar[c] || c == '>' || c == '/';
    if (!atNameEnd) return;
    const closing = this.section().toLowerCase();
    const isOpen = this.STACK.some(entry => entry.name == closing);
    if (isOpen) {
      // Unwind the stack down to, and including, the matching element
      let top = this.STACK.pop();
      while (top.name != closing) {
        this.popNode(top);
        top = this.STACK.pop();
      }
      this.popNode(top);
    } else if (closing == 'p' || closing == 'br') {
      this.siblings().push({
        name: closing,
        attrs: {}
      });
    }
    const gt = this.data.indexOf('>', this.i);
    this.start = gt + 1;
    if (gt == -1) {
      this.i = this.data.length;
    } else {
      this.i = gt;
      this.state = this.Text;
    }
  };
  579. module.exports = MpHtmlParser;