const qs = require('qs'); const dayjs = require('dayjs'); const request = require('request-promise'); const cheerio = require('cheerio'); const unescape = require('lodash.unescape'); const errors = require('./errors'); const defaultConfig = { shouldReturnRawMeta: false, shouldReturnContent: true, shouldFollowTransferLink: true, shouldExtractMpLinks: false, shouldExtractTags: false, shouldExtractRepostMeta: false }; function getError(code) { return { done: false, code, msg: errors[code] }; } function normalizeUrl(url = '') { const parts = url.replace(/&/g, '&').split('?'); const querys = qs.stringify(qs.parse(parts[1])); return querys ? `${parts[0]}?${querys}` : parts[0]; } function getParameterByName(name, url) { name = name.replace(/[\[\]]/g, '\\$&'); const regex = new RegExp('[?&]' + name + '(=([^&#]*)|&|#|$)'); const results = regex.exec(url); if (!results) return null; if (!results[2]) return ''; return decodeURIComponent(results[2].replace(/\+/g, ' ')); } function parseUrlParams(url) { if (!url) return {}; const rs = require('querystring').parse(url.replace(/&/g, '&').split('?')[1]); return { mid: rs.mid * 1, idx: rs.idx * 1, sn: rs.sn, biz: rs.__biz }; } async function extract(input, options = {}) { const config = Object.assign({}, defaultConfig, options); const { shouldReturnRawMeta, shouldReturnContent, shouldFollowTransferLink, shouldExtractMpLinks, shouldExtractTags, shouldExtractRepostMeta } = config; if (!input) return getError(2001); let paramType = 'HTML'; let url = options.url ? normalizeUrl(options.url) : null; let rawUrl = null; let html = input; let type = 'post'; let hasCopyright = false; // Handle URL input if (/^http/.test(input)) { const normalized = normalizeUrl(input); if (!/https?:\/\/mp\.weixin\.qq\.com/.test(normalized) && !/https?:\/\/weixin\.sogou\.com/.test(normalized)) { return getError(2009); } paramType = 'URL'; rawUrl = normalized; if (!url) url = normalized; const host = /weixin\.sogou\.com/.test(normalized) ? 'weixin.sogou.com' : 'mp.weixin.qq.com'; try { html = await request({ uri: normalized, method: 'GET', headers: { 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 'Host': host } }); } catch (e) { return getError(1002); } } else { html = input.replace(/\\n/g, ''); } if (!html) return getError(1003); // Check for error pages if (html.includes('访问过于频繁') && !html.includes('js_content')) { return paramType === 'URL' ? getError(1004) : getError(2010); } if (html.includes('链接已过期') && !html.includes('js_content')) return getError(2002); if (html.includes('被投诉且经审核涉嫌侵权,无法查看')) return getError(2003); if (html.includes('该公众号已迁移')) { const match = html.match(/var\stransferTargetLink\s=\s'(.*?)';/); if (match && match[1]) { if (shouldFollowTransferLink) { return await extract(match[1]); } return { ...getError(1006), url: match[1] }; } return getError(2004); } if (html.includes('该内容已被发布者删除')) return getError(2005); if (html.includes('此内容因违规无法查看')) return getError(2006); if (html.includes('此内容发送失败无法查看')) return getError(2007); if (html.includes('由用户投诉并经平台审核,涉嫌过度营销')) return getError(2011); if (html.includes('此帐号已被屏蔽') && !html.includes('id="js_content"')) return getError(2012); if (html.includes('此帐号已自主注销') && !html.includes('id="js_content"')) return getError(2013); if (!html.includes('id="js_content"') && html.includes('此帐号处于帐号迁移流程中')) return getError(2015); if (html.includes('page_rumor') && !html.includes('id="js_content"')) return getError(2014); if (html.includes('投诉类型') && html.includes('冒名侵权')) return getError(2016); if (!html.includes('id="js_content"') && !html.includes('id=\\"js_content\\"')) { if (html.includes('cover_url')) { type = 'image'; } else { return getError(1000); } } // Prepare HTML html = html.replace('>微信号', ' id="append-account-alias">微信号') .replace('>功能介绍', ' id="append-account-desc">功能介绍') .replace(/\n\s+