website-to-remarkable/index.js

462 lines
41 KiB
JavaScript
Raw Normal View History

2020-03-22 13:51:40 +00:00
require("dotenv").config();
const fs = require("fs");
const http = require('http');
const path = require('path');
const axios = require('axios');
const JSZip = require('jszip');
const puppeteer = require('puppeteer');
const uuid4 = require('uuid4');
2020-03-22 13:51:40 +00:00
/** Content-Type values for static assets, keyed by lower-case file extension. */
const STATIC_CONTENT_TYPES = {
  '.js': 'application/javascript',
  '.json': 'application/json',
  '.png': 'image/png',
};

/**
 * Maps the path component of a request URL onto a file below `root`.
 *
 * @param {string} root - Directory that static files are served from.
 * @param {string} pathname - URL path (no query string), still percent-encoded.
 * @returns {string|null} Absolute file path, or `null` when the request must
 *   not be served (malformed encoding, NUL byte, hidden file, or a path that
 *   would resolve outside `root`).
 */
function resolveStaticPath(root, pathname) {
  let decoded;
  try {
    decoded = decodeURIComponent(pathname);
  } catch (ex) {
    return null;
  }
  if (decoded.includes('\0')) {
    return null;
  }
  const relative = decoded === '/' ? '/index.html' : decoded;
  // Rejecting every dot-prefixed segment covers both `..` traversal (including
  // the `..%2f` form that URL parsing does not normalise) and hidden files
  // such as the `.env` that dotenv reads the reMarkable token from.
  if (relative.split(/[\\/]/).some((segment) => segment.startsWith('.'))) {
    return null;
  }
  const base = path.resolve(root);
  const resolved = path.resolve(base, `.${relative}`);
  if (!resolved.startsWith(base + path.sep)) {
    return null;
  }
  return resolved;
}

/**
 * Chooses the sender for a target URL: EPUB and PDF links are forwarded as
 * files, everything else is rendered as a web page.
 *
 * The full-URL suffix checks are the original rule; the path-only checks
 * additionally recognise links such as `paper.PDF` or `paper.pdf?download=1`.
 *
 * @param {URL} website - Target URL.
 * @returns {Function} One of `sendEpub`, `sendPDF` or `sendPage`.
 */
function pickSender(website) {
  const href = website.toString();
  if (href.endsWith('.epub')) {
    return sendEpub;
  }
  if (href.endsWith('.pdf')) {
    return sendPDF;
  }
  const pathname = website.pathname.toLowerCase();
  if (pathname.endsWith('.epub')) {
    return sendEpub;
  }
  if (pathname.endsWith('.pdf')) {
    return sendPDF;
  }
  return sendPage;
}

/**
 * Streams a file from disk as the response body.
 * Answers 404 with a small JSON body when the file cannot be read; only the
 * error code is exposed so absolute filesystem paths do not leak.
 */
function sendFile(res, filePath, status, contentType) {
  fs.readFile(filePath, (err, data) => {
    if (err) {
      res.writeHead(404);
      res.end(JSON.stringify({ code: err.code }));
      return;
    }
    res.writeHead(status, { 'Content-Type': contentType });
    res.end(data);
  });
}

/** Serves a static asset from the directory of this script (`/` maps to `/index.html`). */
function serveStatic(res, pathname) {
  const filePath = resolveStaticPath(__dirname, pathname);
  if (filePath === null) {
    res.writeHead(404);
    res.end(JSON.stringify({ code: 'ENOENT' }));
    return;
  }
  const extension = path.extname(filePath).toLowerCase();
  sendFile(res, filePath, 200, STATIC_CONTENT_TYPES[extension] || 'text/html');
}

/**
 * HTTP front end.
 *
 * - `?website=<url>`: fetches the URL, sends it to the reMarkable and answers
 *   with `success.html` (200) or `failure.html` (500). A value that is not an
 *   absolute http(s) URL is answered with `failure.html` and status 400.
 * - anything else: served as a static file from the script directory.
 *
 * Every path ends the response; unexpected errors (including a sender that
 * rejects) are logged and answered with a bare 500 instead of leaving the
 * request hanging on an unhandled rejection.
 */
const server = http.createServer(async (req, res) => {
  try {
    const incomingURL = new URL(`http://localhost:8000${req.url}`);
    const requested = incomingURL.searchParams.get("website");
    if (!requested) {
      serveStatic(res, incomingURL.pathname);
      return;
    }

    let website = null;
    try {
      website = new URL(requested);
    } catch (ex) {
      website = null;
    }
    // Only web URLs are fetched; `file:`, `javascript:` and similar schemes
    // would let a caller point the renderer at local resources.
    if (website === null || (website.protocol !== 'http:' && website.protocol !== 'https:')) {
      console.log(`Rejected website parameter '${requested}'`);
      sendFile(res, __dirname + "/failure.html", 400, 'text/html');
      return;
    }

    console.log(`Fetching '${website.toString()}'`);
    const delivered = await pickSender(website)(website);
    if (delivered) {
      sendFile(res, __dirname + "/success.html", 200, 'text/html');
    } else {
      sendFile(res, __dirname + "/failure.html", 500, 'text/html');
    }
  } catch (ex) {
    console.log(ex);
    if (!res.headersSent) {
      res.writeHead(500);
    }
    res.end();
  }
});
// Start accepting HTTP connections on port 8000.
server.listen(8000);
2020-04-08 10:19:26 +00:00
/**
 * Derives a document title from the last path segment of a URL: the file
 * name without its extension, percent-decoded.
 *
 * Only the path is inspected, so dots inside a query string or fragment
 * cannot shift the cut-off point. A name whose percent-encoding is malformed
 * is returned undecoded instead of throwing.
 *
 * @param {URL|string} website - Absolute URL of the file.
 * @returns {string} Title to show on the device (host name if the path has no file name).
 * @throws {TypeError} If `website` is not a valid absolute URL.
 */
function titleFromURL(website) {
  const { pathname, hostname } = new URL(website.toString());
  const fileName = pathname.substring(pathname.lastIndexOf("/") + 1);
  const dot = fileName.lastIndexOf(".");
  const stem = dot > 0 ? fileName.substring(0, dot) : fileName;
  if (stem === "") {
    return hostname;
  }
  try {
    return decodeURIComponent(stem);
  } catch (ex) {
    return stem;
  }
}

/**
 * Downloads a file and forwards it to the reMarkable, retrying the
 * download/upload pair on failure.
 *
 * An attempt is made for the incoming `tries` value and, after each failure,
 * for every following value up to and including 5.
 *
 * @param {URL|string} website - Absolute URL of the file.
 * @param {string} fileType - Document type passed on to `sendToRemarkable`.
 * @param {number} tries - Number of attempts already used.
 * @returns {Promise<boolean>} `true` once an upload succeeded, `false` when all attempts failed.
 */
async function downloadAndSend(website, fileType, tries) {
  let title;
  try {
    // Worked out once, up front: a bad URL is not something a retry can fix.
    title = titleFromURL(website);
  } catch (ex) {
    console.log(ex);
    return false;
  }

  let attempt = tries;
  for (;;) {
    try {
      const response = await axios.get(website.toString(), {
        responseType: 'arraybuffer'
      });
      await sendToRemarkable(title, Buffer.from(response.data, 'binary'), fileType);
      return true;
    } catch (ex) {
      console.log(ex);
      if (attempt >= 5) {
        return false;
      }
      attempt += 1;
    }
  }
}

/**
 * Downloads the PDF at `website` and uploads it to the reMarkable.
 *
 * @param {URL|string} website - Absolute URL of the PDF.
 * @param {number} [tries=0] - Number of attempts already used.
 * @returns {Promise<boolean>} Whether the document was delivered.
 */
async function sendPDF(website, tries = 0) {
  return downloadAndSend(website, 'pdf', tries);
}

/**
 * Downloads the EPUB at `website` and uploads it to the reMarkable.
 *
 * @param {URL|string} website - Absolute URL of the EPUB.
 * @param {number} [tries=0] - Number of attempts already used.
 * @returns {Promise<boolean>} Whether the document was delivered.
 */
async function sendEpub(website, tries = 0) {
  return downloadAndSend(website, 'epub', tries);
}
2020-03-22 13:51:40 +00:00
/**
 * Renders a web page to an A4 PDF with headless Chromium and sends it to the
 * reMarkable (or writes `<title>.pdf` locally when DEBUG is "true").
 *
 * Steps: open the page, inject a print stylesheet, rewrite the DOM inside the
 * page (site-specific clean-up for SCP wiki hosts, Readability extraction
 * otherwise), append a QR image, print to PDF, deliver. For SCP article URLs
 * the linked tales are then sent as well.
 *
 * @param {URL|string} website - Page to render; only `toString()` is used.
 * @param {number} [tries=0] - Attempts already used; a failure is retried while this is below 5.
 * @returns {Promise<boolean>} `true` when the page was delivered, `false` once retries are exhausted.
 *
 * NOTE(review): the bare timestamp lines interleaved below are not JavaScript
 * (they look like residue of a blame view) and need to be removed; the ones
 * inside the CSS template literal currently become part of the injected
 * stylesheet text.
 */
async function sendPage(website, tries = 0) {
// The browser is launched before the try block, so a failed launch rejects
// this call directly instead of going through the retry path.
const browser = await puppeteer.launch({
2020-04-08 10:19:26 +00:00
ignoreHTTPSErrors: true,
2020-03-22 13:51:40 +00:00
executablePath: process.env.CHROMIUM_PATH,
args: ['--disable-dev-shm-usage', '--no-sandbox']
});
try {
const page = await browser.newPage();
2020-03-26 17:24:53 +00:00
// Navigate with a Google referer header.
await page.goto(website.toString(), { referer: "https://www.google.com/" });
2020-03-22 13:51:40 +00:00
const title = await page.title()
console.log("Page loaded. Title - " + title)
2020-04-08 10:19:26 +00:00
// Print stylesheet: large black-on-white text, link targets spelled out after
// each link, plus rules hiding SCP wiki page furniture.
await page.addStyleTag({ content: `
body {
font-family: Helvetica, Georgia, serif;
font-size: 20pt;
line-height: 1.2em;
2020-04-08 10:19:26 +00:00
background: none;
color: black;
text-align: left;
}
h1, h2, h3, h4, h5 {
page-break-after: avoid;
font-weight: bold;
margin-top: 4px;
}
h2, h3, h4, h5 {
padding-top: 16px;
}
b, strong {
font-weight: bold;
}
u {
text-decoration: underline;
}
i, em {
font-style: italic;
2020-04-08 10:19:26 +00:00
}
table, figure, ul, img {
page-break-inside: avoid;
}
a {
color: black;
}
a:after {
content: " [" attr(href) "] ";
font-size: 0.7em;
}
2020-06-07 16:11:03 +00:00
a[href^="#"]:after, a[href^="/"]:after, a[href^="javascript:"]:after {
2020-04-08 10:19:26 +00:00
content: "";
}
blockquote {
margin: 10px 2px;
2020-06-27 17:08:33 +00:00
line-height: 1.5em;
2020-04-08 10:19:26 +00:00
border: 0;
border-left: 8px solid grey;
padding-left: 8px;
}
table {
width: 100%;
margin: 4px;
border: 1px solid black;
}
table td, table th {
border: 1px solid black;
padding: 2px
}
table thead, table thead th {
font-weight: bold;
border-bottom-width: 2px;
2020-04-08 10:19:26 +00:00
}
code {
background: none !important;
font-family: monospace;
}
pre {
overflow: visible;
white-space: pre-wrap;
}
2020-04-08 10:19:26 +00:00
ul li {
list-style: disc !important;
margin-left: 50px;
2020-04-08 10:19:26 +00:00
}
h1 {
font-size: 1.7em;
}
p {
margin-bottom: 12px;
}
header {
margin-bottom: 14px;
border-bottom: 8px solid black;
text-align: center;
}
header blockquote {
border: 0 !important;
}
/* SCP-Wiki */
.creditRate,
.collapsible-block-folded,
.collapsible-block-unfolded-link,
2020-06-11 16:31:24 +00:00
.footer-wikiwalk-nav,
.licensebox,
.translation_block,
#u-author_block,
.u-faq,
2020-06-27 17:08:33 +00:00
.info-container,
.diamond-part,
[class*='licensebox'] {
display: none !important;
}
.collapsible-block-unfolded {
display: block !important;
}
2020-06-11 16:31:24 +00:00
.anom-bar-container {
max-width: 80% !important;
font-size: 10pt;
}
2020-06-27 17:08:33 +00:00
.anom-bar-container a:after {
content: "" !important;
}
.disrupt-class:before,
.disrupt-class:after,
.risk-class:before,
.risk-class:after,
.anom-bar-container .main-class:before,
.anom-bar-container .main-class:after {
display: none !important;
content: "" !important;
border: none !important;
}
2020-06-11 16:31:24 +00:00
2020-04-08 10:19:26 +00:00
`});
// Everything inside this callback runs in the page. The promise it returns
// settles only when the QR image appended at the end has loaded or failed.
await page.evaluate(async () => {
return await new Promise(resolve => {
// Inlined, minified library code defining isProbablyReaderable() and
// Readability (appears to be Mozilla's Readability.js - confirm).
// NOTE(review): in this copy the embedded text breaks off mid-definition after
// `_replaceNodeTags`, and one string literal is split across two lines; the
// full library source must be restored before this can run.
var REGEXPS={unlikelyCandidates:/-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,okMaybeItsACandidate:/and|article|body|column|content|main|shadow/i};function isNodeVisible(a){return(!a.style||"none"!=a.style.display)&&!a.hasAttribute("hidden")&&(!a.hasAttribute("aria-hidden")||"true"!=a.getAttribute("aria-hidden")||a.className&&a.className.indexOf&&-1!==a.className.indexOf("fallback-image"))}function isProbablyReaderable(a,b){b||(b=isNodeVisible);var c=a.querySelectorAll("p, pre"),d=a.querySelectorAll("div > br");if(d.length){var e=new Set(c);[].forEach.call(d,function(a){e.add(a.parentNode)}),c=Array.from(e)}var f=0;return[].some.call(c,function(a){if(!b(a))return!1;var c=a.className+" "+a.id;if(REGEXPS.unlikelyCandidates.test(c)&&!REGEXPS.okMaybeItsACandidate.test(c))return!1;if(a.matches("li p"))return!1;var d=a.textContent.trim().length;return!(140>d)&&(f+=Math.sqrt(d-140),!!(20<f))})}"object"==typeof exports&&(exports.isProbablyReaderable=isProbablyReaderable);
function Readability(e,t){if(t&&t.documentElement)e=t,t=arguments[2];else if(!e||!e.documentElement)throw new Error("First argument to Readability constructor should be a document object.");t=t||{},this._doc=e,this._docJSDOMParser=this._doc.firstChild.__JSDOMParser__,this._articleTitle=null,this._articleByline=null,this._articleDir=null,this._articleSiteName=null,this._attempts=[],this._debug=!!t.debug,this._maxElemsToParse=t.maxElemsToParse||this.DEFAULT_MAX_ELEMS_TO_PARSE,this._nbTopCandidates=t.nbTopCandidates||this.DEFAULT_N_TOP_CANDIDATES,this._charThreshold=t.charThreshold||this.DEFAULT_CHAR_THRESHOLD,this._classesToPreserve=this.CLASSES_TO_PRESERVE.concat(t.classesToPreserve||[]),this._keepClasses=!!t.keepClasses,this._flags=this.FLAG_STRIP_UNLIKELYS|this.FLAG_WEIGHT_CLASSES|this.FLAG_CLEAN_CONDITIONALLY;var a;this._debug?(a=function(t){var e=t.nodeName+" ";if(t.nodeType==t.TEXT_NODE)return e+"(\""+t.textContent+"\")";var a=t.className&&"."+t.className.replace(/ /g,"."),n="";return t.id?n="(#"+t.id+a+")":a&&(n="("+a+")"),e+n},this.log=function(){if("undefined"!=typeof dump){var e=Array.prototype.map.call(arguments,function(e){return e&&e.nodeName?a(e):e}).join(" ");dump("Reader: (Readability) "+e+"\n")}else if("undefined"!=typeof console){var t=["Reader: (Readability) 
"].concat(arguments);console.log.apply(console,t)}}):this.log=function(){}}Readability.prototype={FLAG_STRIP_UNLIKELYS:1,FLAG_WEIGHT_CLASSES:2,FLAG_CLEAN_CONDITIONALLY:4,ELEMENT_NODE:1,TEXT_NODE:3,DEFAULT_MAX_ELEMS_TO_PARSE:0,DEFAULT_N_TOP_CANDIDATES:5,DEFAULT_TAGS_TO_SCORE:"section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","),DEFAULT_CHAR_THRESHOLD:500,REGEXPS:{unlikelyCandidates:/-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,okMaybeItsACandidate:/and|article|body|column|content|main|shadow|page-content/i,positive:/article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story|page-content/i,negative:/hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,extraneous:/print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,byline:/byline|author|dateline|writtenby|p-author/i,replaceFonts:/<(\/?)font[^>]*>/gi,normalize:/\s{2,}/g,videos:/\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i,shareElements:/(\b|_)(share|sharedaddy)(\b|_)/i,nextLink:/(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,prevLink:/(prev|earl|old|new|<|«)/i,whitespace:/^\s*$/,hasContent:/\S$/,srcsetUrl:/(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))/g,b64DataUrl:/^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i},DIV_TO_P_ELEMS:["A","BLOCKQUOTE","DL","DIV","IMG","OL","P","PRE","TABLE","UL","SELECT"],ALTER_TO_DIV_EXCEPTIONS:["DIV","ARTICLE","SECTION","P"],PRESENTATIONAL_ATTRIBUTES:["align","background","bgcolor","border","cellpadding","cellspacing","frame","hspace","rules","style","valign","vspace"],DEPRECATED_SIZE_ATTRIBUTE_ELEMS:["TABLE",
"TH","TD","HR","PRE"],PHRASING_ELEMS:["ABBR","AUDIO","B","BDO","BR","BUTTON","CITE","CODE","DATA","DATALIST","DFN","EM","EMBED","I","IMG","INPUT","KBD","LABEL","MARK","MATH","METER","NOSCRIPT","OBJECT","OUTPUT","PROGRESS","Q","RUBY","SAMP","SCRIPT","SELECT","SMALL","SPAN","STRONG","SUB","SUP","TEXTAREA","TIME","VAR","WBR"],CLASSES_TO_PRESERVE:["page"],HTML_ESCAPE_MAP:{lt:"<",gt:">",amp:"&",quot:"\"",apos:"'"},_postProcessContent:function(e){this._fixRelativeUris(e),this._keepClasses||this._cleanClasses(e)},_removeNodes:function(e,t){if(this._docJSDOMParser&&e._isLiveNodeList)throw new Error("Do not pass live node lists to _removeNodes");for(var a=e.length-1;0<=a;a--){var n=e[a],l=n.parentNode;l&&(!t||t.call(this,n,a,e))&&l.removeChild(n)}},_replaceNodeTags:function(e,t){if(t
// Fix all `/...` type links
// (reading `node.href` yields the resolved absolute URL, which is written back
// so the stylesheet's a[href^="/"] rule no longer blanks the printed target)
[...document.querySelectorAll('a[href^="/"]')].forEach(node => node.href = node.href);
// Remove medium blur images
[...document.querySelectorAll('img[src^="https://miro.medium.com/max/60/"]')].forEach(node => node.style.display = "none")
2020-04-08 10:19:26 +00:00
2020-09-01 12:36:35 +00:00
// SCP wiki hosts get hand-written clean-up instead of Readability.
if (window.location.hostname.includes("scp-wiki") || window.location.hostname.includes("scpwiki")) {
2020-06-07 15:49:19 +00:00
// Only the first unfolded collapsible block is forced visible here.
if (document.querySelector('.collapsible-block-unfolded')) {
document.querySelector('.collapsible-block-unfolded').style.display = "block";
}
2020-06-11 16:31:24 +00:00
// Replace each footnote marker with the text of the element whose id is the
// marker's id with "ref" removed.
// NOTE(review): getElementById() returns null when no such element exists,
// which would throw here and fail the whole evaluate call.
[...document.querySelectorAll('a.footnoteref')].forEach(ref => {
ref.innerText = document.getElementById(ref.id.replace("ref", "")).innerText;
});
2020-09-01 12:36:35 +00:00
[...document.querySelectorAll('details')].forEach(details => details.setAttribute('open', ''));
// Drop scripts, iframes and wiki chrome (rating box, navigation, licence, TOC, icons).
[...document.querySelectorAll('body script,body iframe,.page-rate-widget-box,.page-rate-widget-box,.footer-wikiwalk-nav,.desktop-only,.info-container,.licensebox22,#print-options,#print-head,#license-area,#container>hr,#page-info,.collapsible-block-link,.yui-nav,#toc,.creditRate,#u-credit-view,#ncmp__tool,#wiki-tab-0-1,.ncmp__normalise,img[src^="http://www.wikidot.com/avatar.php"],#odialog-hovertips,img[alt="facility-texture.png"],span.siteIcon,span.areaIcon')].forEach(node => {
node.remove();
});
2020-06-07 16:08:19 +00:00
// Reduce the page to its title followed by the article content.
document.body.innerHTML = `<h1>${document.getElementById('page-title').innerHTML}</h1>` + document.getElementById('page-content').innerHTML;
2020-09-15 14:36:40 +00:00
} else {
// Other sites: when the page looks like an article, replace the body with the
// Readability extraction under a header of title, byline and <time datetime>.
// Any error in this step is logged and the page is printed as it is.
try {
if (isProbablyReaderable(document.cloneNode(true))) {
var documentClone = document.cloneNode(true);
var article = new Readability(documentClone).parse();
var postedDate = document.querySelector('time[datetime]');
var content = `
<header>
<h1>${article.title}</h1>
${article.byline ? `<blockquote>${article.byline}</blockquote>` : ""}
${postedDate && postedDate.getAttribute('datetime') ? `<blockquote>${postedDate.getAttribute('datetime')}</blockquote>` : ""}
</header>
` + article.content;
document.body.innerHTML = content;
}
} catch (ex) {
console.log("Failed to detect if readable")
}
2020-03-22 13:51:40 +00:00
}
2020-04-08 10:19:26 +00:00
2020-04-17 13:52:03 +00:00
[...document.querySelectorAll('details')].forEach(details => details.setAttribute('open', ''));
// Un-pin fixed and sticky elements so they appear once in the document flow.
[...document.querySelectorAll('*')].forEach(node => {
const pos = window.getComputedStyle(node).getPropertyValue("position");
if (pos == "fixed" || pos == "sticky") {
node.style.position = "unset";
}
});
2020-04-08 10:19:26 +00:00
// Put a QR image for the page address in the top-right corner; the outer
// promise resolves when it loads, or after it is removed on load failure.
// NOTE(review): the page URL is interpolated without encodeURIComponent, so a
// URL with its own query string or fragment is probably cut short - confirm.
var im = document.createElement("img");
im.src = `https://qr.cluster.fun/?website=${window.location.toString()}`;
im.style = "position:absolute;top:0;right:0;z-index:99999999";
im.onload = resolve;
im.onerror = () => {
document.body.removeChild(im);
resolve();
}
document.body.appendChild(im);
2020-03-22 13:51:40 +00:00
})
2020-04-08 10:19:26 +00:00
});
2020-03-22 13:51:40 +00:00
2020-04-08 10:19:26 +00:00
const myPDF = await page.pdf({ format: 'A4', margin: {top: 40, bottom: 40, left: 40, right: 40} });
2020-03-22 13:51:40 +00:00
console.log("Saved to PDF")
2020-06-27 17:08:33 +00:00
// DEBUG=true writes the PDF into the working directory instead of uploading.
// NOTE(review): the page title is used as the file name unsanitised.
if (process.env.DEBUG == "true") {
fs.writeFileSync(title+'.pdf', myPDF);
} else {
await sendToRemarkable(title, myPDF);
}
2020-03-22 13:51:40 +00:00
2020-06-27 18:10:03 +00:00
// If SCP, try and fetch associated tales
let scpMatch = website.toString().match(/^https?:\/\/.*scp-wiki.*\/scp-([0-9]+)$/i)
if (scpMatch) {
let scp = scpMatch[1];
// Series index page: first digit of the number plus one, or 1 for numbers below 1000.
let series = Number(scp[0]) + 1;
2020-08-20 03:10:45 +00:00
if (Number(scp) < 1000) {
series = 1;
}
2020-06-27 18:10:03 +00:00
await page.goto(`http://www.scp-wiki.net/scp-series-${series}-tales-edition`);
// Tale links are the anchors in the list directly following this SCP's entry.
let tales = await page.$$(`a[href="/scp-${scp}"]+ul a`);
if (tales.length) {
for (let tale of tales) {
let link = await tale.evaluate(a => a.href);
// NOTE(review): not awaited - each tale is rendered in the background with
// its own browser and its result is ignored. `link` is a string, not a URL.
sendPage(link)
}
}
}
2020-03-22 13:51:40 +00:00
return true;
} catch (ex) {
console.log(ex);
// NOTE(review): the retry runs to completion before `finally` closes this
// attempt's browser, so failed attempts keep their browsers open meanwhile.
if (tries < 5) {
return await sendPage(website, ++tries);
} else {
return false;
}
} finally {
await browser.close();
}
}
2020-04-08 10:19:26 +00:00
/**
 * Uploads a document to the reMarkable cloud.
 *
 * Sequence: exchange the device token (`REMARKABLE_TOKEN`) for a user token,
 * look up the storage host, request an upload slot, upload a zip holding the
 * document plus its `.content` / `.pagedata` companions, then publish the
 * metadata that makes the document visible under `title`.
 *
 * Tokens and the signed upload URL are credentials, so progress messages
 * never include them, and on failure only the HTTP status and response body
 * are logged (the full response object can include the request config and
 * its Authorization header).
 *
 * @param {string} title - Name shown on the device.
 * @param {Buffer} myPDF - Raw document bytes (PDF or EPUB).
 * @param {string} [fileType="pdf"] - Extension / type written into the archive.
 * @returns {Promise<void>}
 * @throws Rethrows whatever error a request raised, after logging a summary.
 */
async function sendToRemarkable(title, myPDF, fileType = "pdf") {
  const bearer = (credential) => ({ 'Authorization': `Bearer ${credential}` });
  try {
    if (!process.env.REMARKABLE_TOKEN) {
      throw new Error("REMARKABLE_TOKEN is not set");
    }

    // Refresh token
    let response = await axios.post(
      "https://my.remarkable.com/token/json/2/user/new",
      {},
      { headers: bearer(process.env.REMARKABLE_TOKEN) }
    );
    const token = response.data;
    console.log("Refreshed token");

    // Get storage endpoint
    response = await axios.get(
      "https://service-manager-production-dot-remarkable-production.appspot.com/service/json/1/document-storage?environment=production&group=auth0%7C5a68dc51cb30df3877a1d7c4&apiVer=2",
      { headers: bearer(token) }
    );
    const storageHost = response.data.Host;
    console.log(`Got storage host: ${storageHost}`);

    // Generate upload request
    const ID = uuid4();
    response = await axios.put(
      `https://${storageHost}/document-storage/json/2/upload/request`,
      [{
        "ID": ID,
        "Type": "DocumentType",
        "Version": 1
      }],
      { headers: bearer(token) }
    );
    const uploadURL = response.data[0].BlobURLPut;
    console.log("Got upload URL");

    // Build zip to upload
    const zip = new JSZip();
    zip.file(`${ID}.content`, JSON.stringify({
      extraMetadata: {},
      fileType: fileType,
      lastOpenedPage: 0,
      lineHeight: -1,
      margins: 180,
      pageCount: 0,
      textScale: 1,
      transform: {},
    }));
    zip.file(`${ID}.pagedata`, []);
    zip.file(`${ID}.${fileType}`, myPDF);
    const zipContent = await zip.generateAsync({ type: 'nodebuffer' });

    // Upload zip
    await axios.put(
      uploadURL,
      zipContent,
      {
        headers: {
          'Content-Type': '',
          ...bearer(token),
        }
      }
    );
    console.log("Uploaded");

    // Populate metadata; both timestamps describe the same instant.
    const now = new Date().toISOString();
    await axios.put(
      `https://${storageHost}/document-storage/json/2/upload/update-status`,
      [{
        ID: ID,
        deleted: false,
        lastModified: now,
        ModifiedClient: now,
        metadatamodified: false,
        modified: false,
        parent: '',
        pinned: false,
        synced: true,
        type: "DocumentType",
        version: 1,
        // Left exactly as the original spelled it; presumably this matches the
        // field name the service expects - confirm before "correcting" it.
        VissibleName: title,
      }],
      { headers: bearer(token) }
    );
    console.log("Upload complete")
  } catch (error) {
    if (error.response) {
      const { status, data } = error.response;
      console.error(`Upload to reMarkable failed: ${error.message}`, { status, data });
    } else {
      console.error(`Upload to reMarkable failed: ${error.message}`);
    }
    throw error;
  }
}