website-to-remarkable/index.js

314 lines
43 KiB
JavaScript
Raw Normal View History

2020-03-22 13:51:40 +00:00
require("dotenv").config();
const http = require('http');
const fs = require("fs");
2020-04-08 10:19:26 +00:00
const axios = require('axios');
const uuid4 = require('uuid4');
2020-03-22 13:51:40 +00:00
const puppeteer = require('puppeteer');
2020-04-08 10:19:26 +00:00
const JSZip = require('jszip');
2020-03-22 13:51:40 +00:00
/**
 * HTTP front end of the service.
 *
 * - `GET /?website=<url>` converts the given http(s) URL (web page or PDF) and
 *   uploads it, then answers with `success.html` (200) or `failure.html`
 *   (500 when the conversion failed, 400 when the URL itself is unusable).
 * - Any other request is answered with the matching static file that lives
 *   next to this script (`/` maps to `/index.html`).
 */
const server = http.createServer(async (req, res) => {
  try {
    await handleRequest(req, res);
  } catch (err) {
    // A rejection escaping an async listener would otherwise be unhandled and
    // leave the client waiting forever.
    console.log(err);
    if (!res.headersSent) {
      res.writeHead(500);
    }
    res.end();
  }
});

/** Routes one request to either the conversion flow or the static file flow. */
async function handleRequest(req, res) {
  let incomingURL;
  try {
    // req.url is only a path + query; the origin is a placeholder for parsing.
    incomingURL = new URL(`http://localhost:8000${req.url}`);
  } catch (err) {
    res.writeHead(400);
    res.end("Bad request");
    return;
  }

  const requested = incomingURL.searchParams.get("website");
  if (requested) {
    const website = parseWebsite(requested);
    if (website === null) {
      sendFile(res, "/failure.html", 400, "text/html");
      return;
    }
    console.log(`Fetching '${website.toString()}'`);

    // Look at the path only, so "doc.PDF" and "doc.pdf?download=1" are still
    // treated as PDFs while "page?file=x.pdf" is not.
    const fn = website.pathname.toLowerCase().endsWith(".pdf") ? sendPDF : sendPage;
    let delivered = false;
    try {
      delivered = await fn(website);
    } catch (err) {
      console.log(err);
    }
    if (delivered) {
      sendFile(res, "/success.html", 200, "text/html");
    } else {
      sendFile(res, "/failure.html", 500, "text/html");
    }
    return;
  }

  // Static files. The URL parser has already collapsed "." and ".." segments,
  // so the path cannot climb out of this directory; the query string is dropped.
  const pathname = incomingURL.pathname === "/" ? "/index.html" : incomingURL.pathname;
  // Never serve dotfiles: `.env` sits in this directory and holds the API token.
  if (pathname.split("/").some((segment) => segment.startsWith("."))) {
    res.writeHead(404);
    res.end("Not found");
    return;
  }
  sendFile(res, pathname, 200, contentTypeFor(pathname));
}

/** Returns the parsed URL when `raw` is an absolute http(s) URL, otherwise null. */
function parseWebsite(raw) {
  let parsed;
  try {
    parsed = new URL(raw);
  } catch (err) {
    return null;
  }
  if (parsed.protocol !== "http:" && parsed.protocol !== "https:") {
    return null;
  }
  return parsed;
}

/** Picks the Content-Type header for a static file from its extension. */
function contentTypeFor(pathname) {
  if (pathname.endsWith(".js")) {
    return "application/javascript";
  }
  if (pathname.endsWith(".json")) {
    return "application/json";
  }
  if (pathname.endsWith(".png")) {
    return "image/png";
  }
  return "text/html";
}

/**
 * Streams the file at `relativePath` (relative to this script's directory) to
 * the client with the given status and content type; answers 404 when the
 * file cannot be read. The fs error is not echoed back because it contains
 * the absolute path on the server.
 */
function sendFile(res, relativePath, statusCode, contentType) {
  fs.readFile(__dirname + relativePath, (err, data) => {
    if (err) {
      res.writeHead(404);
      res.end("Not found");
      return;
    }
    res.writeHead(statusCode, { "Content-Type": contentType });
    res.end(data);
  });
}
// Start accepting connections on port 8000, the same port that appears in the
// placeholder origin the request handler uses when parsing `req.url`.
server.listen(8000);
2020-04-08 10:19:26 +00:00
/**
 * Downloads the PDF at `website` and uploads it via `sendToRemarkable`.
 *
 * The document title is the file name from the URL path, without its
 * extension and with percent-escapes decoded ("My%20Paper.pdf" -> "My Paper").
 *
 * @param {URL|string} website - Location of the PDF; only `toString()` is used.
 * @param {number} [tries=0] - Number of attempts already made (internal).
 * @returns {Promise<boolean>} true once uploaded; false after the first
 *   attempt plus five retries have all failed.
 */
async function sendPDF(website, tries = 0) {
  const href = website.toString();
  try {
    const response = await axios.get(href, {
      responseType: 'arraybuffer'
    });
    await sendToRemarkable(pdfTitleFromHref(href), Buffer.from(response.data, 'binary'));
    return true;
  } catch (ex) {
    console.log(ex);
    if (tries < 5) {
      return sendPDF(website, tries + 1);
    }
    return false;
  }
}

/** Derives a human-readable document title from the file name in `href`. */
function pdfTitleFromHref(href) {
  // Ignore query string and fragment so "a.pdf?dl=1" is still named "a".
  const pathOnly = href.split(/[?#]/, 1)[0];
  const fileName = pathOnly.substring(pathOnly.lastIndexOf("/") + 1);
  const dot = fileName.lastIndexOf(".");
  const stem = dot > 0 ? fileName.substring(0, dot) : fileName;
  try {
    return decodeURIComponent(stem);
  } catch (err) {
    // A stray "%" (e.g. "100%.pdf") is not valid percent-encoding; keep it raw.
    return stem;
  }
}
2020-03-22 13:51:40 +00:00
/**
 * Renders the web page at `website` to an A4 PDF in headless Chromium and
 * passes the PDF, together with the page title, to `sendToRemarkable`.
 *
 * @param website - Page to render; only `website.toString()` is used here.
 * @param tries - Number of attempts already made (defaults to 0).
 * @returns {Promise<boolean>} true after a successful upload; false once the
 *   first attempt and five retries have all thrown.
 *
 * NOTE(review): `puppeteer.launch` sits outside the try block, so a launch
 * failure rejects the returned promise instead of being retried.
 */
async function sendPage(website, tries = 0) {
const browser = await puppeteer.launch({
2020-04-08 10:19:26 +00:00
ignoreHTTPSErrors: true,
2020-03-22 13:51:40 +00:00
// Chromium binary location is taken from the CHROMIUM_PATH environment variable.
executablePath: process.env.CHROMIUM_PATH,
args: ['--disable-dev-shm-usage', '--no-sandbox']
});
try {
const page = await browser.newPage();
2020-03-26 17:24:53 +00:00
// The navigation request carries a Google referer.
await page.goto(website.toString(), { referer: "https://www.google.com/" });
2020-03-22 13:51:40 +00:00
const title = await page.title()
console.log("Page loaded. Title - " + title)
2020-04-08 10:19:26 +00:00
// Inject a stylesheet with typography and page-break rules before printing.
await page.addStyleTag({ content: `
body {
font-family: Georgia, serif;
font-size: 18pt;
background: none;
color: black;
text-align: left;
}
h1, h2, h3, h4, h5 {
page-break-after: avoid;
}
table, figure, ul, img {
page-break-inside: avoid;
}
a {
color: black;
}
a:after {
content: " [" attr(href) "] ";
font-size: 0.7em;
}
a[href^="#"]:after, a[href^="/"]:after {
content: "";
}
blockquote {
margin: 10px 2px;
line-height: 2em;
border: 0;
}
code {
background: none !important;
font-family: monospace;
}
ul li {
list-style: disc !important;
}
h1 {
font-size: 1.7em;
}
p {
margin-bottom: 12px;
}
header {
margin-bottom: 14px;
border-bottom: 8px solid black;
text-align: center;
}
`});
// The callback below is evaluated inside the page. Its promise settles when
// the QR image appended at the end has loaded or failed to load.
await page.evaluate(async () => {
return await new Promise(resolve => {
// Inlined, minified helpers: `isProbablyReaderable` and the `Readability`
// constructor/prototype.
// NOTE(review): this looks like a copy of Mozilla's Readability.js, and the
// copy visible here stops mid-statement on its last line - confirm against
// the real file before touching it.
var REGEXPS={unlikelyCandidates:/-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,okMaybeItsACandidate:/and|article|body|column|content|main|shadow/i};function isNodeVisible(node){return(!node.style||node.style.display!="none")&&!node.hasAttribute("hidden")&&(!node.hasAttribute("aria-hidden")||node.getAttribute("aria-hidden")!="true"||(node.className&&node.className.indexOf&&node.className.indexOf("fallback-image")!==-1))}function isProbablyReaderable(doc,isVisible){if(!isVisible){isVisible=isNodeVisible}var nodes=doc.querySelectorAll("p, pre");var brNodes=doc.querySelectorAll("div > br");if(brNodes.length){var set=new Set(nodes);[].forEach.call(brNodes,function(node){set.add(node.parentNode)});nodes=Array.from(set)}var score=0;return[].some.call(nodes,function(node){if(!isVisible(node)){return false}var matchString=node.className+" "+node.id;if(REGEXPS.unlikelyCandidates.test(matchString)&&!REGEXPS.okMaybeItsACandidate.test(matchString)){return false}if(node.matches("li p")){return false}var textContentLength=node.textContent.trim().length;if(textContentLength<140){return false}score+=Math.sqrt(textContentLength-140);if(score>20){return true}return false})}if(typeof exports==="object"){exports.isProbablyReaderable=isProbablyReaderable}
function Readability(doc,options){if(options&&options.documentElement){doc=options;options=arguments[2]}else if(!doc||!doc.documentElement){throw new Error("First argument to Readability constructor should be a document object.")}options=options||{};this._doc=doc;this._docJSDOMParser=this._doc.firstChild.__JSDOMParser__;this._articleTitle=null;this._articleByline=null;this._articleDir=null;this._articleSiteName=null;this._attempts=[];this._debug=!!options.debug;this._maxElemsToParse=options.maxElemsToParse||this.DEFAULT_MAX_ELEMS_TO_PARSE;this._nbTopCandidates=options.nbTopCandidates||this.DEFAULT_N_TOP_CANDIDATES;this._charThreshold=options.charThreshold||this.DEFAULT_CHAR_THRESHOLD;this._classesToPreserve=this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve||[]);this._keepClasses=!!options.keepClasses;this._flags=this.FLAG_STRIP_UNLIKELYS|this.FLAG_WEIGHT_CLASSES|this.FLAG_CLEAN_CONDITIONALLY;var logEl;if(this._debug){logEl=function(e){var rv=e.nodeName+" ";if(e.nodeType==e.TEXT_NODE){return rv+'("'+e.textContent+'")'}var classDesc=e.className&&("."+e.className.replace(/ /g,"."));var elDesc="";if(e.id){elDesc="(#"+e.id+classDesc+")"}else if(classDesc){elDesc="("+classDesc+")"}return rv+elDesc};this.log=function(){if(typeof dump!=="undefined"){var msg=Array.prototype.map.call(arguments,function(x){return(x&&x.nodeName)?logEl(x):x}).join(" ");dump("Reader: (Readability) "+msg+"\n")}else if(typeof console!=="undefined"){var args=["Reader: (Readability) 
"].concat(arguments);console.log.apply(console,args)}}}else{this.log=function(){}}}Readability.prototype={FLAG_STRIP_UNLIKELYS:0x1,FLAG_WEIGHT_CLASSES:0x2,FLAG_CLEAN_CONDITIONALLY:0x4,ELEMENT_NODE:1,TEXT_NODE:3,DEFAULT_MAX_ELEMS_TO_PARSE:0,DEFAULT_N_TOP_CANDIDATES:5,DEFAULT_TAGS_TO_SCORE:"section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","),DEFAULT_CHAR_THRESHOLD:500,REGEXPS:{unlikelyCandidates:/-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,okMaybeItsACandidate:/and|article|body|column|content|main|shadow/i,positive:/article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,negative:/hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,extraneous:/print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,byline:/byline|author|dateline|writtenby|p-author/i,replaceFonts:/<(\/?)font[^>]*>/gi,normalize:/\s{2,}/g,videos:/\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i,shareElements:/(\b|_)(share|sharedaddy)(\b|_)/i,nextLink:/(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,prevLink:/(prev|earl|old|new|<|«)/i,whitespace:/^\s*$/,hasContent:/\S$/},DIV_TO_P_ELEMS:["A","BLOCKQUOTE","DL","DIV","IMG","OL","P","PRE","TABLE","UL","SELECT"],ALTER_TO_DIV_EXCEPTIONS:["DIV","ARTICLE","SECTION","P"],PRESENTATIONAL_ATTRIBUTES:["align","background","bgcolor","border","cellpadding","cellspacing","frame","hspace","rules","style","valign","vspace"],DEPRECATED_SIZE_ATTRIBUTE_ELEMS:["TABLE","TH","TD","HR","PRE"],PHRASING_ELEMS:["ABBR","AUDIO","B","BDO","BR","BUTTON","CITE","CODE","DATA","DATALIST","
DFN","EM","EMBED","I","IMG","INPUT","KBD","LABEL","MARK","MATH","METER","NOSCRIPT","OBJECT","OUTPUT","PROGRESS","Q","RUBY","SAMP","SCRIPT","SELECT","SMALL","SPAN","STRONG","SUB","SUP","TEXTAREA","TIME","VAR","WBR"],CLASSES_TO_PRESERVE:["page"],_postProcessContent:function(articleContent){this._fixRelativeUris(articleContent);if(!this._keepClasses){this._cleanClasses(articleContent)}},_removeNodes:function(nodeList,filterFn){if(this._docJSDOMParser&&nodeList._isLiveNodeList){throw new Error("Do not pass live node lists to _removeNodes")}for(var i=nodeList.length-1;i>=0;i-=1){var node=nodeList[i
// When the heuristic says the page is an article, replace the whole body with
// the parsed article, preceded by a header holding the title and, when
// present, the byline and the first `time[datetime]` value found on the page.
if (isProbablyReaderable(document.cloneNode(true))) {
var documentClone = document.cloneNode(true);
var article = new Readability(documentClone).parse();
var postedDate = document.querySelector('time[datetime]');
var content = `
<header>
<h1>${article.title}</h1>
${article.byline ? `<blockquote>${article.byline}</blockquote>` : ""}
${postedDate && postedDate.getAttribute('datetime') ? `<blockquote>${postedDate.getAttribute('datetime')}</blockquote>` : ""}
</header>
` + article.content;
document.body.innerHTML = content;
2020-03-22 13:51:40 +00:00
}
2020-04-08 10:19:26 +00:00
2020-04-17 13:52:03 +00:00
// Expand every <details> element.
[...document.querySelectorAll('details')].forEach(details => details.setAttribute('open', ''));
// Reset fixed/sticky positioning on every element (presumably so pinned
// banners do not overlay the printed pages - confirm).
[...document.querySelectorAll('*')].forEach(node => {
const pos = window.getComputedStyle(node).getPropertyValue("position");
if (pos == "fixed" || pos == "sticky") {
node.style.position = "unset";
}
});
2020-04-08 10:19:26 +00:00
// Place an image from qr.cluster.fun in the top-right corner; the current page
// address is passed in its `website` query parameter. Resolve once it has
// loaded, or remove it and resolve anyway when loading fails.
var im = document.createElement("img");
im.src = `https://qr.cluster.fun/?website=${window.location.toString()}`;
im.style = "position:absolute;top:0;right:0;z-index:99999999";
im.onload = resolve;
im.onerror = () => {
document.body.removeChild(im);
resolve();
}
document.body.appendChild(im);
2020-03-22 13:51:40 +00:00
})
2020-04-08 10:19:26 +00:00
});
2020-03-22 13:51:40 +00:00
2020-04-08 10:19:26 +00:00
// Print the (possibly rewritten) page as A4 with a margin of 40 on every side.
const myPDF = await page.pdf({ format: 'A4', margin: {top: 40, bottom: 40, left: 40, right: 40} });
2020-03-22 13:51:40 +00:00
console.log("Saved to PDF")
2020-04-08 10:19:26 +00:00
await sendToRemarkable(title, myPDF);
2020-03-22 13:51:40 +00:00
return true;
} catch (ex) {
console.log(ex);
// Retry by recursion while fewer than five retries have been made.
// NOTE(review): the `finally` below runs only after the recursive call has
// returned, so this attempt's browser stays open while the retries run.
if (tries < 5) {
return await sendPage(website, ++tries);
} else {
return false;
}
} finally {
await browser.close();
}
}
2020-04-08 10:19:26 +00:00
/**
 * Uploads a PDF to the reMarkable cloud as a new document named `title`.
 *
 * Steps: exchange `process.env.REMARKABLE_TOKEN` for a fresh bearer token,
 * look up the document-storage host, request an upload URL for a new
 * document ID, upload a zip holding `<ID>.content`, `<ID>.pagedata` and
 * `<ID>.pdf`, then publish the document's metadata.
 *
 * @param {string} title - Name stored in the document's metadata.
 * @param {Buffer} myPDF - The PDF file contents.
 * @returns {Promise<void>}
 * @throws Rethrows whatever a request or the zip step throws, after logging a
 *   summary that does not include request headers.
 */
async function sendToRemarkable(title, myPDF) {
  try {
    // Refresh token
    let response = await axios.post(
      "https://my.remarkable.com/token/json/2/user/new",
      {},
      {
        headers: {
          'Authorization': `Bearer ${process.env.REMARKABLE_TOKEN}`,
        },
      }
    );
    const token = response.data;
    const authHeaders = { 'Authorization': `Bearer ${token}` };
    // The token is a bearer credential, so it is deliberately kept out of the log.
    console.log("Refreshed token");

    // Get storage endpoint
    response = await axios.get(
      "https://service-manager-production-dot-remarkable-production.appspot.com/service/json/1/document-storage?environment=production&group=auth0%7C5a68dc51cb30df3877a1d7c4&apiVer=2",
      { headers: authHeaders }
    );
    const storageHost = response.data && response.data.Host;
    if (!storageHost) {
      throw new Error("reMarkable service discovery did not return a storage host");
    }
    console.log(`Got storage host: ${storageHost}`);

    // Generate upload request
    const ID = uuid4();
    response = await axios.put(
      `https://${storageHost}/document-storage/json/2/upload/request`,
      [{
        "ID": ID,
        "Type": "DocumentType",
        "Version": 1
      }],
      { headers: authHeaders }
    );
    const uploadURL = response.data && response.data[0] && response.data[0].BlobURLPut;
    if (!uploadURL) {
      throw new Error("reMarkable upload request did not return an upload URL");
    }
    console.log(`Got upload URL: ${uploadURL}`);

    // Build zip to upload
    const zip = new JSZip();
    zip.file(`${ID}.content`, JSON.stringify({
      extraMetadata: {},
      fileType: 'pdf',
      lastOpenedPage: 0,
      lineHeight: -1,
      margins: 180,
      pageCount: 0,
      textScale: 1,
      transform: {},
    }));
    zip.file(`${ID}.pagedata`, []);
    zip.file(`${ID}.pdf`, myPDF);
    const zipContent = await zip.generateAsync({ type: 'nodebuffer' });

    // Upload zip
    // NOTE(review): the empty Content-Type is kept from the original request;
    // presumably the upload URL expects it - confirm before changing.
    response = await axios.put(
      uploadURL,
      zipContent,
      {
        headers: {
          'Content-Type': '',
          ...authHeaders,
        }
      }
    );
    console.log("Uploaded");

    // Populate metadata
    const now = new Date().toISOString();
    response = await axios.put(
      `https://${storageHost}/document-storage/json/2/upload/update-status`,
      [{
        ID: ID,
        deleted: false,
        lastModified: now,
        ModifiedClient: now,
        metadatamodified: false,
        modified: false,
        parent: '',
        pinned: false,
        synced: true,
        type: "DocumentType",
        version: 1,
        // "VissibleName" (sic) is the key this code has always sent; keep it byte-for-byte.
        VissibleName: title,
      }],
      { headers: authHeaders }
    );
    console.log("Upload complete")
  } catch (error) {
    // Log status/body for HTTP failures and the message otherwise. The full
    // response/error object is avoided because it embeds the request config,
    // Authorization header included.
    if (error && error.response) {
      console.error(`reMarkable request failed with status ${error.response.status}`, error.response.data);
    } else {
      console.error((error && error.message) || error);
    }
    throw error;
  }
}