prolerev-not-a-scrape-that-.../main.ts
2025-05-05 22:57:55 +02:00

258 lines
8.3 KiB
TypeScript

import { DOMParser } from "jsr:@b-fuze/deno-dom";
// Inclusive range of article ids that the main loop requests from the site.
const ARTICLES_ID_MIN = 1;
const ARTICLES_ID_MAX = 94;
// Timestamp taken once at startup; every rendered page substitutes it for
// the `$downloadDate` placeholder (formatted with `toUTCString()`).
const downloadDate = new Date();
/** Metadata of one scraped article: everything except its HTML body. */
type ArticleInfo = {
/** Value of the `id` query parameter the article was fetched with. */
id: number;
/** Trimmed text of the page's `h1.screen_title` element. */
title: string;
/**
 * Raw `datetime` attribute value read from the article's metadata list.
 * The renderers parse it with `new Date(...)`.
 */
dateAdded: string;
/** Author text read from the article's metadata list. */
author: string;
/** Text of each link inside `span[itemprop="keywords"]`. */
tags: string[];
/** URL the article was downloaded from. */
originalLink: string;
};
/** A scraped article: its metadata plus the article body. */
type Article = ArticleInfo & {
/** `innerHTML` of the page's `div[itemprop="articleBody"]` element. */
body: string;
};
/**
 * Downloads one article page and extracts its metadata and body.
 *
 * The page's `<article>` element is searched for the title, publication
 * date, author, keyword links and body. Any of those pieces that is absent
 * yields an empty string (or an empty tag list) instead of `undefined`, so
 * the returned object always matches the `Article` type. The HTTP status is
 * intentionally not checked: the caller recognises invalid ids by the
 * parsed title of the returned page.
 *
 * @param id Value of the `id` query parameter of the article to fetch.
 * @returns The parsed article, with `originalLink` set to the fetched URL.
 * @throws Error if the downloaded page contains no `<article>` element.
 */
async function downloadArticle(id: number): Promise<Article> {
  const url =
    `https://proletarianrevolution.net/index.php?page=news&type=view&id=${id}`;
  console.log(`Fetching '${url}'...`);

  const response = await fetch(url);
  const contentText = await response.text();
  // Debugging aid: cache one page locally instead of hitting the network.
  // await Deno.writeTextFile("test_article.html", contentText);
  // const contentText = await Deno.readTextFile("test_article.html");

  const doc = new DOMParser().parseFromString(contentText, "text/html");
  const article = doc.querySelector("article");
  if (!article) {
    // Fail with a message that names the page rather than an opaque
    // TypeError from dereferencing a missing element further down.
    throw new Error(`No <article> element found in '${url}'`);
  }

  const title = article
    .querySelector("h1.screen_title")
    ?.innerText
    .trim() ?? "";

  // The metadata list is addressed by position: the second item holds the
  // element carrying the `datetime` attribute, the third holds the author.
  const metaList = article.querySelector("article ul.meta_details_list");
  const dateAdded = metaList
    ?.children[1]
    ?.children[0]
    ?.attributes
    .getNamedItem("datetime")
    ?.value ?? "";
  const author = metaList
    ?.children[2]
    ?.children[1]
    ?.innerText ?? "";

  const tags = [...article.querySelectorAll(`span[itemprop="keywords"] a`)]
    .map((tagLink) => tagLink.innerText);

  const body = article
    .querySelector(`div[itemprop="articleBody"]`)
    ?.innerHTML ?? "";

  return { id, title, dateAdded, author, tags, originalLink: url, body };
}
// HTML template for a single article page, loaded once at startup.
const articleTemplate = await Deno.readTextFile("templates/article.html");

/**
 * Renders one article page by filling the placeholders of the article
 * template: `$title`, `$dateAddedRaw`, `$dateAdded`, `$downloadDate`,
 * `$author`, `$tags`, `$body`, `$originalLink`, `$navPrevious`, `$navNext`.
 *
 * All placeholders are substituted in a single pass with a replacer
 * function, so that
 *  - `$`-sequences in scraped content (`$$`, `$&`, ...) are inserted
 *    literally instead of being interpreted as replacement patterns, and
 *  - placeholder-like text inside inserted content is never substituted.
 *
 * Plain-text fields are HTML-escaped; `body` is inserted as-is because it
 * already is HTML.
 *
 * @param article The article to render.
 * @param prevId Id to link as "Previous"; no link is emitted if omitted.
 * @param nextId Id to link as "Next"; no link is emitted if omitted.
 * @returns The complete HTML of the article page.
 */
function renderArticle(
  article: Article,
  prevId?: number,
  nextId?: number,
): string {
  // Scraped fields may be missing at runtime despite their declared type,
  // hence the `?? ""` guard.
  const escapeHtml = (text: string): string =>
    (text ?? "").replace(/[&<>"']/g, (ch) => `&#${ch.charCodeAt(0)};`);

  const tagLinks = article.tags
    .map((tag) => {
      const safeTag = escapeHtml(tag);
      return `<a href="categories.html#${safeTag}"><span class="tag">${safeTag}</span></a>`;
    })
    .join(", ");

  const substitutions: Record<string, string> = {
    title: escapeHtml(article.title),
    dateAddedRaw: escapeHtml(article.dateAdded),
    dateAdded: new Date(article.dateAdded).toUTCString(),
    downloadDate: downloadDate.toUTCString(),
    author: escapeHtml(article.author),
    tags: tagLinks,
    body: article.body ?? "",
    originalLink: escapeHtml(article.originalLink),
    // Compare against undefined so that an id of 0 still produces a link.
    navPrevious: prevId !== undefined
      ? `<a href="article-${prevId}.html">Previous</a>`
      : "",
    navNext: nextId !== undefined
      ? `<a href="article-${nextId}.html">Next</a>`
      : "",
  };

  // `dateAddedRaw` is listed before `dateAdded` so the longer name wins.
  return articleTemplate.replace(
    /\$(title|dateAddedRaw|dateAdded|downloadDate|author|tags|body|originalLink|navPrevious|navNext)/g,
    (match, name: string) => substitutions[name] ?? match,
  );
}
// HTML template for the title-sorted article index, loaded once at startup.
const indexTemplate = await Deno.readTextFile("templates/index.html");

/**
 * Renders the article index: one table row per article, sorted by title.
 *
 * Fills the `$downloadDate` and `$articleEntries` placeholders of the index
 * template in a single pass with a replacer function, so `$`-sequences in
 * scraped text are inserted literally. Titles, authors and tags are
 * HTML-escaped because they are plain text taken from the scraped pages.
 *
 * @param articles Metadata of every article to list; not mutated.
 * @returns The complete HTML of the index page.
 */
function renderIndex(articles: ArticleInfo[]): string {
  // Scraped fields may be missing at runtime despite their declared type,
  // hence the `?? ""` guard.
  const escapeHtml = (text: string): string =>
    (text ?? "").replace(/[&<>"']/g, (ch) => `&#${ch.charCodeAt(0)};`);

  const renderTags = (tags: string[]): string =>
    tags.map((tag) => {
      const safeTag = escapeHtml(tag);
      return `
            <a href="categories.html#${safeTag}">
              <span class="tag">${safeTag}</span>
            </a>
          `;
    }).join(", ");

  const rows = articles
    .toSorted((a, b) => a.title.localeCompare(b.title))
    .map((article) => `
        <tr>
          <td>
            <a href="article-${article.id}.html">
              ${escapeHtml(article.title)}
            </a>
          </td>
          <td>${escapeHtml(article.author)}</td>
          <td class="article-date">${
      new Date(article.dateAdded).toUTCString()
    }</td>
          <td>${renderTags(article.tags)}</td>
        </tr>
      `)
    .join("");

  const downloadDateText = downloadDate.toUTCString();
  return indexTemplate.replace(
    /\$(downloadDate|articleEntries)/g,
    (_match, name: string) =>
      name === "downloadDate" ? downloadDateText : rows,
  );
}
// HTML template for the date-sorted article index, loaded once at startup.
const datesTemplate = await Deno.readTextFile("templates/dates.html");

/**
 * Renders the chronological index: one table row per article, oldest first
 * (ascending by the parsed `dateAdded`).
 *
 * Fills the `$downloadDate` and `$articleEntriesByDate` placeholders of the
 * dates template in a single pass with a replacer function, so
 * `$`-sequences in scraped text are inserted literally. Titles, authors and
 * tags are HTML-escaped because they are plain text taken from the scraped
 * pages.
 *
 * @param articles Metadata of every article to list; not mutated.
 * @returns The complete HTML of the date index page.
 */
function renderDateIndex(articles: ArticleInfo[]): string {
  // Scraped fields may be missing at runtime despite their declared type,
  // hence the `?? ""` guard.
  const escapeHtml = (text: string): string =>
    (text ?? "").replace(/[&<>"']/g, (ch) => `&#${ch.charCodeAt(0)};`);

  const renderTags = (tags: string[]): string =>
    tags.map((tag) => {
      const safeTag = escapeHtml(tag);
      return `
            <a href="categories.html#${safeTag}">
              <span class="tag">${safeTag}</span>
            </a>
          `;
    }).join(", ");

  const rows = articles
    .toSorted((a, b) =>
      new Date(a.dateAdded).getTime() -
      new Date(b.dateAdded).getTime()
    )
    .map((article) => `
        <tr>
          <td>
            <a href="article-${article.id}.html">
              ${escapeHtml(article.title)}
            </a>
          </td>
          <td>${escapeHtml(article.author)}</td>
          <td class="article-date">${
      new Date(article.dateAdded).toUTCString()
    }</td>
          <td>${renderTags(article.tags)}</td>
        </tr>
      `)
    .join("");

  const downloadDateText = downloadDate.toUTCString();
  return datesTemplate.replace(
    /\$(downloadDate|articleEntriesByDate)/g,
    (_match, name: string) =>
      name === "downloadDate" ? downloadDateText : rows,
  );
}
// HTML template for the per-tag category listing, loaded once at startup.
const categoriesTemplate = await Deno.readTextFile("templates/categories.html");

/**
 * Renders the categories page: one section per tag (sorted by tag name),
 * each listing the articles carrying that tag (sorted by title). The
 * section's `id` is the tag itself, which is what the `categories.html#tag`
 * links on the other pages point at.
 *
 * Fills the `$downloadDate` and `$categoryEntries` placeholders of the
 * categories template in a single pass with a replacer function, so
 * `$`-sequences in scraped text are inserted literally. Tags and titles are
 * HTML-escaped because they are plain text taken from the scraped pages.
 *
 * @param articles Metadata of every article to categorise; not mutated.
 * @returns The complete HTML of the categories page.
 */
function renderCategories(articles: ArticleInfo[]): string {
  // Scraped fields may be missing at runtime despite their declared type,
  // hence the `?? ""` guard.
  const escapeHtml = (text: string): string =>
    (text ?? "").replace(/[&<>"']/g, (ch) => `&#${ch.charCodeAt(0)};`);

  // Group articles by tag; an article appears under each of its tags.
  const categories = new Map<string, ArticleInfo[]>();
  for (const article of articles) {
    for (const tag of article.tags) {
      const tagged = categories.get(tag);
      if (tagged) {
        tagged.push(article);
      } else {
        categories.set(tag, [article]);
      }
    }
  }

  const sections = [...categories]
    .sort(([tagA], [tagB]) => tagA.localeCompare(tagB))
    .map(([tag, tagged]) => {
      const safeTag = escapeHtml(tag);
      const items = tagged
        .toSorted((a, b) => a.title.localeCompare(b.title))
        .map((article) => `
              <li>
                <a href="article-${article.id}.html">
                  ${escapeHtml(article.title)}
                </a>
              </li>
            `)
        .join("");
      return `
        <div id="${safeTag}">
          <h2>${safeTag}</h2>
          <ul>${items}</ul>
        </div>
      `;
    })
    .join("");

  const downloadDateText = downloadDate.toUTCString();
  return categoriesTemplate.replace(
    /\$(downloadDate|categoryEntries)/g,
    (_match, name: string) =>
      name === "downloadDate" ? downloadDateText : sections,
  );
}
// --- Main script -----------------------------------------------------------
// Downloads every article in the configured id range, writes one HTML page
// per valid article into build/articles/, then writes the title index, the
// date index and the categories page built from the collected metadata.
await Deno.mkdir("build/articles", { recursive: true });
const articlesIdMin = ARTICLES_ID_MIN;
const articlesIdMax = ARTICLES_ID_MAX;
// Metadata of every article that was successfully written (bodies dropped).
const articles: ArticleInfo[] = [];
for (let id = articlesIdMin; id <= articlesIdMax; ++id) {
  // Neighbouring ids for the Previous/Next links; the ends of the range get
  // no link. NOTE: a neighbour that turns out to be invalid (skipped below)
  // is still linked, because ids are processed strictly in order.
  const prevId = id > articlesIdMin ? id - 1 : undefined;
  const nextId = id < articlesIdMax ? id + 1 : undefined;

  console.log(`Downloading article ${id}...`);
  const article = await downloadArticle(id);
  // Ids without a readable article come back as a page titled "Log in".
  if (article.title === "Log in") {
    console.log(`Article ${id} invalid. Skipping!`);
    continue;
  }

  const html = renderArticle(article, prevId, nextId);
  const filepath = `build/articles/article-${id}.html`;
  await Deno.writeTextFile(filepath, html);
  console.log(`Article ${id} written to '${filepath}'!`);

  // Keep only the metadata for the index pages.
  const { title, dateAdded, author, tags, originalLink } = article;
  articles.push({ id, title, dateAdded, author, tags, originalLink });
}

console.log("Building index...");
const indexHtml = renderIndex(articles);
await Deno.writeTextFile("build/articles/index.html", indexHtml);
console.log("Index written to 'build/articles/index.html'!");

console.log("Building date index...");
const dateIndexHtml = renderDateIndex(articles);
await Deno.writeTextFile("build/articles/dates.html", dateIndexHtml);
console.log("Date index written to 'build/articles/dates.html'!");

console.log("Building categories...");
const categoriesHtml = renderCategories(articles);
await Deno.writeTextFile("build/articles/categories.html", categoriesHtml);
console.log("Categories written to 'build/articles/categories.html'!");

console.log("Done!");