get thumbnails from html images (#1100)

This commit is contained in:
Raicuparta 2025-07-25 04:57:35 +02:00 committed by GitHub
parent 53f7617eae
commit a8392abd0f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 1927 additions and 1401 deletions

View File

@ -28,7 +28,7 @@ jobs:
- uses: pnpm/action-setup@v2
with:
version: 8.6.10
version: 9.4.0
run_install: true
- name: Build

View File

@ -44,7 +44,7 @@ jobs:
- uses: pnpm/action-setup@v4
with:
version: 8.6.10
version: 9.4.0
run_install: true
- name: Build

View File

@ -13,7 +13,7 @@
"lint": "eslint src"
},
"dependencies": {
"@actions/core": "^1.10.1",
"@actions/core": "^1.11.1",
"@google-analytics/data": "^3.3.0",
"@octokit/action": "^4.0.10",
"@octokit/core": "^4.2.4",
@ -21,15 +21,16 @@
"@octokit/plugin-throttling": "^4.3.2",
"@octokit/request-error": "^3.0.3",
"@octokit/types": "^9.3.2",
"@types/commonmark": "^0.27.9",
"@types/node": "^18.19.34",
"@types/commonmark": "^0.27.10",
"@types/node": "^18.19.120",
"@types/sharp": "^0.31.1",
"@typescript-eslint/eslint-plugin": "^5.62.0",
"@typescript-eslint/parser": "^5.62.0",
"commonmark": "^0.30.0",
"eslint": "^8.57.0",
"eslint": "^8.57.1",
"htmlparser2": "^10.0.0",
"node-fetch": "^3.3.2",
"sharp": "^0.33.4",
"sharp": "^0.33.5",
"typescript": "^4.9.5"
},
"packageManager": "pnpm@9.4.0+sha512.f549b8a52c9d2b8536762f99c0722205efc5af913e77835dbccc3b0b0b2ca9e7dc8022b78062c17291c48e88749c70ce88eb5a74f1fa8c4bf5e18bb46c8bd83a"

3258
scripts/pnpm-lock.yaml generated

File diff suppressed because it is too large Load Diff

View File

@ -5,6 +5,7 @@ import path from "path";
import fetch from "node-fetch";
import { getReadmeMarkdown } from "./readmes.js";
import { GITHUB_RAW_CONTENT_URL } from "../constants.js";
import { Parser as HtmlParser } from "htmlparser2";
export const thumbnailSize = {
width: 450,
@ -111,6 +112,32 @@ function tryGetUrl(url: string): URL | null {
}
}
/**
 * Finds the first `<img>` tag in an HTML fragment that carries a non-empty
 * `src` attribute and returns that attribute's value.
 *
 * @param html - Raw HTML text (e.g. the literal of a markdown HTML node).
 * @returns The `src` value of the first `<img>` that has one, or `null` when
 *   the fragment contains no such tag or parsing throws.
 */
function extractImageUrlFromHtml(html: string): string | null {
  try {
    let imageUrl: string | null = null;

    const parser = new HtmlParser(
      {
        onopentag(name, attribs) {
          // Ignore everything once a URL has been captured, and any non-image tag.
          if (imageUrl !== null || name !== "img") return;

          // An <img> without a usable src must not stop the search: keep
          // parsing so a later <img src="..."> in the same fragment is found.
          const src = attribs.src;
          if (!src) return;

          imageUrl = src;
          // Nothing after the first match matters, so stop tokenizing early.
          parser.pause();
        },
      },
      { decodeEntities: true }
    );

    parser.write(html);
    parser.end();

    return imageUrl;
  } catch (error) {
    // Best effort: a malformed fragment should not abort thumbnail discovery.
    console.error("Failed to parse HTML with htmlparser2:", error);
    return null;
  }
}
export async function getFirstImageUrl(
readmeUrl: string | undefined
): Promise<string | null> {
@ -126,22 +153,34 @@ export async function getFirstImageUrl(
let event;
while ((event = walker.next())) {
const node = event.node;
if (node.type !== "image" || !node.destination) continue;
const imageUrl = tryGetUrl(node.destination);
let imageUrl: string | null = null;
if (node.type === "image" && node.destination) {
imageUrl = node.destination;
} else if (
(node.type === "html_inline" || node.type === "html_block") &&
node.literal
) {
imageUrl = extractImageUrlFromHtml(node.literal);
}
if (!imageUrl) continue;
const parsedImageUrl = tryGetUrl(imageUrl);
if (
!imageUrl?.pathname.endsWith(".svg") &&
imageUrl?.host !== "img.shields.io"
!parsedImageUrl?.pathname.endsWith(".svg") &&
parsedImageUrl?.host !== "img.shields.io"
) {
const fullUrl = imageUrl
const fullUrl = parsedImageUrl
? // GitHub allows embedding images that actually point to webpages on github.com, so we have to replace the URLs here
node.destination.replace(
imageUrl.replace(
/^https?:\/\/github.com\/(.+)\/(.+)\/blob\/(.+)\//gm,
`${GITHUB_RAW_CONTENT_URL}/$1/$2/$3/`
)
: // For relative URLs we also have to resolve them
`${baseUrl}/${node.destination}`;
`${baseUrl}/${imageUrl}`;
return fullUrl;
}
@ -176,7 +215,7 @@ async function downloadImage(
const fullImagePath = getPath(relativeImagePath);
const image = await response.arrayBuffer();
await fsp.writeFile(fullImagePath, Buffer.from(image));
await fsp.writeFile(fullImagePath, new Uint8Array(image));
console.log(`Downloaded image from ${imageUrl} to ${fullImagePath}`);
return fullImagePath;