"use node";

import { action } from "./_generated/server";
import { v } from "convex/values";

/** Abort the whole request (headers AND body download) after this many ms. */
const FETCH_TIMEOUT_MS = 8000;

/** Stop reading the response body once at least this many bytes have arrived. */
const MAX_BODY_BYTES = 512 * 1024;

/**
 * Named character references we decode. `nbsp` maps to a plain space so
 * previews don't carry non-breaking spaces into the UI.
 */
const NAMED_ENTITIES = new Map<string, string>([
  ["amp", "&"],
  ["lt", "<"],
  ["gt", ">"],
  ["quot", '"'],
  ["apos", "'"],
  ["nbsp", " "],
]);

/** Matches a whole `<meta …>` tag; quoted attribute values may contain `>`. */
const META_TAG_RE = /<meta\b((?:[^>"']|"[^"]*"|'[^']*')*)>/gi;

/** Matches one `name=value` attribute (double-quoted, single-quoted or bare). */
const ATTRIBUTE_RE = /([^\s"'<>\/=]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+))/g;

/** Metadata scraped from a page, without the URL that was requested. */
interface PreviewFields {
  title?: string | undefined;
  description?: string | undefined;
  image?: string | undefined;
  siteName?: string | undefined;
}

/**
 * Fetches `args.url` and scrapes Open Graph / Twitter-card / plain HTML
 * metadata from it for a link preview.
 *
 * Returns `null` (never throws) when the URL is invalid, is not http(s),
 * points at a loopback/private host, the request fails or times out, the
 * response is not HTML, or no title/description/image could be found.
 */
export const fetchPreview = action({
  args: { url: v.string() },
  returns: v.union(
    v.object({
      url: v.string(),
      title: v.optional(v.string()),
      description: v.optional(v.string()),
      image: v.optional(v.string()),
      siteName: v.optional(v.string()),
    }),
    v.null(),
  ),
  handler: async (_ctx, args) => {
    try {
      // Validate URL + refuse obvious SSRF targets (loopback / private ranges).
      const u = new URL(args.url);
      if (u.protocol !== "http:" && u.protocol !== "https:") return null;
      if (isBlockedHost(u.hostname)) return null;

      const controller = new AbortController();
      const timeout = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS);
      try {
        const res = await fetch(u.toString(), {
          method: "GET",
          headers: {
            // Discordbot User-Agent — a lot of sites (YouTube included)
            // only emit og: metadata when they recognise a known crawler,
            // and the generic Brycord UA gets routed to consent / interstitial
            // pages that never include the tags we're after.
            "User-Agent": "Mozilla/5.0 (compatible; Discordbot/2.0; +https://discordapp.com)",
            Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9",
          },
          signal: controller.signal,
          redirect: "follow",
        });
        if (!res.ok) return null;

        // A public URL may redirect to an internal one; never parse or return
        // content that was served by a blocked host.
        // NOTE: the redirected request itself has already been sent at this
        // point, and DNS names resolving to private IPs are not detected.
        const finalUrl = res.url ? new URL(res.url) : u;
        if (isBlockedHost(finalUrl.hostname)) return null;

        const contentType = res.headers.get("content-type") ?? "";
        const mime = contentType.toLowerCase();
        if (!mime.includes("text/html") && !mime.includes("application/xhtml+xml")) {
          return null;
        }

        const bytes = await readBodyPrefix(res, MAX_BODY_BYTES);
        if (!bytes) return null;

        // Relative image paths are relative to the document actually served,
        // i.e. the post-redirect URL, not necessarily the one requested.
        const fields = extractPreview(decodeHtml(bytes, contentType), finalUrl);
        return fields ? { url: u.toString(), ...fields } : null;
      } finally {
        // The timer stays armed through the body download so a slow-drip
        // response cannot hang the action; abort() also releases any body
        // left unread by an early return.
        clearTimeout(timeout);
        controller.abort();
      }
    } catch {
      // Best effort: any network / parse failure simply means "no preview".
      return null;
    }
  },
});

/**
 * Returns true when `hostname` (as produced by `URL.hostname`) is a literal
 * loopback, unspecified, private, CGNAT or link-local address, or a
 * `localhost` name. `URL` already canonicalises exotic IPv4 spellings
 * (decimal, hex, short forms) to dotted-quad, so only that form is checked.
 */
function isBlockedHost(hostname: string): boolean {
  const host = hostname.toLowerCase().replace(/^\[|\]$/g, "").replace(/\.$/, "");
  if (host === "" || host === "localhost" || host.endsWith(".localhost")) return true;

  const v4 = /^(\d{1,3})\.(\d{1,3})\.\d{1,3}\.\d{1,3}$/.exec(host);
  if (v4) {
    const a = Number(v4[1]);
    const b = Number(v4[2]);
    return (
      a === 0 || // "this network"
      a === 10 || // private
      a === 127 || // loopback
      (a === 100 && b >= 64 && b <= 127) || // CGNAT
      (a === 169 && b === 254) || // link-local (cloud metadata endpoints)
      (a === 172 && b >= 16 && b <= 31) || // private
      (a === 192 && b === 168) // private
    );
  }

  if (host.includes(":")) {
    return (
      host === "::" || // unspecified
      host === "::1" || // loopback
      host.startsWith("::ffff:") || // IPv4-mapped: never needed for public sites
      /^f[cd][0-9a-f]{2}:/.test(host) || // unique-local fc00::/7
      /^fe[89ab][0-9a-f]:/.test(host) // link-local fe80::/10
    );
  }
  return false;
}

/**
 * Reads the response body until it ends or at least `maxBytes` have been
 * received, then cancels the rest so giant pages don't DOS the action.
 * Returns `null` when the response has no body stream.
 */
async function readBodyPrefix(res: Response, maxBytes: number): Promise<Uint8Array | null> {
  const reader = res.body?.getReader();
  if (!reader) return null;

  const chunks: Uint8Array[] = [];
  let total = 0;
  while (total < maxBytes) {
    const { value, done } = await reader.read();
    if (done) break;
    if (value) {
      chunks.push(value);
      total += value.length;
    }
  }
  try {
    await reader.cancel();
  } catch {
    // Cancelling an already-finished or errored stream is not interesting.
  }

  const merged = new Uint8Array(total);
  let offset = 0;
  for (const chunk of chunks) {
    merged.set(chunk, offset);
    offset += chunk.length;
  }
  return merged;
}

/**
 * Decodes the page bytes using the charset declared in the Content-Type
 * header, falling back to UTF-8 when none is declared or the label is not
 * supported by `TextDecoder`.
 */
function decodeHtml(bytes: Uint8Array, contentType: string): string {
  const label = /charset\s*=\s*["']?([^\s"';]+)/i.exec(contentType)?.[1];
  if (label) {
    try {
      return new TextDecoder(label).decode(bytes);
    } catch {
      // Unknown encoding label: fall through to UTF-8.
    }
  }
  return new TextDecoder("utf-8").decode(bytes);
}

/**
 * Collects `<meta>` tags into a map of lower-cased `property`/`name` key to
 * entity-decoded `content`. Attribute order inside the tag does not matter,
 * the first non-empty occurrence of a key wins, and a quoted value may
 * contain the other kind of quote (e.g. `content="It's here"`).
 */
function collectMetaTags(html: string): Map<string, string> {
  const found = new Map<string, string>();
  for (const tag of html.matchAll(META_TAG_RE)) {
    const attrs = new Map<string, string>();
    for (const attr of (tag[1] ?? "").matchAll(ATTRIBUTE_RE)) {
      const attrName = (attr[1] ?? "").toLowerCase();
      if (!attrs.has(attrName)) {
        attrs.set(attrName, attr[2] ?? attr[3] ?? attr[4] ?? "");
      }
    }
    const key = (attrs.get("property") ?? attrs.get("name"))?.trim().toLowerCase();
    const raw = (attrs.get("content") ?? "").trim();
    if (!key || !raw || found.has(key)) continue;
    found.set(key, decodeEntities(raw));
  }
  return found;
}

/**
 * Picks title / description / image / site name out of `html`.
 * Parsed with regexes because there is no DOM in Node. Relative image URLs
 * are resolved against `baseUrl`. Returns `null` when the page offers no
 * title, description or image at all.
 */
function extractPreview(html: string, baseUrl: URL): PreviewFields | null {
  const meta = collectMetaTags(html);
  const firstOf = (...keys: string[]): string | undefined => {
    for (const key of keys) {
      const value = meta.get(key);
      if (value) return value;
    }
    return undefined;
  };

  const titleTag = /<title\b[^>]*>([^<]+)<\/title>/i.exec(html)?.[1]?.trim();
  const title =
    firstOf("og:title", "twitter:title") ??
    (titleTag ? decodeEntities(titleTag) : undefined);
  const description = firstOf("og:description", "twitter:description", "description");
  let image = firstOf(
    "og:image",
    "og:image:secure_url",
    "og:image:url",
    "twitter:image",
    "twitter:image:src",
  );
  const siteName = firstOf("og:site_name");

  if (image) {
    try {
      image = new URL(image, baseUrl).toString();
    } catch {
      // Keep the raw value if it cannot be resolved.
    }
  }

  if (!title && !description && !image) return null;
  return { title, description, image, siteName };
}

/**
 * Decodes the HTML character references that commonly appear in meta tags:
 * the named entities in `NAMED_ENTITIES` plus decimal and hexadecimal numeric
 * references. Done in a single pass so already-escaped text is not decoded
 * twice. Unknown names and numeric references that are not valid Unicode
 * scalar values are left untouched instead of throwing.
 */
function decodeEntities(s: string): string {
  return s.replace(
    /&(#x[0-9a-f]+|#\d+|[a-z][a-z0-9]*);/gi,
    (whole: string, body: string): string => {
      if (body.charAt(0) !== "#") {
        return NAMED_ENTITIES.get(body) ?? whole;
      }
      const isHex = body.charAt(1).toLowerCase() === "x";
      const code = isHex ? parseInt(body.slice(2), 16) : parseInt(body.slice(1), 10);
      const isScalar =
        Number.isInteger(code) &&
        code > 0 &&
        code <= 0x10ffff &&
        !(code >= 0xd800 && code <= 0xdfff);
      return isScalar ? String.fromCodePoint(code) : whole;
    },
  );
}