feat: the app

This commit is contained in:
2025-11-04 20:11:53 +01:00
commit f712c84a2e
11 changed files with 4307 additions and 0 deletions

682
server.js Normal file
View File

@@ -0,0 +1,682 @@
const express = require('express');
const cors = require('cors');
const fs = require('fs').promises;
const path = require('path');
const axios = require('axios');
const cheerio = require('cheerio');
// Lazy load puppeteer (only if needed)
let puppeteer = null;
let puppeteerAvailable = null;
async function getPuppeteer() {
if (puppeteerAvailable === false) {
return null; // Already tried and failed
}
if (!puppeteer) {
try {
puppeteer = require('puppeteer-core');
puppeteerAvailable = true;
console.log('Puppeteer-core loaded successfully');
} catch (e) {
console.warn('Puppeteer-core not available:', e.message);
puppeteerAvailable = false;
return null;
}
}
return puppeteer;
}
// Find system Chromium/Chrome executable
function findChromeExecutable() {
const { execSync } = require('child_process');
// Check environment variable first
if (process.env.CHROME_EXECUTABLE_PATH) {
return process.env.CHROME_EXECUTABLE_PATH;
}
// Try which command for common names
const commands = ['chromium', 'chromium-browser', 'google-chrome', 'google-chrome-stable'];
for (const cmd of commands) {
try {
const result = execSync(`which ${cmd} 2>/dev/null`, { encoding: 'utf8' }).trim();
if (result) {
return result;
}
} catch (e) {
// Continue to next command
}
}
// Try common NixOS paths
try {
const nixPaths = execSync('find /nix/store -name chromium -type f -executable 2>/dev/null | head -1', { encoding: 'utf8' }).trim();
if (nixPaths) return nixPaths;
} catch (e) {
// Ignore
}
return null;
}
const app = express();
const PORT = process.env.PORT || 3000;
const DATA_FILE = path.join(__dirname, 'data', 'links.json');
// Middleware
app.use(cors());
app.use(express.json());
app.use(express.static('public'));
// Ensure data directory exists
async function ensureDataDir() {
const dataDir = path.dirname(DATA_FILE);
try {
await fs.access(dataDir);
} catch {
await fs.mkdir(dataDir, { recursive: true });
}
try {
await fs.access(DATA_FILE);
} catch {
await fs.writeFile(DATA_FILE, JSON.stringify([]));
}
}
// Read links from file
async function readLinks() {
try {
const data = await fs.readFile(DATA_FILE, 'utf8');
return JSON.parse(data);
} catch (error) {
return [];
}
}
// Write links to file
async function writeLinks(links) {
await fs.writeFile(DATA_FILE, JSON.stringify(links, null, 2));
}
// Extract metadata using Puppeteer (for JavaScript-heavy sites)
async function extractMetadataWithPuppeteer(url) {
const pptr = await getPuppeteer();
if (!pptr) {
throw new Error('Puppeteer not available');
}
let browser = null;
try {
console.log('Launching Puppeteer browser...');
// Find system Chrome/Chromium executable
const executablePath = findChromeExecutable();
if (!executablePath) {
throw new Error('Chrome/Chromium not found. Please install it via NixOS or set CHROME_EXECUTABLE_PATH environment variable.');
}
console.log(`Using Chrome executable: ${executablePath}`);
browser = await pptr.launch({
headless: 'new',
executablePath: executablePath,
args: [
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-blink-features=AutomationControlled',
'--disable-features=IsolateOrigins,site-per-process',
'--disable-gpu'
]
});
const page = await browser.newPage();
// Set realistic viewport and user agent
await page.setViewport({ width: 1920, height: 1080 });
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
// Add extra headers to look more like a real browser
await page.setExtraHTTPHeaders({
'Accept-Language': 'en-US,en;q=0.9,de;q=0.8',
'Accept-Encoding': 'gzip, deflate, br',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1'
});
console.log(`Navigating to ${url}...`);
// Navigate to the page with longer timeout
await page.goto(url, {
waitUntil: 'networkidle2',
timeout: 60000
});
// Helper function to wait (replacement for deprecated waitForTimeout)
const wait = (ms) => new Promise(resolve => setTimeout(resolve, ms));
// Wait a bit for any lazy-loaded content and images
console.log('Waiting for content to load...');
await wait(3000);
// Scroll a bit to trigger lazy loading
await page.evaluate(() => {
window.scrollTo(0, 300);
});
await wait(1000);
// Get the rendered HTML
console.log('Extracting HTML content...');
const html = await page.content();
await browser.close();
console.log('Browser closed, processing HTML...');
// Use the same extraction logic as the regular function
return await extractMetadataFromHTML(html, url);
} catch (error) {
if (browser) {
try {
await browser.close();
} catch (e) {
// Ignore close errors
}
}
console.error('Puppeteer extraction error:', error.message);
throw error;
}
}
// Common extraction logic that works with HTML string
async function extractMetadataFromHTML(html, url) {
const $ = cheerio.load(html);
const urlObj = new URL(url);
// Try to extract JSON-LD structured data (common in e-commerce sites)
let jsonLdData = null;
$('script[type="application/ld+json"]').each(function() {
try {
const content = $(this).html();
let jsonData = JSON.parse(content);
// Handle arrays of structured data
if (Array.isArray(jsonData)) {
jsonData = jsonData.find(item =>
item['@type'] === 'Product' ||
item['@type'] === 'WebPage' ||
item['@type'] === 'Offer'
) || jsonData[0];
}
if (jsonData && (jsonData['@type'] === 'Product' || jsonData['@type'] === 'WebPage' || jsonData['@type'] === 'Offer')) {
jsonLdData = jsonData;
return false; // break
}
} catch (e) {
// Ignore parse errors
}
});
// Extract title with priority order
let title = '';
if (jsonLdData) {
title = jsonLdData.name || jsonLdData.headline || jsonLdData.title;
}
if (!title) {
title = $('meta[property="og:title"]').attr('content') ||
$('meta[name="twitter:title"]').attr('content') ||
$('h1').first().text().trim() ||
$('title').text().trim() ||
'';
}
title = title || 'Untitled';
// Extract description with priority order
let description = '';
if (jsonLdData) {
description = jsonLdData.description || jsonLdData.about;
}
if (!description) {
description = $('meta[property="og:description"]').attr('content') ||
$('meta[name="twitter:description"]').attr('content') ||
$('meta[name="description"]').attr('content') ||
'';
}
// If still no description, try to find product description sections
if (!description) {
// Try common product description selectors
const descSelectors = [
'[data-testid="product-description"]',
'.product-description',
'.description',
'[itemprop="description"]',
'section[aria-label*="description" i]',
'section[aria-label*="beschreibung" i]' // German
];
for (const selector of descSelectors) {
const descText = $(selector).first().text().trim();
if (descText && descText.length > 20) {
description = descText;
break;
}
}
}
// Fallback to first paragraph if still no description
if (!description) {
$('p').each(function() {
const text = $(this).text().trim();
if (text.length > 50 && text.length < 1000) {
description = text;
return false; // break
}
});
}
// Extract image with multiple strategies
let image = '';
// Helper function to extract image source from an img element
const extractImgSrc = (img) => {
return img.attr('src') ||
img.attr('data-src') ||
img.attr('data-lazy-src') ||
img.attr('data-original') ||
img.attr('data-image') ||
img.attr('data-lazy') ||
img.attr('data-url');
};
// Helper function to extract best image from srcset
const extractFromSrcset = (img) => {
if (!img.attr('srcset')) return null;
const srcset = img.attr('srcset');
// Extract the largest image from srcset (usually the last one)
const srcsetMatches = srcset.match(/([^\s,]+)\s+(\d+w|\d+\.\d+x)/g);
if (srcsetMatches && srcsetMatches.length > 0) {
// Get the last entry which is usually the highest resolution
const lastMatch = srcsetMatches[srcsetMatches.length - 1];
const srcMatch = lastMatch.match(/^([^\s]+)/);
if (srcMatch) {
return srcMatch[1];
}
} else {
// Fallback: just get first URL from srcset
const srcsetMatch = srcset.match(/^([^\s,]+)/);
if (srcsetMatch) {
return srcsetMatch[1];
}
}
return null;
};
// Priority 1: Product container images (most specific - check BEFORE meta tags)
const productContainerSelectors = [
'.product-container img',
'[class*="product-container" i] img',
'#product-container img',
'.product-container picture img',
'[class*="product-container" i] picture img'
];
for (const selector of productContainerSelectors) {
const imgs = $(selector);
if (imgs.length > 0) {
// Try to find the main product image (usually the first one that's not a thumbnail)
for (let i = 0; i < imgs.length; i++) {
const img = $(imgs[i]);
const src = extractImgSrc(img);
if (src && !src.includes('thumb') && !src.includes('thumbnail') && !src.includes('icon')) {
image = extractFromSrcset(img) || src;
break;
}
}
// If no good image found, just take the first one
if (!image && imgs.length > 0) {
const firstImg = $(imgs[0]);
image = extractFromSrcset(firstImg) || extractImgSrc(firstImg);
}
if (image) break;
}
}
// Priority 2: Other product-specific containers (before meta tags)
if (!image) {
const productImageContainers = [
'[data-testid="product-image"] img',
'[data-testid="productImage"] img',
'.product-image img',
'.product-gallery img',
'.product__image img',
'.product-images img',
'[class*="product-image" i] img',
'[class*="product-gallery" i] img',
'[id*="product-image" i] img'
];
for (const selector of productImageContainers) {
const img = $(selector).first();
if (img.length) {
const imgSrc = extractImgSrc(img);
if (imgSrc) {
image = extractFromSrcset(img) || imgSrc;
if (image) break;
}
}
}
}
// Priority 3: Try Open Graph and Twitter Card images (after product containers)
if (!image) {
image = $('meta[property="og:image"]').attr('content') ||
$('meta[name="twitter:image"]').attr('content') ||
$('meta[name="twitter:image:src"]').attr('content');
}
// Priority 4: Try JSON-LD image
if (!image && jsonLdData) {
if (jsonLdData.image) {
if (typeof jsonLdData.image === 'string') {
image = jsonLdData.image;
} else if (Array.isArray(jsonLdData.image) && jsonLdData.image.length > 0) {
image = jsonLdData.image[0];
} else if (jsonLdData.image.url) {
image = jsonLdData.image.url;
}
}
}
// Priority 5: Galaxus-specific (keep existing logic)
if (!image) {
const isGalaxus = url.includes('galaxus.');
if (isGalaxus) {
const galaxusImg = $('img[alt*="Produktbild" i], img[alt*="Produkt" i]').first();
if (galaxusImg.length) {
image = extractImgSrc(galaxusImg);
}
if (!image) {
const galleryImg = $('[class*="product" i] img, [class*="image" i] img, [class*="gallery" i] img').first();
if (galleryImg.length) {
image = extractImgSrc(galleryImg);
}
}
}
}
// Priority 6: Generic product selectors
if (!image) {
const genericSelectors = [
'[itemprop="image"]',
'picture img',
'figure img',
'main img',
'[role="img"]',
'article img',
'[class*="main-image" i] img',
'[id*="main-image" i] img'
];
for (const selector of genericSelectors) {
const img = $(selector).first();
if (img.length) {
const imgSrc = extractImgSrc(img);
if (imgSrc &&
!imgSrc.includes('logo') &&
!imgSrc.includes('icon') &&
!imgSrc.includes('avatar') &&
!imgSrc.includes('spacer') &&
!imgSrc.includes('pixel')) {
image = extractFromSrcset(img) || imgSrc;
if (image) break;
}
}
}
}
// Fallback to first meaningful image
if (!image) {
$('img').each(function() {
const img = $(this);
const src = img.attr('src') ||
img.attr('data-src') ||
img.attr('data-lazy-src');
// Skip very small images, icons, and logos
if (src &&
!src.includes('logo') &&
!src.includes('icon') &&
!src.includes('avatar') &&
!src.includes('spacer') &&
!src.includes('pixel')) {
image = src;
return false; // break
}
});
}
// Convert relative URLs to absolute
if (image && !image.startsWith('http')) {
if (image.startsWith('//')) {
image = urlObj.protocol + image;
} else if (image.startsWith('/')) {
image = urlObj.origin + image;
} else {
image = new URL(image, url).href;
}
}
// Clean up title and description
title = title.trim().replace(/\s+/g, ' ');
description = description.trim().replace(/\s+/g, ' ').substring(0, 500);
return {
title: title,
description: description,
image: image
};
}
// Extract metadata from URL
async function extractMetadata(url) {
try {
const urlObj = new URL(url);
// More realistic browser headers to avoid 403 errors
const headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Language': 'en-US,en;q=0.9,de;q=0.8',
'Accept-Encoding': 'gzip, deflate, br',
'DNT': '1',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-User': '?1',
'Cache-Control': 'max-age=0',
'Referer': urlObj.origin + '/'
};
const response = await axios.get(url, {
headers: headers,
timeout: 20000,
maxRedirects: 5,
validateStatus: function (status) {
return status >= 200 && status < 500; // Don't throw on 403, we'll handle it
}
});
// Check if we got blocked - use Puppeteer as fallback
if (response.status === 403 || response.status === 429) {
console.log(`Received ${response.status} status, trying Puppeteer fallback...`);
const pptr = await getPuppeteer();
if (pptr) {
try {
console.log('Using Puppeteer to extract metadata...');
return await extractMetadataWithPuppeteer(url);
} catch (puppeteerError) {
console.error('Puppeteer extraction failed:', puppeteerError.message);
// Fall through to retry with simpler headers
}
}
// Fallback: try simpler headers if Puppeteer not available or failed
console.log('Trying with simpler headers...');
const retryHeaders = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.9'
};
const retryResponse = await axios.get(url, {
headers: retryHeaders,
timeout: 20000,
maxRedirects: 5,
validateStatus: function (status) {
return status >= 200 && status < 500;
}
});
if (retryResponse.status === 403 || retryResponse.status === 429) {
throw new Error(`Site is blocking requests. Please try again later or the site may require JavaScript rendering.`);
}
if (retryResponse.status !== 200) {
throw new Error(`Request failed with status code ${retryResponse.status}`);
}
// Use shared extraction function
return await extractMetadataFromHTML(retryResponse.data, url);
} else if (response.status !== 200) {
throw new Error(`Request failed with status code ${response.status}`);
}
// Use shared extraction function
return await extractMetadataFromHTML(response.data, url);
} catch (error) {
console.error('Error extracting metadata:', error.message);
return {
title: 'Error loading page',
description: 'Could not extract metadata from this URL',
image: ''
};
}
}
// API Routes
// Get all links
app.get('/api/links', async (req, res) => {
try {
const links = await readLinks();
res.json(links);
} catch (error) {
res.status(500).json({ error: 'Failed to read links' });
}
});
// Search links
app.get('/api/links/search', async (req, res) => {
try {
const query = req.query.q?.toLowerCase() || '';
const links = await readLinks();
if (!query) {
return res.json(links);
}
const filtered = links.filter(link => {
const titleMatch = link.title?.toLowerCase().includes(query);
const descMatch = link.description?.toLowerCase().includes(query);
const urlMatch = link.url?.toLowerCase().includes(query);
return titleMatch || descMatch || urlMatch;
});
res.json(filtered);
} catch (error) {
res.status(500).json({ error: 'Failed to search links' });
}
});
// Add a new link
app.post('/api/links', async (req, res) => {
try {
const { url } = req.body;
if (!url || !isValidUrl(url)) {
return res.status(400).json({ error: 'Invalid URL' });
}
// Check if link already exists
const links = await readLinks();
const existingLink = links.find(link => link.url === url);
if (existingLink) {
return res.status(409).json({ error: 'Link already exists' });
}
// Extract metadata
const metadata = await extractMetadata(url);
// Create new link
const newLink = {
id: Date.now().toString(),
url: url,
title: metadata.title,
description: metadata.description,
image: metadata.image,
createdAt: new Date().toISOString()
};
links.unshift(newLink); // Add to beginning
await writeLinks(links);
res.status(201).json(newLink);
} catch (error) {
console.error('Error adding link:', error);
res.status(500).json({ error: 'Failed to add link' });
}
});
// Delete a link
app.delete('/api/links/:id', async (req, res) => {
try {
const { id } = req.params;
const links = await readLinks();
const filtered = links.filter(link => link.id !== id);
if (filtered.length === links.length) {
return res.status(404).json({ error: 'Link not found' });
}
await writeLinks(filtered);
res.json({ message: 'Link deleted successfully' });
} catch (error) {
res.status(500).json({ error: 'Failed to delete link' });
}
});
// Helper function to validate URL
function isValidUrl(string) {
try {
const url = new URL(string);
return url.protocol === 'http:' || url.protocol === 'https:';
} catch (_) {
return false;
}
}
// Initialize server
async function startServer() {
await ensureDataDir();
app.listen(PORT, () => {
console.log(`LinkDing server running on http://localhost:${PORT}`);
});
}
startServer();