const express = require('express');
const cors = require('cors');
const fs = require('fs').promises;
const path = require('path');
const axios = require('axios');
const cheerio = require('cheerio');

// Lazy load puppeteer (only if needed)
let puppeteer = null;
let puppeteerAvailable = null;

/**
 * Lazily loads `puppeteer-core` and caches the outcome.
 *
 * @returns {Promise<object|null>} The puppeteer-core module, or null when it
 *   cannot be required (a failed attempt is remembered and not retried).
 */
async function getPuppeteer() {
  if (puppeteerAvailable === false) {
    return null; // Already tried and failed
  }

  if (!puppeteer) {
    try {
      puppeteer = require('puppeteer-core');
      puppeteerAvailable = true;
      console.log('Puppeteer-core loaded successfully');
    } catch (e) {
      console.warn('Puppeteer-core not available:', e.message);
      puppeteerAvailable = false;
      return null;
    }
  }

  return puppeteer;
}

/**
 * Finds a system Chromium/Chrome executable.
 *
 * Lookup order: the CHROME_EXECUTABLE_PATH environment variable, `which` for
 * a few common binary names, then a `find` over /nix/store.
 *
 * @returns {string|null} Path to the executable, or null when none was found.
 */
function findChromeExecutable() {
  const { execSync } = require('child_process');

  // Check environment variable first
  if (process.env.CHROME_EXECUTABLE_PATH) {
    return process.env.CHROME_EXECUTABLE_PATH;
  }

  // Try which command for common names
  const commands = ['chromium', 'chromium-browser', 'google-chrome', 'google-chrome-stable'];
  for (const cmd of commands) {
    try {
      const result = execSync(`which ${cmd} 2>/dev/null`, { encoding: 'utf8' }).trim();
      if (result) {
        return result;
      }
    } catch (e) {
      // `which` exits non-zero when the command is missing; try the next name
    }
  }

  // Try common NixOS paths
  try {
    const nixPath = execSync(
      'find /nix/store -name chromium -type f -executable 2>/dev/null | head -1',
      { encoding: 'utf8' }
    ).trim();
    if (nixPath) {
      return nixPath;
    }
  } catch (e) {
    // No /nix/store or no `find`; treated as "not found"
  }

  return null;
}

const app = express();
const PORT = process.env.PORT || 3000;
const DATA_FILE = path.join(__dirname, 'data', 'links.json');
const LISTS_FILE = path.join(__dirname, 'data', 'lists.json');

// Middleware
app.use(cors());
app.use(express.json());
app.use(express.static('public'));

/**
 * Ensures the data directory exists and that both JSON stores exist,
 * creating each missing file with an empty array.
 *
 * @returns {Promise<void>}
 */
async function ensureDataDir() {
  const dataDir = path.dirname(DATA_FILE);
  try {
    await fs.access(dataDir);
  } catch {
    await fs.mkdir(dataDir, { recursive: true });
  }

  for (const file of [DATA_FILE, LISTS_FILE]) {
    try {
      await fs.access(file);
    } catch {
      await fs.writeFile(file, JSON.stringify([]));
    }
  }
}

/**
 * Reads and parses a JSON store, falling back to an empty array on failure.
 *
 * A missing file is expected and stays silent; any other failure (for
 * example corrupt JSON) is logged so it does not go unnoticed.
 *
 * @param {string} file - Absolute path of the JSON file.
 * @returns {Promise<any>} Parsed file content, or [] when reading/parsing fails.
 */
async function readJsonStore(file) {
  try {
    const data = await fs.readFile(file, 'utf8');
    return JSON.parse(data);
  } catch (error) {
    if (error.code !== 'ENOENT') {
      console.error(`Failed to read ${file}:`, error.message);
    }
    return [];
  }
}

/**
 * Reads links from file.
 * @returns {Promise<Array<object>>} Stored links, or [] when unreadable.
 */
async function readLinks() {
  return readJsonStore(DATA_FILE);
}

/**
 * Writes links to file (pretty-printed JSON).
 * @param {Array<object>} links - Full list of links to persist.
 * @returns {Promise<void>}
 */
async function writeLinks(links) {
  await fs.writeFile(DATA_FILE, JSON.stringify(links, null, 2));
}

/**
 * Reads lists from file.
 * @returns {Promise<Array<object>>} Stored lists, or [] when unreadable.
 */
async function readLists() {
  return readJsonStore(LISTS_FILE);
}

/**
 * Writes lists to file (pretty-printed JSON).
 * @param {Array<object>} lists - Full list of lists to persist.
 * @returns {Promise<void>}
 */
async function writeLists(lists) {
  await fs.writeFile(LISTS_FILE, JSON.stringify(lists, null, 2));
}

// Helper to wait (replacement for deprecated page.waitForTimeout)
const wait = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

/**
 * Launches a headless browser, loads the URL and returns the rendered HTML.
 * The browser is always closed exactly once, whether or not loading succeeds.
 *
 * @param {object} pptr - The puppeteer-core module.
 * @param {string} executablePath - Path of the Chrome/Chromium binary.
 * @param {string} url - Page to render.
 * @returns {Promise<string>} Rendered HTML of the page.
 */
async function renderPageHtml(pptr, executablePath, url) {
  const browser = await pptr.launch({
    headless: 'new',
    executablePath: executablePath,
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',
      '--disable-blink-features=AutomationControlled',
      '--disable-features=IsolateOrigins,site-per-process',
      '--disable-gpu'
    ]
  });

  try {
    const page = await browser.newPage();

    // Set realistic viewport and user agent
    await page.setViewport({ width: 1920, height: 1080 });
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');

    // Add extra headers to look more like a real browser
    await page.setExtraHTTPHeaders({
      'Accept-Language': 'en-US,en;q=0.9,de;q=0.8',
      'Accept-Encoding': 'gzip, deflate, br',
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
      'Connection': 'keep-alive',
      'Upgrade-Insecure-Requests': '1'
    });

    console.log(`Navigating to ${url}...`);

    // Navigate to the page with longer timeout
    await page.goto(url, { waitUntil: 'networkidle2', timeout: 60000 });

    // Wait a bit for any lazy-loaded content and images
    console.log('Waiting for content to load...');
    await wait(3000);

    // Scroll a bit to trigger lazy loading
    await page.evaluate(() => {
      window.scrollTo(0, 300);
    });
    await wait(1000);

    // Get the rendered HTML
    console.log('Extracting HTML content...');
    return await page.content();
  } finally {
    try {
      await browser.close();
    } catch (e) {
      // Ignore close errors; the original failure (if any) is more useful
    }
  }
}

/**
 * Extracts metadata using Puppeteer (for JavaScript-heavy sites).
 *
 * @param {string} url - Page to load in a headless browser.
 * @returns {Promise<{title: string, description: string, image: string}>}
 * @throws {Error} When puppeteer-core or a Chrome executable is unavailable,
 *   or when loading/processing the page fails.
 */
async function extractMetadataWithPuppeteer(url) {
  const pptr = await getPuppeteer();
  if (!pptr) {
    throw new Error('Puppeteer not available');
  }

  try {
    console.log('Launching Puppeteer browser...');

    // Find system Chrome/Chromium executable
    const executablePath = findChromeExecutable();
    if (!executablePath) {
      throw new Error('Chrome/Chromium not found. Please install it via NixOS or set CHROME_EXECUTABLE_PATH environment variable.');
    }
    console.log(`Using Chrome executable: ${executablePath}`);

    const html = await renderPageHtml(pptr, executablePath, url);
    console.log('Browser closed, processing HTML...');

    // Use the same extraction logic as the regular function
    return await extractMetadataFromHTML(html, url);
  } catch (error) {
    console.error('Puppeteer extraction error:', error.message);
    throw error;
  }
}

// JSON-LD @type values this extractor knows how to read
const JSON_LD_TYPES = ['Product', 'WebPage', 'Offer'];

// True when the value is a JSON-LD node of one of the supported types.
const isSupportedJsonLd = (item) => Boolean(item) && JSON_LD_TYPES.includes(item['@type']);

// Returns the first argument that is a non-blank string, or '' when none is.
// JSON-LD fields may legally be objects/arrays; those must not reach .trim().
const firstString = (...values) =>
  values.find((value) => typeof value === 'string' && value.trim().length > 0) || '';

// True for image URLs that look like decoration rather than content.
const isDecorativeSrc = (src) =>
  ['logo', 'icon', 'avatar', 'spacer', 'pixel'].some((marker) => src.includes(marker));

// Reads an image URL from `src` or one of the common lazy-loading attributes.
const extractImgSrc = (img) =>
  img.attr('src') ||
  img.attr('data-src') ||
  img.attr('data-lazy-src') ||
  img.attr('data-original') ||
  img.attr('data-image') ||
  img.attr('data-lazy') ||
  img.attr('data-url');

/**
 * Picks a URL from an element's `srcset` attribute.
 *
 * Takes the last candidate carrying a width (`640w`) or pixel-density
 * (`2x`, `1.5x`) descriptor, on the assumption that candidates are listed in
 * ascending resolution; otherwise takes the first URL in the attribute.
 *
 * @param {object} img - Cheerio-wrapped element.
 * @returns {string|null} The chosen URL, or null when there is no usable srcset.
 */
function extractFromSrcset(img) {
  const srcset = img.attr('srcset');
  if (!srcset) {
    return null;
  }

  // Density descriptors may be integers ("2x") as well as decimals ("1.5x")
  const candidates = srcset.match(/([^\s,]+)\s+(\d+w|\d+(?:\.\d+)?x)/g);
  if (candidates && candidates.length > 0) {
    const lastCandidate = candidates[candidates.length - 1];
    const urlMatch = lastCandidate.match(/^([^\s]+)/);
    return urlMatch ? urlMatch[1] : null;
  }

  // Fallback: just get first URL from srcset
  const firstUrl = srcset.match(/^([^\s,]+)/);
  return firstUrl ? firstUrl[1] : null;
}

/**
 * Finds the first supported JSON-LD node (Product, WebPage or Offer) among
 * the page's `application/ld+json` scripts.
 *
 * @param {Function} $ - Cheerio root for the page.
 * @returns {object|null} The JSON-LD node, or null when none is found.
 */
function findJsonLd($) {
  let jsonLdData = null;

  $('script[type="application/ld+json"]').each(function () {
    try {
      let jsonData = JSON.parse($(this).html());

      // Handle arrays of structured data
      if (Array.isArray(jsonData)) {
        jsonData = jsonData.find(isSupportedJsonLd) || jsonData[0];
      }

      if (isSupportedJsonLd(jsonData)) {
        jsonLdData = jsonData;
        return false; // break
      }
    } catch (e) {
      // Ignore parse errors and move on to the next script tag
    }
    return undefined;
  });

  return jsonLdData;
}

/**
 * Reads an image URL out of a JSON-LD `image` value, which may be a string,
 * an object with a `url`, or an array of either.
 *
 * @param {*} value - The JSON-LD `image` value.
 * @returns {string} The image URL, or '' when none can be read.
 */
function pickJsonLdImage(value) {
  const candidate = Array.isArray(value) ? value[0] : value;
  if (typeof candidate === 'string') {
    return candidate;
  }
  if (candidate && typeof candidate.url === 'string') {
    return candidate.url;
  }
  return '';
}

/**
 * Finds the best image URL for the page, trying strategies in priority order.
 * The returned value may still be relative to the page.
 *
 * @param {Function} $ - Cheerio root for the page.
 * @param {string} url - URL of the page (used for site-specific rules).
 * @param {object|null} jsonLdData - JSON-LD node found on the page, if any.
 * @returns {string} Image URL, or '' when nothing suitable is found.
 */
function findImage($, url, jsonLdData) {
  let image = '';

  // Priority 1: Product container images (most specific - check BEFORE meta tags)
  const productContainerSelectors = [
    '.product-container img',
    '[class*="product-container" i] img',
    '#product-container img',
    '.product-container picture img',
    '[class*="product-container" i] picture img'
  ];

  for (const selector of productContainerSelectors) {
    const imgs = $(selector);
    if (imgs.length > 0) {
      // Try to find the main product image (usually the first one that's not a thumbnail)
      for (let i = 0; i < imgs.length; i++) {
        const img = $(imgs[i]);
        const src = extractImgSrc(img);
        // 'thumb' also covers 'thumbnail'
        if (src && !src.includes('thumb') && !src.includes('icon')) {
          image = extractFromSrcset(img) || src;
          break;
        }
      }

      // If no good image found, just take the first one
      if (!image) {
        const firstImg = $(imgs[0]);
        image = extractFromSrcset(firstImg) || extractImgSrc(firstImg);
      }

      if (image) break;
    }
  }

  // Priority 2: Other product-specific containers (before meta tags)
  if (!image) {
    const productImageContainers = [
      '[data-testid="product-image"] img',
      '[data-testid="productImage"] img',
      '.product-image img',
      '.product-gallery img',
      '.product__image img',
      '.product-images img',
      '[class*="product-image" i] img',
      '[class*="product-gallery" i] img',
      '[id*="product-image" i] img'
    ];

    for (const selector of productImageContainers) {
      const img = $(selector).first();
      if (img.length) {
        const imgSrc = extractImgSrc(img);
        if (imgSrc) {
          image = extractFromSrcset(img) || imgSrc;
          if (image) break;
        }
      }
    }
  }

  // Priority 3: Try Open Graph and Twitter Card images (after product containers)
  if (!image) {
    image =
      $('meta[property="og:image"]').attr('content') ||
      $('meta[name="twitter:image"]').attr('content') ||
      $('meta[name="twitter:image:src"]').attr('content');
  }

  // Priority 4: Try JSON-LD image
  if (!image && jsonLdData && jsonLdData.image) {
    image = pickJsonLdImage(jsonLdData.image);
  }

  // Priority 5: Galaxus-specific
  if (!image && url.includes('galaxus.')) {
    const galaxusImg = $('img[alt*="Produktbild" i], img[alt*="Produkt" i]').first();
    if (galaxusImg.length) {
      image = extractImgSrc(galaxusImg);
    }

    if (!image) {
      const galleryImg = $('[class*="product" i] img, [class*="image" i] img, [class*="gallery" i] img').first();
      if (galleryImg.length) {
        image = extractImgSrc(galleryImg);
      }
    }
  }

  // Priority 6: Generic product selectors
  if (!image) {
    const genericSelectors = [
      '[itemprop="image"]',
      'picture img',
      'figure img',
      'main img',
      '[role="img"]',
      'article img',
      '[class*="main-image" i] img',
      '[id*="main-image" i] img'
    ];

    for (const selector of genericSelectors) {
      const img = $(selector).first();
      if (img.length) {
        const imgSrc = extractImgSrc(img);
        if (imgSrc && !isDecorativeSrc(imgSrc)) {
          image = extractFromSrcset(img) || imgSrc;
          if (image) break;
        }
      }
    }
  }

  // Fallback to first meaningful image
  if (!image) {
    $('img').each(function () {
      const img = $(this);
      const src = img.attr('src') || img.attr('data-src') || img.attr('data-lazy-src');

      // Skip icons, logos and tracking pixels
      if (src && !isDecorativeSrc(src)) {
        image = src;
        return false; // break
      }
      return undefined;
    });
  }

  return image || '';
}

/**
 * Common extraction logic that works with an HTML string.
 *
 * @param {string} html - Page markup.
 * @param {string} url - URL the markup was loaded from; used to resolve
 *   relative image URLs and for site-specific rules.
 * @returns {Promise<{title: string, description: string, image: string}>}
 *   Whitespace-normalised title, description (at most 500 characters) and an
 *   image URL ('' when none is found).
 * @throws {TypeError} When `url` is not a valid absolute URL.
 */
async function extractMetadataFromHTML(html, url) {
  const $ = cheerio.load(html);
  // Validates `url` up front; it is the base for relative image URLs below
  const pageUrl = new URL(url);

  // Try to extract JSON-LD structured data (common in e-commerce sites)
  const jsonLdData = findJsonLd($);

  // Extract title with priority order
  let title = jsonLdData
    ? firstString(jsonLdData.name, jsonLdData.headline, jsonLdData.title)
    : '';

  if (!title) {
    title =
      $('meta[property="og:title"]').attr('content') ||
      $('meta[name="twitter:title"]').attr('content') ||
      $('h1').first().text().trim() ||
      $('title').text().trim() ||
      '';
  }

  title = title || 'Untitled';

  // Extract description with priority order
  let description = jsonLdData
    ? firstString(jsonLdData.description, jsonLdData.about)
    : '';

  if (!description) {
    description =
      $('meta[property="og:description"]').attr('content') ||
      $('meta[name="twitter:description"]').attr('content') ||
      $('meta[name="description"]').attr('content') ||
      '';
  }

  // If still no description, try to find product description sections
  if (!description) {
    const descSelectors = [
      '[data-testid="product-description"]',
      '.product-description',
      '.description',
      '[itemprop="description"]',
      'section[aria-label*="description" i]',
      'section[aria-label*="beschreibung" i]' // German
    ];

    for (const selector of descSelectors) {
      const descText = $(selector).first().text().trim();
      if (descText && descText.length > 20) {
        description = descText;
        break;
      }
    }
  }

  // Fallback to first paragraph if still no description
  if (!description) {
    $('p').each(function () {
      const text = $(this).text().trim();
      if (text.length > 50 && text.length < 1000) {
        description = text;
        return false; // break
      }
      return undefined;
    });
  }

  // Extract image with multiple strategies
  let image = findImage($, url, jsonLdData);

  // Convert relative URLs (including protocol-relative ones) to absolute
  if (image && !image.startsWith('http')) {
    try {
      image = new URL(image, pageUrl).href;
    } catch (e) {
      image = ''; // An unresolvable image URL is not worth failing the whole page
    }
  }

  // Clean up title and description
  title = title.trim().replace(/\s+/g, ' ');
  description = description.trim().replace(/\s+/g, ' ').substring(0, 500);

  return {
    title: title,
    description: description,
    image: image
  };
}

// Accept any status below 500 so that 403/429 can be handled explicitly
const acceptBelow500 = (status) => status >= 200 && status < 500;

/**
 * Extracts metadata from a URL.
 *
 * Fetches the page with browser-like headers. On 403/429 it tries Puppeteer
 * (when available) and then a retry with simpler headers. Never throws: any
 * failure yields a placeholder result.
 *
 * NOTE(review): `url` comes from the request body and is fetched server-side;
 * only the scheme is validated (see isValidUrl), so internal hosts are
 * reachable. Consider restricting targets if this is exposed publicly.
 *
 * @param {string} url - Page to fetch.
 * @returns {Promise<{title: string, description: string, image: string}>}
 */
async function extractMetadata(url) {
  try {
    const urlObj = new URL(url);

    // More realistic browser headers to avoid 403 errors
    const headers = {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
      'Accept-Language': 'en-US,en;q=0.9,de;q=0.8',
      'Accept-Encoding': 'gzip, deflate, br',
      'DNT': '1',
      'Connection': 'keep-alive',
      'Upgrade-Insecure-Requests': '1',
      'Sec-Fetch-Dest': 'document',
      'Sec-Fetch-Mode': 'navigate',
      'Sec-Fetch-Site': 'none',
      'Sec-Fetch-User': '?1',
      'Cache-Control': 'max-age=0',
      'Referer': urlObj.origin + '/'
    };

    const response = await axios.get(url, {
      headers: headers,
      timeout: 20000,
      maxRedirects: 5,
      validateStatus: acceptBelow500 // Don't throw on 403, we'll handle it
    });

    // Check if we got blocked - use Puppeteer as fallback
    if (response.status === 403 || response.status === 429) {
      console.log(`Received ${response.status} status, trying Puppeteer fallback...`);

      const pptr = await getPuppeteer();
      if (pptr) {
        try {
          console.log('Using Puppeteer to extract metadata...');
          return await extractMetadataWithPuppeteer(url);
        } catch (puppeteerError) {
          console.error('Puppeteer extraction failed:', puppeteerError.message);
          // Fall through to retry with simpler headers
        }
      }

      // Fallback: try simpler headers if Puppeteer not available or failed
      console.log('Trying with simpler headers...');
      const retryHeaders = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9'
      };

      const retryResponse = await axios.get(url, {
        headers: retryHeaders,
        timeout: 20000,
        maxRedirects: 5,
        validateStatus: acceptBelow500
      });

      if (retryResponse.status === 403 || retryResponse.status === 429) {
        throw new Error('Site is blocking requests. Please try again later or the site may require JavaScript rendering.');
      }

      if (retryResponse.status !== 200) {
        throw new Error(`Request failed with status code ${retryResponse.status}`);
      }

      // Use shared extraction function
      return await extractMetadataFromHTML(retryResponse.data, url);
    } else if (response.status !== 200) {
      throw new Error(`Request failed with status code ${response.status}`);
    }

    // Use shared extraction function
    return await extractMetadataFromHTML(response.data, url);
  } catch (error) {
    console.error('Error extracting metadata:', error.message);
    return {
      title: 'Error loading page',
      description: 'Could not extract metadata from this URL',
      image: ''
    };
  }
}

// API Routes

// Get all links
app.get('/api/links', async (req, res) => {
  try {
    const links = await readLinks();
    res.json(links);
  } catch (error) {
    res.status(500).json({ error: 'Failed to read links' });
  }
});

// Search links
app.get('/api/links/search', async (req, res) => {
  try {
    // A repeated ?q=...&q=... arrives as an array; treat that as "no query"
    const rawQuery = req.query.q;
    const query = typeof rawQuery === 'string' ? rawQuery.toLowerCase() : '';
    const links = await readLinks();

    if (!query) {
      return res.json(links);
    }

    const filtered = links.filter((link) => {
      const titleMatch = link.title?.toLowerCase().includes(query);
      const descMatch = link.description?.toLowerCase().includes(query);
      const urlMatch = link.url?.toLowerCase().includes(query);
      return titleMatch || descMatch || urlMatch;
    });

    res.json(filtered);
  } catch (error) {
    res.status(500).json({ error: 'Failed to search links' });
  }
});

// Add a new link
app.post('/api/links', async (req, res) => {
  try {
    const { url } = req.body;

    if (!url || !isValidUrl(url)) {
      return res.status(400).json({ error: 'Invalid URL' });
    }

    // Check if link already exists
    const links = await readLinks();
    const existingLink = links.find((link) => link.url === url);
    if (existingLink) {
      return res.status(409).json({ error: 'Link already exists' });
    }

    // Extract metadata
    const metadata = await extractMetadata(url);

    // Create new link
    const newLink = {
      id: Date.now().toString(),
      url: url,
      title: metadata.title,
      description: metadata.description,
      image: metadata.image,
      createdAt: new Date().toISOString()
    };

    links.unshift(newLink); // Add to beginning
    await writeLinks(links);

    res.status(201).json(newLink);
  } catch (error) {
    console.error('Error adding link:', error);
    res.status(500).json({ error: 'Failed to add link' });
  }
});

// Archive/Unarchive a link
app.patch('/api/links/:id/archive', async (req, res) => {
  try {
    const { id } = req.params;
    const { archived } = req.body;

    if (typeof archived !== 'boolean') {
      return res.status(400).json({ error: 'archived must be a boolean' });
    }

    const links = await readLinks();
    const linkIndex = links.findIndex((link) => link.id === id);

    if (linkIndex === -1) {
      return res.status(404).json({ error: 'Link not found' });
    }

    links[linkIndex].archived = archived;
    await writeLinks(links);

    res.json(links[linkIndex]);
  } catch (error) {
    res.status(500).json({ error: 'Failed to update link' });
  }
});

// Delete a link
app.delete('/api/links/:id', async (req, res) => {
  try {
    const { id } = req.params;
    const links = await readLinks();
    const filtered = links.filter((link) => link.id !== id);

    if (filtered.length === links.length) {
      return res.status(404).json({ error: 'Link not found' });
    }

    await writeLinks(filtered);
    res.json({ message: 'Link deleted successfully' });
  } catch (error) {
    res.status(500).json({ error: 'Failed to delete link' });
  }
});

// Update link's lists
app.patch('/api/links/:id/lists', async (req, res) => {
  try {
    const { id } = req.params;
    const { listIds } = req.body;

    if (!Array.isArray(listIds)) {
      return res.status(400).json({ error: 'listIds must be an array' });
    }

    const links = await readLinks();
    const linkIndex = links.findIndex((link) => link.id === id);

    if (linkIndex === -1) {
      return res.status(404).json({ error: 'Link not found' });
    }

    links[linkIndex].listIds = listIds;
    await writeLinks(links);

    res.json(links[linkIndex]);
  } catch (error) {
    res.status(500).json({ error: 'Failed to update link lists' });
  }
});

// Lists API Routes

// Get all lists
app.get('/api/lists', async (req, res) => {
  try {
    const lists = await readLists();
    res.json(lists);
  } catch (error) {
    res.status(500).json({ error: 'Failed to read lists' });
  }
});

// Create a new list
app.post('/api/lists', async (req, res) => {
  try {
    const { name } = req.body;

    if (!name || typeof name !== 'string' || name.trim().length === 0) {
      return res.status(400).json({ error: 'List name is required' });
    }

    const lists = await readLists();

    // Check if list with same name already exists
    const existingList = lists.find((list) => list.name.toLowerCase() === name.trim().toLowerCase());
    if (existingList) {
      return res.status(409).json({ error: 'List with this name already exists' });
    }

    const newList = {
      id: Date.now().toString(),
      name: name.trim(),
      createdAt: new Date().toISOString(),
      public: false
    };

    lists.push(newList);
    await writeLists(lists);

    res.status(201).json(newList);
  } catch (error) {
    res.status(500).json({ error: 'Failed to create list' });
  }
});

// Update a list
app.put('/api/lists/:id', async (req, res) => {
  try {
    const { id } = req.params;
    const { name } = req.body;

    if (!name || typeof name !== 'string' || name.trim().length === 0) {
      return res.status(400).json({ error: 'List name is required' });
    }

    const lists = await readLists();
    const listIndex = lists.findIndex((list) => list.id === id);

    if (listIndex === -1) {
      return res.status(404).json({ error: 'List not found' });
    }

    // Check if another list with same name exists
    const existingList = lists.find(
      (list) => list.id !== id && list.name.toLowerCase() === name.trim().toLowerCase()
    );
    if (existingList) {
      return res.status(409).json({ error: 'List with this name already exists' });
    }

    lists[listIndex].name = name.trim();
    await writeLists(lists);

    res.json(lists[listIndex]);
  } catch (error) {
    res.status(500).json({ error: 'Failed to update list' });
  }
});

// Toggle list public status
app.patch('/api/lists/:id/public', async (req, res) => {
  try {
    const { id } = req.params;
    const { public: isPublic } = req.body;

    if (typeof isPublic !== 'boolean') {
      return res.status(400).json({ error: 'public must be a boolean' });
    }

    const lists = await readLists();
    const listIndex = lists.findIndex((list) => list.id === id);

    if (listIndex === -1) {
      return res.status(404).json({ error: 'List not found' });
    }

    lists[listIndex].public = isPublic;
    await writeLists(lists);

    res.json(lists[listIndex]);
  } catch (error) {
    res.status(500).json({ error: 'Failed to update list public status' });
  }
});

// Delete a list
app.delete('/api/lists/:id', async (req, res) => {
  try {
    const { id } = req.params;
    const lists = await readLists();
    const filtered = lists.filter((list) => list.id !== id);

    if (filtered.length === lists.length) {
      return res.status(404).json({ error: 'List not found' });
    }

    // Remove this list from all links
    const links = await readLinks();
    links.forEach((link) => {
      if (link.listIds && Array.isArray(link.listIds)) {
        link.listIds = link.listIds.filter((listId) => listId !== id);
      }
    });
    await writeLinks(links);

    await writeLists(filtered);
    res.json({ message: 'List deleted successfully' });
  } catch (error) {
    res.status(500).json({ error: 'Failed to delete list' });
  }
});

/**
 * Validates that a string is an absolute http(s) URL.
 *
 * @param {string} string - Candidate URL.
 * @returns {boolean} True when it parses as a URL with http: or https: scheme.
 */
function isValidUrl(string) {
  try {
    const url = new URL(string);
    return url.protocol === 'http:' || url.protocol === 'https:';
  } catch (_) {
    return false;
  }
}

/**
 * Initializes the data files and starts the HTTP server.
 *
 * @returns {Promise<void>}
 */
async function startServer() {
  await ensureDataDir();
  app.listen(PORT, () => {
    console.log(`LinkDing server running on http://localhost:${PORT}`);
  });
}

// Without a handler a failed start-up would be an unhandled rejection
startServer().catch((error) => {
  console.error('Failed to start server:', error);
  process.exit(1);
});