feat: new cluster forming protocoll

This commit is contained in:
2025-10-19 12:51:31 +02:00
parent b6b55c0a6f
commit d166b0b634
3 changed files with 149 additions and 139 deletions

View File

@@ -49,7 +49,7 @@ The backend now includes automatic UDP discovery for SPORE nodes on the network.
### 🚀 How It Works ### 🚀 How It Works
1. **UDP Server**: The backend listens on port 4210 for UDP messages 1. **UDP Server**: The backend listens on port 4210 for UDP messages
2. **Discovery Message**: Nodes send `CLUSTER_DISCOVERY` messages to broadcast address `255.255.255.255:4210` 2. **Heartbeat Message**: Nodes send `CLUSTER_HEARTBEAT` messages to broadcast address `255.255.255.255:4210`
3. **Auto Configuration**: When a discovery message is received, the source IP is automatically used to configure the SporeApiClient 3. **Auto Configuration**: When a discovery message is received, the source IP is automatically used to configure the SporeApiClient
4. **Dynamic Updates**: The system automatically switches to the most recently seen node as the primary connection 4. **Dynamic Updates**: The system automatically switches to the most recently seen node as the primary connection
5. **Health Monitoring**: Continuous monitoring of node availability with automatic failover 5. **Health Monitoring**: Continuous monitoring of node availability with automatic failover
@@ -57,7 +57,7 @@ The backend now includes automatic UDP discovery for SPORE nodes on the network.
### 📡 Discovery Protocol ### 📡 Discovery Protocol
- **Port**: 4210 (configurable via `UDP_PORT` constant) - **Port**: 4210 (configurable via `UDP_PORT` constant)
- **Message**: `CLUSTER_DISCOVERY` (configurable via `DISCOVERY_MESSAGE` constant) - **Message**: `CLUSTER_HEARTBEAT` (configurable via `HEARTBEAT_MESSAGE` constant)
- **Broadcast**: `255.255.255.255:4210` - **Broadcast**: `255.255.255.255:4210`
- **Protocol**: UDP broadcast listening - **Protocol**: UDP broadcast listening
- **Auto-binding**: Automatically binds to the specified port on startup - **Auto-binding**: Automatically binds to the specified port on startup
@@ -72,27 +72,27 @@ npm start
# The server will automatically: # The server will automatically:
# - Start HTTP server on port 3001 # - Start HTTP server on port 3001
# - Start UDP discovery server on port 4210 # - Start UDP discovery server on port 4210
# - Wait for CLUSTER_DISCOVERY messages # - Wait for CLUSTER_HEARTBEAT messages
``` ```
#### Node Configuration #### Node Configuration
SPORE nodes should send discovery messages periodically: SPORE nodes should send heartbeat messages periodically:
```bash ```bash
# Recommended: Send every 30-60 seconds # Recommended: Send every 30-60 seconds
# Message format: "CLUSTER_DISCOVERY" # Message format: "CLUSTER_HEARTBEAT:hostname"
# Target: 255.255.255.255:4210 # Target: 255.255.255.255:4210
``` ```
### 🌐 Discovery Endpoints ### 🌐 Cluster Endpoints
#### Discovery Management #### Cluster Management
- `GET /api/discovery/nodes` - View all discovered nodes and current status - `GET /api/discovery/nodes` - View all cluster nodes and current status
- `POST /api/discovery/refresh` - Manually trigger discovery refresh - `POST /api/discovery/refresh` - Manually trigger cluster refresh
- `POST /api/discovery/primary/:ip` - Manually set a specific node as primary - `POST /api/discovery/primary/:ip` - Manually set a specific node as primary
- `POST /api/discovery/random-primary` - Randomly select a new primary node - `POST /api/discovery/random-primary` - Randomly select a new primary node
#### Health Monitoring #### Health Monitoring
- `GET /api/health` - Comprehensive health check including discovery status - `GET /api/health` - Comprehensive health check including cluster status
### 🧪 Testing & Development ### 🧪 Testing & Development

264
index.js
View File

@@ -41,12 +41,11 @@ app.use(cors({
allowedHeaders: ['Content-Type', 'Authorization'] allowedHeaders: ['Content-Type', 'Authorization']
})); }));
// UDP discovery configuration // UDP heartbeat-only configuration
const UDP_PORT = 4210; const UDP_PORT = 4210;
const DISCOVERY_MESSAGE = 'CLUSTER_DISCOVERY'; const STALE_THRESHOLD_SECONDS = 8; // 8 seconds to accommodate 5-second heartbeat interval
const STALE_THRESHOLD_SECONDS = 3; // 3 seconds for faster detection
// Initialize UDP server for auto discovery // Initialize UDP server for heartbeat-based cluster management
const udpServer = dgram.createSocket('udp4'); const udpServer = dgram.createSocket('udp4');
// Store discovered nodes and their IPs // Store discovered nodes and their IPs
@@ -68,107 +67,16 @@ udpServer.on('message', (msg, rinfo) => {
const message = msg.toString().trim(); const message = msg.toString().trim();
const sourceIp = rinfo.address; const sourceIp = rinfo.address;
const sourcePort = rinfo.port; const sourcePort = rinfo.port;
// Only log non-discovery messages to reduce noise
if (message !== DISCOVERY_MESSAGE) {
console.log(`📨 UDP message received from ${sourceIp}:${sourcePort}: "${message}"`);
}
if (message === DISCOVERY_MESSAGE) {
//console.log(`Received CLUSTER_DISCOVERY from ${sourceIp}:${sourcePort}`);
// Store the discovered node console.log(`📨 UDP message received from ${sourceIp}:${sourcePort}: "${message}"`);
const now = Math.floor(Date.now() / 1000);
const nodeInfo = {
ip: sourceIp,
port: sourcePort,
status: 'active', // New nodes from discovery are active
discoveredAt: now,
lastSeen: now
};
const isNewNode = !discoveredNodes.has(sourceIp); if (message.startsWith('CLUSTER_HEARTBEAT:')) {
discoveredNodes.set(sourceIp, nodeInfo); // Handle heartbeat messages that update member list
// Set as primary node if this is the first one or if we don't have one
if (!primaryNodeIp) {
primaryNodeIp = sourceIp;
console.log(`Set primary node to ${sourceIp}`);
// Immediately try to initialize the client
updateSporeClient();
}
// Update last seen timestamp
discoveredNodes.get(sourceIp).lastSeen = Math.floor(Date.now() / 1000);
// Broadcast discovery event if this is a new node
if (isNewNode) {
console.log(`🆕 NEW NODE DISCOVERED: ${sourceIp}:${sourcePort} via CLUSTER_DISCOVERY. Total nodes: ${discoveredNodes.size}`);
broadcastNodeDiscovery(sourceIp, 'discovered');
// Broadcast cluster update after a short delay to allow member data to be fetched
setTimeout(() => broadcastMemberListChange('new discovery'), 1000);
}
} else if (message.startsWith('CLUSTER_HEARTBEAT:')) {
// Handle heartbeat messages that also update member list
const hostname = message.substring('CLUSTER_HEARTBEAT:'.length); const hostname = message.substring('CLUSTER_HEARTBEAT:'.length);
// Update or create node entry from heartbeat updateNodeFromHeartbeat(sourceIp, sourcePort, hostname);
const existingNode = discoveredNodes.get(sourceIp); } else if (message.startsWith('NODE_UPDATE:')) {
const now = Math.floor(Date.now() / 1000); // Use Unix timestamp for consistency // Handle node update messages that provide detailed node info
handleNodeUpdate(sourceIp, message);
if (existingNode) {
// Update existing node
const wasStale = existingNode.status === 'inactive';
const oldHostname = existingNode.hostname;
existingNode.lastSeen = now;
existingNode.hostname = hostname;
existingNode.status = 'active'; // Mark as active when heartbeat received
logger.debug(`💓 Heartbeat from ${sourceIp}:${sourcePort} (${hostname}). Total nodes: ${discoveredNodes.size}`);
// Check if hostname changed
const hostnameChanged = oldHostname !== hostname;
if (hostnameChanged) {
console.log(`🔄 Hostname updated for ${sourceIp}: "${oldHostname}" -> "${hostname}"`);
}
// ALWAYS broadcast every heartbeat for immediate UI updates
// This ensures the UI gets real-time updates without delay
const reason = wasStale ? 'node became active' :
hostnameChanged ? 'hostname update' :
'active heartbeat';
logger.debug(`📡 Broadcasting heartbeat update: ${reason}`);
broadcastMemberListChange(reason);
} else {
// Create new node entry from heartbeat - NEW NODE DISCOVERED
const nodeInfo = {
ip: sourceIp,
port: sourcePort,
hostname: hostname,
status: 'active', // New nodes from heartbeat are active
discoveredAt: now,
lastSeen: now
};
discoveredNodes.set(sourceIp, nodeInfo);
console.log(`🆕 NEW NODE DISCOVERED: ${sourceIp}:${sourcePort} (${hostname}) via heartbeat. Total nodes: ${discoveredNodes.size}`);
// Set as primary node if this is the first one or if we don't have one
if (!primaryNodeIp) {
primaryNodeIp = sourceIp;
console.log(`Set primary node to ${sourceIp} (from heartbeat)`);
// Immediately try to initialize the client
updateSporeClient();
}
// Broadcast discovery event for new node from heartbeat
broadcastNodeDiscovery(sourceIp, 'discovered');
// Broadcast cluster update after a short delay to allow member data to be fetched
setTimeout(() => broadcastMemberListChange('new heartbeat discovery'), 1000);
}
} else if (!message.startsWith('RAW:')) { } else if (!message.startsWith('RAW:')) {
console.log(`Received unknown message from ${sourceIp}:${sourcePort}: "${message}"`); console.log(`Received unknown message from ${sourceIp}:${sourcePort}: "${message}"`);
} }
@@ -179,12 +87,12 @@ udpServer.on('message', (msg, rinfo) => {
udpServer.on('listening', () => { udpServer.on('listening', () => {
const address = udpServer.address(); const address = udpServer.address();
console.log(`UDP discovery server listening on ${address.address}:${address.port}`); console.log(`UDP heartbeat server listening on ${address.address}:${address.port}`);
}); });
// Bind UDP server to listen for discovery messages // Bind UDP server to listen for heartbeat messages
udpServer.bind(UDP_PORT, () => { udpServer.bind(UDP_PORT, () => {
console.log(`UDP discovery server bound to port ${UDP_PORT}`); console.log(`UDP heartbeat server bound to port ${UDP_PORT}`);
}); });
// Initialize the SPORE API client with dynamic IP // Initialize the SPORE API client with dynamic IP
@@ -215,7 +123,7 @@ function markStaleNodes() {
for (const [ip, node] of discoveredNodes.entries()) { for (const [ip, node] of discoveredNodes.entries()) {
const timeSinceLastSeen = now - node.lastSeen; const timeSinceLastSeen = now - node.lastSeen;
if (timeSinceLastSeen > STALE_THRESHOLD_SECONDS && node.status !== 'inactive') { if (timeSinceLastSeen > STALE_THRESHOLD_SECONDS && node.status !== 'inactive') {
console.log(`💀 NODE MARKED INACTIVE: ${ip} (${node.hostname || 'Unknown'}) - last seen ${timeSinceLastSeen}s ago (threshold: ${STALE_THRESHOLD_SECONDS}s)`); console.log(`💀 NODE MARKED INACTIVE: ${ip} (${node.hostname || 'Unknown'}) - last seen ${timeSinceLastSeen}s ago (threshold: ${STALE_THRESHOLD_SECONDS}s)`);
node.status = 'inactive'; node.status = 'inactive';
@@ -358,6 +266,109 @@ async function performWithFailover(operation) {
throw lastError || new Error('All discovered nodes failed'); throw lastError || new Error('All discovered nodes failed');
} }
// Function to update node from heartbeat message
function updateNodeFromHeartbeat(sourceIp, sourcePort, hostname) {
const existingNode = discoveredNodes.get(sourceIp);
const now = Math.floor(Date.now() / 1000);
if (existingNode) {
// Update existing node
const wasStale = existingNode.status === 'inactive';
const oldHostname = existingNode.hostname;
existingNode.lastSeen = now;
existingNode.hostname = hostname;
existingNode.status = 'active'; // Mark as active when heartbeat received
logger.debug(`💓 Heartbeat from ${sourceIp}:${sourcePort} (${hostname}). Total nodes: ${discoveredNodes.size}`);
// Check if hostname changed
const hostnameChanged = oldHostname !== hostname;
if (hostnameChanged) {
console.log(`🔄 Hostname updated for ${sourceIp}: "${oldHostname}" -> "${hostname}"`);
}
// Broadcast heartbeat update for immediate UI updates
const reason = wasStale ? 'node became active' :
hostnameChanged ? 'hostname update' :
'active heartbeat';
logger.debug(`📡 Broadcasting heartbeat update: ${reason}`);
broadcastMemberListChange(reason);
} else {
// Create new node entry from heartbeat - NEW NODE DISCOVERED
const nodeInfo = {
ip: sourceIp,
port: sourcePort,
hostname: hostname,
status: 'active',
discoveredAt: now,
lastSeen: now
};
discoveredNodes.set(sourceIp, nodeInfo);
console.log(`🆕 NEW NODE DISCOVERED: ${sourceIp}:${sourcePort} (${hostname}) via heartbeat. Total nodes: ${discoveredNodes.size}`);
// Set as primary node if this is the first one or if we don't have one
if (!primaryNodeIp) {
primaryNodeIp = sourceIp;
console.log(`Set primary node to ${sourceIp} (from heartbeat)`);
updateSporeClient();
}
// Broadcast discovery event for new node from heartbeat
broadcastNodeDiscovery(sourceIp, 'discovered');
// Broadcast cluster update after a short delay to allow member data to be fetched
setTimeout(() => broadcastMemberListChange('new heartbeat discovery'), 1000);
}
}
// Function to handle NODE_UPDATE messages
function handleNodeUpdate(sourceIp, message) {
// Message format: "NODE_UPDATE:hostname:{json}"
const parts = message.split(':');
if (parts.length < 3) {
console.warn(`Invalid NODE_UPDATE message format: ${message}`);
return;
}
const hostname = parts[1];
const jsonData = parts.slice(2).join(':');
try {
const nodeData = JSON.parse(jsonData);
// Update the specific node with the new information
const existingNode = discoveredNodes.get(sourceIp);
if (existingNode) {
// Update hostname if provided
if (nodeData.hostname) {
existingNode.hostname = nodeData.hostname;
}
// Update uptime if provided
if (nodeData.uptime) {
existingNode.uptime = nodeData.uptime;
}
// Update labels if provided
if (nodeData.labels) {
existingNode.labels = nodeData.labels;
}
existingNode.lastSeen = Math.floor(Date.now() / 1000);
existingNode.status = 'active';
console.log(`🔄 Updated node ${sourceIp} (${hostname}) from NODE_UPDATE`);
broadcastMemberListChange('node update');
} else {
console.warn(`Received NODE_UPDATE for unknown node: ${sourceIp} (${hostname})`);
}
} catch (error) {
console.error(`Error parsing NODE_UPDATE JSON: ${error.message}`);
}
}
// Set up periodic tasks // Set up periodic tasks
setInterval(() => { setInterval(() => {
markStaleNodes(); markStaleNodes();
@@ -374,7 +385,7 @@ app.get('/', (req, res) => {
res.sendFile(path.join(__dirname, 'public', 'index.html')); res.sendFile(path.join(__dirname, 'public', 'index.html'));
}); });
// API endpoint to get discovered nodes // API endpoint to get cluster nodes (heartbeat-based)
app.get('/api/discovery/nodes', (req, res) => { app.get('/api/discovery/nodes', (req, res) => {
const nodes = Array.from(discoveredNodes.values()).map(node => ({ const nodes = Array.from(discoveredNodes.values()).map(node => ({
...node, ...node,
@@ -382,22 +393,21 @@ app.get('/api/discovery/nodes', (req, res) => {
lastSeen: new Date(node.lastSeen * 1000).toISOString(), lastSeen: new Date(node.lastSeen * 1000).toISOString(),
isPrimary: node.ip === primaryNodeIp isPrimary: node.ip === primaryNodeIp
})); }));
res.json({ res.json({
primaryNode: primaryNodeIp, primaryNode: primaryNodeIp,
totalNodes: discoveredNodes.size, totalNodes: discoveredNodes.size,
nodes: nodes, nodes: nodes,
clientInitialized: !!sporeClient, clientInitialized: !!sporeClient,
clientBaseUrl: sporeClient ? sporeClient.baseUrl : null, clientBaseUrl: sporeClient ? sporeClient.baseUrl : null,
discoveryStatus: { clusterStatus: {
udpPort: UDP_PORT, udpPort: UDP_PORT,
discoveryMessage: DISCOVERY_MESSAGE,
serverRunning: udpServer.listening serverRunning: udpServer.listening
} }
}); });
}); });
// API endpoint to manually trigger discovery refresh // API endpoint to manually trigger cluster refresh
app.post('/api/discovery/refresh', (req, res) => { app.post('/api/discovery/refresh', (req, res) => {
try { try {
// Mark stale nodes as inactive // Mark stale nodes as inactive
@@ -411,15 +421,15 @@ app.post('/api/discovery/refresh', (req, res) => {
res.json({ res.json({
success: true, success: true,
message: 'Discovery refresh completed', message: 'Cluster refresh completed',
primaryNode: primaryNodeIp, primaryNode: primaryNodeIp,
totalNodes: discoveredNodes.size, totalNodes: discoveredNodes.size,
clientInitialized: !!sporeClient clientInitialized: !!sporeClient
}); });
} catch (error) { } catch (error) {
console.error('Error during discovery refresh:', error); console.error('Error during cluster refresh:', error);
res.status(500).json({ res.status(500).json({
error: 'Discovery refresh failed', error: 'Cluster refresh failed',
message: error.message message: error.message
}); });
} }
@@ -526,7 +536,7 @@ app.get('/api/cluster/members', async (req, res) => {
if (discoveredNodes.size === 0) { if (discoveredNodes.size === 0) {
return res.status(503).json({ return res.status(503).json({
error: 'Service unavailable', error: 'Service unavailable',
message: 'No SPORE nodes discovered yet. Waiting for CLUSTER_DISCOVERY messages...', message: 'No SPORE nodes discovered yet. Waiting for CLUSTER_HEARTBEAT messages...',
discoveredNodes: Array.from(discoveredNodes.keys()) discoveredNodes: Array.from(discoveredNodes.keys())
}); });
} }
@@ -564,7 +574,7 @@ app.get('/api/tasks/status', async (req, res) => {
if (discoveredNodes.size === 0) { if (discoveredNodes.size === 0) {
return res.status(503).json({ return res.status(503).json({
error: 'Service unavailable', error: 'Service unavailable',
message: 'No SPORE nodes discovered yet. Waiting for CLUSTER_DISCOVERY messages...', message: 'No SPORE nodes discovered yet. Waiting for CLUSTER_HEARTBEAT messages...',
discoveredNodes: Array.from(discoveredNodes.keys()) discoveredNodes: Array.from(discoveredNodes.keys())
}); });
} }
@@ -624,7 +634,7 @@ app.get('/api/node/endpoints', async (req, res) => {
if (discoveredNodes.size === 0) { if (discoveredNodes.size === 0) {
return res.status(503).json({ return res.status(503).json({
error: 'Service unavailable', error: 'Service unavailable',
message: 'No SPORE nodes discovered yet. Waiting for CLUSTER_DISCOVERY messages...', message: 'No SPORE nodes discovered yet. Waiting for CLUSTER_HEARTBEAT messages...',
discoveredNodes: Array.from(discoveredNodes.keys()) discoveredNodes: Array.from(discoveredNodes.keys())
}); });
} }
@@ -867,28 +877,28 @@ app.get('/api/health', (req, res) => {
udp: udpServer.listening, udp: udpServer.listening,
sporeClient: !!sporeClient sporeClient: !!sporeClient
}, },
discovery: { cluster: {
totalNodes: discoveredNodes.size, totalNodes: discoveredNodes.size,
primaryNode: primaryNodeIp, primaryNode: primaryNodeIp,
udpPort: UDP_PORT, udpPort: UDP_PORT,
serverRunning: udpServer.listening serverRunning: udpServer.listening
} }
}; };
// If no nodes discovered, mark as degraded // If no nodes discovered, mark as degraded
if (discoveredNodes.size === 0) { if (discoveredNodes.size === 0) {
health.status = 'degraded'; health.status = 'degraded';
health.message = 'No SPORE nodes discovered yet'; health.message = 'No SPORE nodes discovered yet';
} }
// If no client initialized, mark as degraded // If no client initialized, mark as degraded
if (!sporeClient) { if (!sporeClient) {
health.status = 'degraded'; health.status = 'degraded';
health.message = health.message ? health.message = health.message ?
`${health.message}; SPORE client not initialized` : `${health.message}; SPORE client not initialized` :
'SPORE client not initialized'; 'SPORE client not initialized';
} }
const statusCode = health.status === 'healthy' ? 200 : 503; const statusCode = health.status === 'healthy' ? 200 : 503;
res.status(statusCode).json(health); res.status(statusCode).json(health);
}); });
@@ -1106,20 +1116,20 @@ function initializeWebSocketServer(httpServer) {
const server = app.listen(PORT, '0.0.0.0', () => { const server = app.listen(PORT, '0.0.0.0', () => {
console.log(`Server is running on http://0.0.0.0:${PORT}`); console.log(`Server is running on http://0.0.0.0:${PORT}`);
console.log(`Accessible from: http://YOUR_COMPUTER_IP:${PORT}`); console.log(`Accessible from: http://YOUR_COMPUTER_IP:${PORT}`);
console.log(`UDP discovery server listening on port ${UDP_PORT}`); console.log(`UDP heartbeat server listening on port ${UDP_PORT}`);
// Initialize WebSocket server after HTTP server is running // Initialize WebSocket server after HTTP server is running
initializeWebSocketServer(server); initializeWebSocketServer(server);
console.log('WebSocket server ready for real-time updates'); console.log('WebSocket server ready for real-time updates');
console.log('Waiting for CLUSTER_DISCOVERY messages from SPORE nodes...'); console.log('Waiting for CLUSTER_HEARTBEAT and NODE_UPDATE messages from SPORE nodes...');
}); });
// Graceful shutdown handling // Graceful shutdown handling
process.on('SIGINT', () => { process.on('SIGINT', () => {
console.log('\nReceived SIGINT. Shutting down gracefully...'); console.log('\nReceived SIGINT. Shutting down gracefully...');
udpServer.close(() => { udpServer.close(() => {
console.log('UDP server closed.'); console.log('UDP heartbeat server closed.');
}); });
server.close(() => { server.close(() => {
console.log('HTTP server closed.'); console.log('HTTP server closed.');
@@ -1130,7 +1140,7 @@ process.on('SIGINT', () => {
process.on('SIGTERM', () => { process.on('SIGTERM', () => {
console.log('\nReceived SIGTERM. Shutting down gracefully...'); console.log('\nReceived SIGTERM. Shutting down gracefully...');
udpServer.close(() => { udpServer.close(() => {
console.log('UDP server closed.'); console.log('UDP heartbeat server closed.');
}); });
server.close(() => { server.close(() => {
console.log('HTTP server closed.'); console.log('HTTP server closed.');

View File

@@ -23,8 +23,8 @@ async function main() {
await runExamples(client); await runExamples(client);
} else { } else {
console.log('❌ No nodes discovered yet.'); console.log('❌ No nodes discovered yet.');
console.log('💡 Start the backend server and send CLUSTER_DISCOVERY messages'); console.log('💡 Start the backend server and send CLUSTER_HEARTBEAT messages');
console.log('💡 Use: npm run test-discovery broadcast'); console.log('💡 Use: npm run test-heartbeat broadcast');
return; return;
} }
} catch (error) { } catch (error) {