Files
spore/src/spore/core/ClusterManager.cpp

546 lines
21 KiB
C++

#include "spore/core/ClusterManager.h"
#include "spore/internal/Globals.h"
#include "spore/util/Logging.h"
// Constructs the cluster manager.
//
// Subscribes three event callbacks on the node context, then registers the
// periodic tasks and builds the UDP message dispatch table.
//
// ctx      Node context; initialises the `ctx` member and is used here to
//          subscribe the event callbacks.
// taskMgr  Task manager; initialises the `taskManager` member.
//
// All callbacks capture `this`, so they must not be invoked after this object
// has been destroyed.
ClusterManager::ClusterManager(NodeContext& ctx, TaskManager& taskMgr) : ctx(ctx), taskManager(taskMgr) {
    // node_discovered: the payload is interpreted as a NodeInfo*.
    ctx.on("node_discovered", [this](void* data) {
        NodeInfo* node = static_cast<NodeInfo*>(data);
        // Guard against a null payload, mirroring the cluster/broadcast handler below.
        if (!node) {
            LOG_WARN("Cluster", "node_discovered called with null data");
            return;
        }
        this->addOrUpdateNode(node->hostname, node->ip);
    });

    // Centralized broadcast handler: services fire 'cluster/broadcast' with a
    // CLUSTER_EVENT JSON payload (passed as a String*).
    ctx.on("cluster/broadcast", [this](void* data) {
        String* jsonStr = static_cast<String*>(data);
        if (!jsonStr) {
            LOG_WARN("Cluster", "cluster/broadcast called with null data");
            return;
        }
        // Subnet-directed broadcast (more reliable than 255.255.255.255 on some networks):
        // every host bit of the local address is set to 1.
        IPAddress ip = WiFi.localIP();
        IPAddress mask = WiFi.subnetMask();
        IPAddress bcast(ip[0] | ~mask[0], ip[1] | ~mask[1], ip[2] | ~mask[2], ip[3] | ~mask[3]);
        LOG_DEBUG("Cluster", String("Broadcasting CLUSTER_EVENT to ") + bcast.toString() + " len=" + String(jsonStr->length()));
        this->ctx.udp->beginPacket(bcast, this->ctx.config.udp_port);
        String msg = String(ClusterProtocol::CLUSTER_EVENT_MSG) + ":" + *jsonStr;
        this->ctx.udp->write(msg.c_str());
        this->ctx.udp->endPacket();
    });

    // Handler for node update broadcasts: services fire 'cluster/node_update'
    // when their node info changes. The payload is not used.
    ctx.on("cluster/node_update", [this](void* /*data*/) {
        // Trigger an immediate NODE_UPDATE broadcast.
        broadcastNodeUpdate();
    });

    // Register periodic tasks and the message dispatch table.
    registerTasks();
    initMessageHandlers();
}
void ClusterManager::registerTasks() {
taskManager.registerTask("cluster_listen", ctx.config.cluster_listen_interval_ms, [this]() { listen(); });
taskManager.registerTask("status_update", ctx.config.status_update_interval_ms, [this]() { updateAllNodeStatuses(); removeDeadNodes(); });
taskManager.registerTask("print_members", ctx.config.print_interval_ms, [this]() { printMemberList(); });
taskManager.registerTask("heartbeat", ctx.config.heartbeat_interval_ms, [this]() { heartbeatTaskCallback(); });
taskManager.registerTask("node_update_broadcast", ctx.config.node_update_broadcast_interval_ms, [this]() { broadcastNodeUpdate(); });
LOG_INFO("ClusterManager", "Registered all cluster tasks");
}
// Discovery functionality removed - using heartbeat-only approach
void ClusterManager::listen() {
int packetSize = ctx.udp->parsePacket();
if (!packetSize) {
return;
}
char incoming[ClusterProtocol::UDP_BUF_SIZE];
int len = ctx.udp->read(incoming, ClusterProtocol::UDP_BUF_SIZE);
if (len <= 0) {
return;
}
if (len >= (int)ClusterProtocol::UDP_BUF_SIZE) {
incoming[ClusterProtocol::UDP_BUF_SIZE - 1] = 0;
} else {
incoming[len] = 0;
}
handleIncomingMessage(incoming);
}
// (Re)builds the message dispatch table. Each entry is
// { predicate, handler, name }; handleIncomingMessage() walks the table in
// order and stops at the first predicate that matches, so the order of the
// entries below is significant.
void ClusterManager::initMessageHandlers() {
    messageHandlers.clear();

    messageHandlers.push_back({
        &ClusterManager::isRawMsg,
        [this](const char* raw) { this->onRawMessage(raw); },
        "RAW"
    });
    messageHandlers.push_back({
        &ClusterManager::isHeartbeatMsg,
        [this](const char* raw) { this->onHeartbeat(raw); },
        "HEARTBEAT"
    });
    messageHandlers.push_back({
        &ClusterManager::isNodeUpdateMsg,
        [this](const char* raw) { this->onNodeUpdate(raw); },
        "NODE_UPDATE"
    });
    messageHandlers.push_back({
        &ClusterManager::isClusterEventMsg,
        [this](const char* raw) { this->onClusterEvent(raw); },
        "CLUSTER_EVENT"
    });
}
// Dispatches a NUL-terminated message to the first registered handler whose
// predicate accepts it. If none matches, logs the text before the first ':'
// (or the whole message when it has no ':') at debug level.
void ClusterManager::handleIncomingMessage(const char* incoming) {
    for (const auto& entry : messageHandlers) {
        if (!entry.predicate(incoming)) {
            continue;
        }
        entry.handle(incoming);
        return;
    }

    // Unknown message - log first token
    String head(incoming);
    const char* separator = strchr(incoming, ':');
    if (separator) {
        head = head.substring(0, separator - incoming);
    }
    LOG_DEBUG("Cluster", String("Unknown cluster message: ") + head);
}
// Returns true when msg begins with ClusterProtocol::HEARTBEAT_MSG.
// Only the prefix is compared; what follows it is not inspected.
bool ClusterManager::isHeartbeatMsg(const char* msg) {
    const char* const prefix = ClusterProtocol::HEARTBEAT_MSG;
    const int cmp = strncmp(msg, prefix, strlen(prefix));
    return cmp == 0;
}
// Returns true when msg begins with ClusterProtocol::NODE_UPDATE_MSG.
// Only the prefix is compared; what follows it is not inspected.
bool ClusterManager::isNodeUpdateMsg(const char* msg) {
    const char* const prefix = ClusterProtocol::NODE_UPDATE_MSG;
    const int cmp = strncmp(msg, prefix, strlen(prefix));
    return cmp == 0;
}
// Returns true when msg begins with ClusterProtocol::CLUSTER_EVENT_MSG.
// Only the prefix is compared; what follows it is not inspected.
bool ClusterManager::isClusterEventMsg(const char* msg) {
    const char* const prefix = ClusterProtocol::CLUSTER_EVENT_MSG;
    const int cmp = strncmp(msg, prefix, strlen(prefix));
    return cmp == 0;
}
// Returns true when msg has the form "<RAW_MSG>:<anything>".
// The ':' right after the prefix is mandatory so that longer tokens which
// merely start with the RAW prefix (e.g. "RAW_HEARTBEAT") are not matched.
bool ClusterManager::isRawMsg(const char* msg) {
    const char* const prefix = ClusterProtocol::RAW_MSG;
    const std::size_t prefixLen = strlen(prefix);
    // Short-circuit: msg[prefixLen] is only read once the prefix has matched.
    return strncmp(msg, prefix, prefixLen) == 0 && msg[prefixLen] == ':';
}
// Discovery functionality removed - using heartbeat-only approach
// Handles a heartbeat message of the form "<HEARTBEAT_MSG>:<hostname>".
//
// Records the sender (hostname taken from the message, IP taken from the UDP
// socket's remote address) in the member list and replies to it with this
// node's info via sendNodeInfo().
//
// Messages without a ':' or with nothing after it are logged and dropped, so
// a malformed datagram cannot create a member with an empty hostname.
void ClusterManager::onHeartbeat(const char* msg) {
    const char* colon = strchr(msg, ':');
    if (!colon) {
        LOG_WARN("Cluster", "Invalid heartbeat message format");
        return;
    }

    const char* hostStart = colon + 1;
    if (*hostStart == '\0') {
        LOG_WARN("Cluster", "Heartbeat received with empty hostname");
        return;
    }

    String hostname = String(hostStart);
    IPAddress senderIP = ctx.udp->remoteIP();

    // Update memberlist with the heartbeat
    addOrUpdateNode(hostname, senderIP);

    // Respond with minimal node info (hostname, ip, uptime, labels)
    sendNodeInfo(hostname, senderIP);
}
// Handles a NODE_UPDATE message of the form "NODE_UPDATE:<hostname>:{json}".
//
// The JSON may carry "hostname", "uptime" and "labels". The member-list entry
// describing the *sender* is refreshed with those values, its lastSeen is set
// to now and its status to ACTIVE. Updates for nodes that are not in the
// member list are logged and ignored.
//
// Which entry is updated:
//   - the JSON "hostname" when it is a non-empty string, otherwise
//   - the hostname in the message prefix.
// The prefix alone is not reliable: sendNodeInfo() puts the hostname of the
// peer it is answering there, while the JSON describes the responding node.
// Keying on the prefix in that case would overwrite this node's own entry
// with the responder's hostname and labels.
void ClusterManager::onNodeUpdate(const char* msg) {
    const char* firstColon = strchr(msg, ':');
    if (!firstColon) {
        LOG_WARN("Cluster", "Invalid NODE_UPDATE message format");
        return;
    }
    const char* secondColon = strchr(firstColon + 1, ':');
    if (!secondColon) {
        LOG_WARN("Cluster", "Invalid NODE_UPDATE message format");
        return;
    }

    // Text between the two colons.
    String hostnamePart = String(firstColon + 1);
    String hostname = hostnamePart.substring(0, secondColon - firstColon - 1);

    const char* jsonCStr = secondColon + 1;
    JsonDocument doc;
    DeserializationError err = deserializeJson(doc, jsonCStr);
    if (err) {
        LOG_WARN("Cluster", String("Failed to parse NODE_UPDATE JSON from ") + ctx.udp->remoteIP().toString());
        return;
    }

    // Prefer the hostname the payload reports about itself (see header comment).
    bool hostnameFromPayload = false;
    if (doc["hostname"].is<const char*>()) {
        const char* reported = doc["hostname"].as<const char*>();
        if (reported && *reported != '\0') {
            hostname = reported;
            hostnameFromPayload = true;
        }
    }

    // Update the specific node in memberlist
    auto& memberList = *ctx.memberList;
    auto it = memberList.find(hostname);
    if (it == memberList.end()) {
        LOG_WARN("Cluster", String("Received NODE_UPDATE for unknown node: ") + hostname);
        return;
    }

    NodeInfo& node = it->second;
    // Update basic info if provided
    if (hostnameFromPayload) {
        node.hostname = hostname;
    }
    if (doc["uptime"].is<unsigned long>()) {
        node.uptime = doc["uptime"];
    }
    // Replace labels wholesale if the payload carries a labels object
    if (doc["labels"].is<JsonObject>()) {
        node.labels.clear();
        JsonObject labelsObj = doc["labels"].as<JsonObject>();
        for (JsonPair kvp : labelsObj) {
            const char* key = kvp.key().c_str();
            const char* value = labelsObj[kvp.key()];
            node.labels[key] = String(value);
        }
    }
    node.lastSeen = millis();
    node.status = NodeInfo::ACTIVE;
    LOG_DEBUG("Cluster", String("Updated node ") + hostname + " from NODE_UPDATE");
}
// Sends this node's minimal info (hostname, ip, uptime, labels) to one peer
// as a unicast "NODE_UPDATE:<hostname>:{json}" datagram.
//
// hostname  Hostname of the peer being answered; it is placed in the message
//           prefix and used in the log line.
// targetIP  Address the datagram is sent to (port: ctx.config.udp_port).
//
// The JSON is built from the local member-list entry (looked up by
// ctx.hostname). If that entry does not exist, ctx.hostname / ctx.localIP are
// used instead and no labels are sent.
void ClusterManager::sendNodeInfo(const String& hostname, const IPAddress& targetIP) {
    JsonDocument doc;

    // Get our node info for the response
    auto& memberList = *ctx.memberList;
    auto it = memberList.find(ctx.hostname);
    if (it != memberList.end()) {
        const NodeInfo& node = it->second;
        // Minimal response: hostname, ip, uptime, labels
        doc["hostname"] = node.hostname;
        doc["ip"] = node.ip.toString();
        // Uptime is the time since boot. The local entry's lastSeen is refreshed
        // on every heartbeat, so "millis() - lastSeen" would only be the time
        // since the last refresh.
        doc["uptime"] = millis();
        // Add labels if present
        if (!node.labels.empty()) {
            JsonObject labelsObj = doc["labels"].to<JsonObject>();
            for (const auto& kv : node.labels) {
                labelsObj[kv.first.c_str()] = kv.second;
            }
        }
    } else {
        // Fallback to basic info
        doc["hostname"] = ctx.hostname;
        doc["ip"] = ctx.localIP.toString();
        doc["uptime"] = millis();
    }

    String json;
    serializeJson(doc, json);

    ctx.udp->beginPacket(targetIP, ctx.config.udp_port);
    String msg = String(ClusterProtocol::NODE_UPDATE_MSG) + ":" + hostname + ":" + json;
    ctx.udp->write(msg.c_str());
    ctx.udp->endPacket();
    LOG_DEBUG("Cluster", String("Sent NODE_UPDATE response to ") + hostname + " @ " + targetIP.toString());
}
// Handles a CLUSTER_EVENT message and re-fires it as a local event.
//
// Message format: CLUSTER_EVENT:{"event":"...","data":"<json string>"}
//
// "event" selects the local event name. "data" is passed to the event's
// subscribers as a String*; if "data" is not a JSON string it is serialized
// back to JSON text first. The String lives on this function's stack, so the
// pointer is only valid for the duration of the ctx.fire() call.
//
// Messages that lack the ':' delimiter, have an empty payload, fail to parse,
// or miss either field are logged and dropped.
void ClusterManager::onClusterEvent(const char* msg) {
    // The dispatcher only checks the prefix, so a bare "CLUSTER_EVENT" (no ':')
    // can arrive here. Skipping prefix + 1 unconditionally would then step
    // past the terminating NUL and read beyond the message.
    const std::size_t prefixLen = strlen(ClusterProtocol::CLUSTER_EVENT_MSG);
    if (msg[prefixLen] != ':') {
        LOG_WARN("Cluster", "CLUSTER_EVENT received without payload delimiter");
        return;
    }
    const char* jsonStart = msg + prefixLen + 1; // skip prefix and ':'
    if (*jsonStart == '\0') {
        LOG_DEBUG("Cluster", "CLUSTER_EVENT received with empty payload");
        return;
    }
    LOG_DEBUG("Cluster", String("CLUSTER_EVENT raw from ") + ctx.udp->remoteIP().toString() + " len=" + String(strlen(jsonStart)));

    JsonDocument doc;
    DeserializationError err = deserializeJson(doc, jsonStart);
    if (err) {
        LOG_ERROR("Cluster", String("Failed to parse CLUSTER_EVENT JSON from ") + ctx.udp->remoteIP().toString());
        return;
    }

    // Robust extraction of event and data
    String eventStr;
    if (doc["event"].is<const char*>()) {
        eventStr = doc["event"].as<const char*>();
    } else if (doc["event"].is<String>()) {
        eventStr = doc["event"].as<String>();
    }

    String data;
    if (doc["data"].is<const char*>()) {
        data = doc["data"].as<const char*>();
    } else if (doc["data"].is<JsonVariantConst>()) {
        // If data is a nested JSON object/array, serialize it back to string
        String tmp;
        serializeJson(doc["data"], tmp);
        data = tmp;
    }

    if (eventStr.length() == 0 || data.length() == 0) {
        String dbg;
        serializeJson(doc, dbg);
        LOG_WARN("Cluster", String("CLUSTER_EVENT missing 'event' or 'data' | payload=") + dbg);
        return;
    }

    std::string eventKey(eventStr.c_str());
    LOG_DEBUG("Cluster", String("Firing event '") + eventStr + "' with dataLen=" + String(data.length()));
    ctx.fire(eventKey, &data);
}
// Handles a "<RAW_MSG>:<payload>" message by firing the local "udp/raw"
// event with the payload as a String*. The String lives on this function's
// stack, so the pointer is only valid while ctx.fire() runs.
// Messages without the ':' delimiter or with an empty payload are logged and
// dropped.
void ClusterManager::onRawMessage(const char* msg) {
    const char* delimiter = msg + strlen(ClusterProtocol::RAW_MSG);
    if (*delimiter != ':') {
        LOG_WARN("Cluster", "RAW message received without payload delimiter");
        return;
    }

    const char* body = delimiter + 1;
    if (*body == '\0') {
        LOG_WARN("Cluster", "RAW message received with empty payload");
        return;
    }

    String payload(body);
    ctx.fire("udp/raw", &payload);
}
void ClusterManager::addOrUpdateNode(const String& nodeHost, IPAddress nodeIP) {
auto& memberList = *ctx.memberList;
// O(1) lookup instead of O(n) search
auto it = memberList.find(nodeHost);
if (it != memberList.end()) {
// Update existing node
it->second.ip = nodeIP;
it->second.lastSeen = millis();
//fetchNodeInfo(nodeIP); // Do not fetch here, handled by periodic task
return;
}
// Add new node
NodeInfo newNode;
newNode.hostname = nodeHost;
newNode.ip = nodeIP;
newNode.lastSeen = millis();
updateNodeStatus(newNode, newNode.lastSeen, ctx.config.node_inactive_threshold_ms, ctx.config.node_dead_threshold_ms);
memberList[nodeHost] = newNode;
LOG_INFO("Cluster", "Added node: " + nodeHost + " @ " + newNode.ip.toString() + " | Status: " + statusToStr(newNode.status) + " | last update: 0");
//fetchNodeInfo(nodeIP); // Do not fetch here, handled by periodic task
}
// unused http client to fetch complete node info
void ClusterManager::fetchNodeInfo(const IPAddress& ip) {
if(ip == ctx.localIP) {
LOG_DEBUG("Cluster", "Skipping fetch for local node");
return;
}
unsigned long requestStart = millis();
HTTPClient http;
WiFiClient client;
String url = "http://" + ip.toString() + ClusterProtocol::API_NODE_STATUS;
// Use RAII pattern to ensure http.end() is always called
bool httpInitialized = false;
bool success = false;
httpInitialized = http.begin(client, url);
if (!httpInitialized) {
LOG_ERROR("Cluster", "Failed to initialize HTTP client for " + ip.toString());
return;
}
// Set timeout to prevent hanging
http.setTimeout(5000); // 5 second timeout
int httpCode = http.GET();
unsigned long requestEnd = millis();
unsigned long requestDuration = requestEnd - requestStart;
if (httpCode == 200) {
String payload = http.getString();
// Use stack-allocated JsonDocument with proper cleanup
JsonDocument doc;
DeserializationError err = deserializeJson(doc, payload);
if (!err) {
auto& memberList = *ctx.memberList;
// Still need to iterate since we're searching by IP, not hostname
for (auto& pair : memberList) {
NodeInfo& node = pair.second;
if (node.ip == ip) {
// Update resources efficiently
node.resources.freeHeap = doc["freeHeap"];
node.resources.chipId = doc["chipId"];
node.resources.sdkVersion = (const char*)doc["sdkVersion"];
node.resources.cpuFreqMHz = doc["cpuFreqMHz"];
node.resources.flashChipSize = doc["flashChipSize"];
node.status = NodeInfo::ACTIVE;
node.latency = requestDuration;
node.lastSeen = millis();
// Clear and rebuild endpoints efficiently
node.endpoints.clear();
node.endpoints.reserve(10); // Pre-allocate to avoid reallocations
if (doc["api"].is<JsonArray>()) {
JsonArray apiArr = doc["api"].as<JsonArray>();
for (JsonObject apiObj : apiArr) {
// Use const char* to avoid String copies
const char* uri = apiObj["uri"];
int method = apiObj["method"];
// Create basic EndpointInfo without params for cluster nodes
EndpointInfo endpoint;
endpoint.uri = uri; // String assignment is more efficient than construction
endpoint.method = method;
endpoint.isLocal = false;
endpoint.serviceName = "remote";
node.endpoints.push_back(std::move(endpoint));
}
}
// Parse labels efficiently
node.labels.clear();
if (doc["labels"].is<JsonObject>()) {
JsonObject labelsObj = doc["labels"].as<JsonObject>();
for (JsonPair kvp : labelsObj) {
// Use const char* to avoid String copies
const char* key = kvp.key().c_str();
const char* value = labelsObj[kvp.key()];
node.labels[key] = value;
}
}
LOG_DEBUG("Cluster", "Fetched info for node: " + node.hostname + " @ " + ip.toString());
success = true;
break;
}
}
} else {
LOG_ERROR("Cluster", "JSON parse error for node @ " + ip.toString() + ": " + String(err.c_str()));
}
} else {
LOG_ERROR("Cluster", "Failed to fetch info for node @ " + ip.toString() + ", HTTP code: " + String(httpCode));
}
// Always ensure HTTP client is properly closed
if (httpInitialized) {
http.end();
}
// Log success/failure for debugging
if (!success) {
LOG_DEBUG("Cluster", "Failed to update node info for " + ip.toString());
}
}
void ClusterManager::heartbeatTaskCallback() {
auto& memberList = *ctx.memberList;
auto it = memberList.find(ctx.hostname);
if (it != memberList.end()) {
NodeInfo& node = it->second;
node.lastSeen = millis();
node.status = NodeInfo::ACTIVE;
node.uptime = millis(); // Update uptime
updateLocalNodeResources();
addOrUpdateNode(ctx.hostname, ctx.localIP);
}
// Broadcast heartbeat - peers will respond with NODE_UPDATE
lastHeartbeatSentAt = millis();
ctx.udp->beginPacket("255.255.255.255", ctx.config.udp_port);
String hb = String(ClusterProtocol::HEARTBEAT_MSG) + ":" + ctx.hostname;
ctx.udp->write(hb.c_str());
ctx.udp->endPacket();
LOG_DEBUG("Cluster", String("Sent heartbeat: ") + ctx.hostname);
}
// Intentionally empty task callback.
// It performs no work and is not registered by registerTasks() in this file.
void ClusterManager::updateAllMembersInfoTaskCallback() {
// HTTP-based member info fetching is disabled; node info is provided via the
// UDP NODE_UPDATE responses that peers send in reply to heartbeats.
// Kept as a no-op to reduce network and memory usage.
}
void ClusterManager::broadcastNodeUpdate() {
// Broadcast our current node info as NODE_UPDATE to all cluster members
auto& memberList = *ctx.memberList;
auto it = memberList.find(ctx.hostname);
if (it == memberList.end()) {
return;
}
const NodeInfo& node = it->second;
JsonDocument doc;
doc["hostname"] = node.hostname;
doc["uptime"] = node.uptime;
// Add labels if present
if (!node.labels.empty()) {
JsonObject labelsObj = doc["labels"].to<JsonObject>();
for (const auto& kv : node.labels) {
labelsObj[kv.first.c_str()] = kv.second;
}
}
String json;
serializeJson(doc, json);
// Broadcast to all cluster members
ctx.udp->beginPacket("255.255.255.255", ctx.config.udp_port);
String msg = String(ClusterProtocol::NODE_UPDATE_MSG) + ":" + ctx.hostname + ":" + json;
ctx.udp->write(msg.c_str());
ctx.udp->endPacket();
LOG_DEBUG("Cluster", String("Broadcasted NODE_UPDATE for ") + ctx.hostname);
}
// Re-evaluates the status of every member against a single "now" timestamp,
// using the inactive/dead thresholds from ctx.config.
void ClusterManager::updateAllNodeStatuses() {
    auto& members = *ctx.memberList;
    unsigned long now = millis();
    for (auto& entry : members) {
        updateNodeStatus(entry.second, now, ctx.config.node_inactive_threshold_ms, ctx.config.node_dead_threshold_ms);
    }
}
// Erases every member that is both marked DEAD and has not been seen for
// longer than ctx.config.node_dead_threshold_ms. Each removal is logged.
void ClusterManager::removeDeadNodes() {
    auto& members = *ctx.memberList;
    unsigned long now = millis();

    auto cursor = members.begin();
    while (cursor != members.end()) {
        unsigned long sinceSeen = now - cursor->second.lastSeen;
        const bool expired = cursor->second.status == NodeInfo::DEAD && sinceSeen > ctx.config.node_dead_threshold_ms;
        if (!expired) {
            ++cursor;
            continue;
        }
        LOG_INFO("Cluster", "Removing node: " + cursor->second.hostname);
        // erase() returns the iterator following the removed element
        cursor = members.erase(cursor);
    }
}
void ClusterManager::printMemberList() {
auto& memberList = *ctx.memberList;
if (memberList.empty()) {
LOG_INFO("Cluster", "Member List: empty");
return;
}
LOG_INFO("Cluster", "Member List:");
for (const auto& pair : memberList) {
const NodeInfo& node = pair.second;
LOG_INFO("Cluster", " " + node.hostname + " @ " + node.ip.toString() + " | Status: " + statusToStr(node.status) + " | last seen: " + String(millis() - node.lastSeen));
}
}
// Refreshes the resource snapshot (free heap, chip id, SDK version, CPU
// frequency, flash size) of the local node's member-list entry from the ESP
// runtime, and logs a memory warning when the free heap is low.
// Does nothing if the local node is not in the member list.
void ClusterManager::updateLocalNodeResources() {
    auto& memberList = *ctx.memberList;
    auto it = memberList.find(ctx.hostname);
    if (it == memberList.end()) {
        return;
    }

    NodeInfo& node = it->second;
    uint32_t freeHeap = ESP.getFreeHeap();
    node.resources.freeHeap = freeHeap;
    node.resources.chipId = ESP.getChipId();
    node.resources.sdkVersion = String(ESP.getSdkVersion());
    node.resources.cpuFreqMHz = ESP.getCpuFreqMHz();
    node.resources.flashChipSize = ESP.getFlashChipSize();

    // Check the more severe (critical) threshold first. Testing the low
    // threshold first would swallow every critical case whenever the critical
    // threshold is below the low one, and the error would never be logged.
    if (freeHeap < ctx.config.critical_memory_threshold_bytes) {
        LOG_ERROR("Cluster", "Critical memory warning: " + String(freeHeap) + " bytes free");
    } else if (freeHeap < ctx.config.low_memory_threshold_bytes) {
        LOG_WARN("Cluster", "Low memory warning: " + String(freeHeap) + " bytes free");
    }
}