feat: more logging

This commit is contained in:
2025-10-24 21:45:40 +02:00
parent fba1f162b3
commit 8aa8b908e6
4 changed files with 441 additions and 26 deletions

View File

@@ -366,16 +366,19 @@ func (hs *HTTPServer) setPrimaryNode(w http.ResponseWriter, r *http.Request) {
// getClusterMembers handles GET /api/cluster/members.
//
// It runs client.GetClusterStatus through hs.performWithFailover and writes
// the result to the response as JSON. If that call returns an error, the
// handler replies with 502 Bad Gateway and a JSON-shaped error body whose
// "message" field carries the error text.
func (hs *HTTPServer) getClusterMembers(w http.ResponseWriter, r *http.Request) {
	log.Debug("Fetching cluster members via API")

	result, err := hs.performWithFailover(func(client *client.SporeClient) (interface{}, error) {
		return client.GetClusterStatus()
	})
	if err != nil {
		log.WithError(err).Error("Error fetching cluster members")
		log.WithError(err).Debug("Failed to fetch cluster members")

		// Encode the error text as a JSON string literal. Error messages often
		// contain double quotes or backslashes (for example quoted URLs), which
		// would corrupt the body if spliced in with a bare "%s". Marshalling a
		// plain string cannot fail, so the returned error is deliberately ignored.
		message, _ := json.Marshal(err.Error())
		http.Error(w, fmt.Sprintf(`{"error": "Failed to fetch cluster members", "message": %s}`, message), http.StatusBadGateway)
		return
	}

	log.Debug("Successfully fetched cluster members via API")

	// The status line may already be on the wire at this point, so an encoding
	// failure can only be recorded, not reported to the client.
	if err := json.NewEncoder(w).Encode(result); err != nil {
		log.WithError(err).Error("Failed to encode cluster members response")
	}
}
@@ -417,42 +420,52 @@ func (hs *HTTPServer) getTaskStatus(w http.ResponseWriter, r *http.Request) {
ip := r.URL.Query().Get("ip")
if ip != "" {
log.WithField("node_ip", ip).Debug("Fetching task status from specific node")
client := hs.getSporeClient(ip)
result, err := client.GetTaskStatus()
if err != nil {
log.WithError(err).Error("Error fetching task status from specific node")
log.WithFields(log.Fields{
"node_ip": ip,
"error": err.Error(),
}).Debug("Failed to fetch task status from specific node")
http.Error(w, fmt.Sprintf(`{"error": "Failed to fetch task status from node", "message": "%s"}`, err.Error()), http.StatusInternalServerError)
return
}
log.WithField("node_ip", ip).Debug("Successfully fetched task status from specific node")
json.NewEncoder(w).Encode(result)
return
}
log.Debug("Fetching task status via failover")
result, err := hs.performWithFailover(func(client *client.SporeClient) (interface{}, error) {
return client.GetTaskStatus()
})
if err != nil {
log.WithError(err).Error("Error fetching task status")
log.WithError(err).Debug("Failed to fetch task status via failover")
http.Error(w, fmt.Sprintf(`{"error": "Failed to fetch task status", "message": "%s"}`, err.Error()), http.StatusBadGateway)
return
}
log.Debug("Successfully fetched task status via failover")
json.NewEncoder(w).Encode(result)
}
// getNodeStatus handles GET /api/node/status.
//
// It runs client.GetSystemStatus through hs.performWithFailover and writes the
// result to the response as JSON. If that call returns an error, the handler
// replies with 502 Bad Gateway and a JSON-shaped error body whose "message"
// field carries the error text.
func (hs *HTTPServer) getNodeStatus(w http.ResponseWriter, r *http.Request) {
	log.Debug("Fetching node system status via failover")

	result, err := hs.performWithFailover(func(client *client.SporeClient) (interface{}, error) {
		return client.GetSystemStatus()
	})
	if err != nil {
		log.WithError(err).Error("Error fetching system status")
		log.WithError(err).Debug("Failed to fetch system status via failover")

		// Encode the error text as a JSON string literal. Error messages often
		// contain double quotes or backslashes (for example quoted URLs), which
		// would corrupt the body if spliced in with a bare "%s". Marshalling a
		// plain string cannot fail, so the returned error is deliberately ignored.
		message, _ := json.Marshal(err.Error())
		http.Error(w, fmt.Sprintf(`{"error": "Failed to fetch system status", "message": %s}`, message), http.StatusBadGateway)
		return
	}

	log.Debug("Successfully fetched system status via failover")

	// The status line may already be on the wire at this point, so an encoding
	// failure can only be recorded, not reported to the client.
	if err := json.NewEncoder(w).Encode(result); err != nil {
		log.WithError(err).Error("Failed to encode system status response")
	}
}
@@ -461,14 +474,20 @@ func (hs *HTTPServer) getNodeStatusByIP(w http.ResponseWriter, r *http.Request)
vars := mux.Vars(r)
nodeIP := vars["ip"]
log.WithField("node_ip", nodeIP).Debug("Fetching system status from specific node")
client := hs.getSporeClient(nodeIP)
result, err := client.GetSystemStatus()
if err != nil {
log.WithError(err).Error("Error fetching status from specific node")
log.WithFields(log.Fields{
"node_ip": nodeIP,
"error": err.Error(),
}).Debug("Failed to fetch status from specific node")
http.Error(w, fmt.Sprintf(`{"error": "Failed to fetch status from node %s", "message": "%s"}`, nodeIP, err.Error()), http.StatusInternalServerError)
return
}
log.WithField("node_ip", nodeIP).Debug("Successfully fetched status from specific node")
json.NewEncoder(w).Encode(result)
}
@@ -477,27 +496,34 @@ func (hs *HTTPServer) getNodeEndpoints(w http.ResponseWriter, r *http.Request) {
ip := r.URL.Query().Get("ip")
if ip != "" {
log.WithField("node_ip", ip).Debug("Fetching endpoints from specific node")
client := hs.getSporeClient(ip)
result, err := client.GetCapabilities()
if err != nil {
log.WithError(err).Error("Error fetching endpoints from specific node")
log.WithFields(log.Fields{
"node_ip": ip,
"error": err.Error(),
}).Debug("Failed to fetch endpoints from specific node")
http.Error(w, fmt.Sprintf(`{"error": "Failed to fetch endpoints from node", "message": "%s"}`, err.Error()), http.StatusInternalServerError)
return
}
log.WithField("node_ip", ip).Debug("Successfully fetched endpoints from specific node")
json.NewEncoder(w).Encode(result)
return
}
log.Debug("Fetching capabilities via failover")
result, err := hs.performWithFailover(func(client *client.SporeClient) (interface{}, error) {
return client.GetCapabilities()
})
if err != nil {
log.WithError(err).Error("Error fetching capabilities")
log.WithError(err).Debug("Failed to fetch capabilities via failover")
http.Error(w, fmt.Sprintf(`{"error": "Failed to fetch capabilities", "message": "%s"}`, err.Error()), http.StatusBadGateway)
return
}
log.Debug("Successfully fetched capabilities via failover")
json.NewEncoder(w).Encode(result)
}
@@ -849,18 +875,21 @@ type ClusterNodeVersionsResponse struct {
// GET /api/cluster/node/versions
func (hs *HTTPServer) getClusterNodeVersions(w http.ResponseWriter, r *http.Request) {
log.Debug("Fetching cluster node versions")
result, err := hs.performWithFailover(func(client *client.SporeClient) (interface{}, error) {
return client.GetClusterStatus()
})
if err != nil {
log.WithError(err).Error("Error fetching cluster members for versions")
log.WithError(err).Debug("Failed to fetch cluster members for versions")
http.Error(w, fmt.Sprintf(`{"error": "Failed to fetch cluster members", "message": "%s"}`, err.Error()), http.StatusBadGateway)
return
}
clusterStatus, ok := result.(*client.ClusterStatusResponse)
if !ok {
log.Debug("Invalid cluster status response type")
http.Error(w, `{"error": "Invalid cluster status response"}`, http.StatusInternalServerError)
return
}
@@ -880,6 +909,8 @@ func (hs *HTTPServer) getClusterNodeVersions(w http.ResponseWriter, r *http.Requ
})
}
log.WithField("node_count", len(nodeVersions)).Debug("Successfully fetched cluster node versions")
response := ClusterNodeVersionsResponse{
Nodes: nodeVersions,
}
@@ -956,12 +987,25 @@ func (hs *HTTPServer) nodeMatchesLabels(nodeLabels, rolloutLabels map[string]str
// processRollout handles the actual rollout process in the background
func (hs *HTTPServer) processRollout(rolloutID string, nodes []NodeInfo, firmwareInfo FirmwareInfo) {
log.WithField("rollout_id", rolloutID).Info("Starting background rollout process")
log.WithFields(log.Fields{
"rollout_id": rolloutID,
"firmware": fmt.Sprintf("%s/%s", firmwareInfo.Name, firmwareInfo.Version),
"node_count": len(nodes),
}).Debug("Starting background rollout process")
// Download firmware from registry
log.WithFields(log.Fields{
"rollout_id": rolloutID,
"firmware": fmt.Sprintf("%s/%s", firmwareInfo.Name, firmwareInfo.Version),
}).Debug("Downloading firmware from registry for rollout")
firmwareData, err := hs.registryClient.DownloadFirmware(firmwareInfo.Name, firmwareInfo.Version)
if err != nil {
log.WithError(err).Error("Failed to download firmware for rollout")
log.WithFields(log.Fields{
"rollout_id": rolloutID,
"firmware": fmt.Sprintf("%s/%s", firmwareInfo.Name, firmwareInfo.Version),
"error": err.Error(),
}).Error("Failed to download firmware for rollout")
return
}
@@ -970,7 +1014,7 @@ func (hs *HTTPServer) processRollout(rolloutID string, nodes []NodeInfo, firmwar
"firmware": fmt.Sprintf("%s/%s", firmwareInfo.Name, firmwareInfo.Version),
"size": len(firmwareData),
"total_nodes": len(nodes),
}).Info("Downloaded firmware for rollout")
}).Debug("Successfully downloaded firmware for rollout")
// Process nodes in parallel using goroutines
var wg sync.WaitGroup
@@ -984,9 +1028,14 @@ func (hs *HTTPServer) processRollout(rolloutID string, nodes []NodeInfo, firmwar
"rollout_id": rolloutID,
"node_ip": node.IP,
"progress": fmt.Sprintf("%d/%d", nodeIndex+1, len(nodes)),
}).Info("Processing node in rollout")
}).Debug("Processing node in rollout")
// Update version label on the node before upload
log.WithFields(log.Fields{
"rollout_id": rolloutID,
"node_ip": node.IP,
}).Debug("Getting SPORE client for node")
client := hs.getSporeClient(node.IP)
// Create updated labels with the new version

View File

@@ -429,20 +429,36 @@ func (wss *WebSocketServer) calculateProgress(current, total int, status string)
func (wss *WebSocketServer) getCurrentClusterMembers() ([]client.ClusterMember, error) {
nodes := wss.nodeDiscovery.GetNodes()
if len(nodes) == 0 {
wss.logger.Debug("No nodes available for cluster member retrieval")
return []client.ClusterMember{}, nil
}
// Try to get real cluster data from primary node
primaryNode := wss.nodeDiscovery.GetPrimaryNode()
if primaryNode != "" {
wss.logger.WithFields(log.Fields{
"primary_node": primaryNode,
"total_nodes": len(nodes),
}).Debug("Fetching cluster members from primary node")
client := wss.getSporeClient(primaryNode)
clusterStatus, err := client.GetClusterStatus()
if err == nil {
wss.logger.WithFields(log.Fields{
"primary_node": primaryNode,
"member_count": len(clusterStatus.Members),
}).Debug("Successfully fetched cluster members from primary node")
// Update local node data with API information
wss.updateLocalNodesWithAPI(clusterStatus.Members)
return clusterStatus.Members, nil
}
wss.logger.WithError(err).Error("Failed to get cluster status from primary node")
wss.logger.WithFields(log.Fields{
"primary_node": primaryNode,
"error": err.Error(),
}).Debug("Failed to get cluster status from primary node, using fallback")
} else {
wss.logger.Debug("No primary node available, using fallback cluster members")
}
// Fallback to local data if API fails