Merge pull request 'feature/node-info-sync' (#8) from feature/node-info-sync into main
Reviewed-on: #8
This commit is contained in:
60
docs/API.md
60
docs/API.md
@@ -15,12 +15,18 @@ The SPORE system provides a comprehensive RESTful API for monitoring and control
|
|||||||
|
|
||||||
| Endpoint | Method | Description | Response |
|
| Endpoint | Method | Description | Response |
|
||||||
|----------|--------|-------------|----------|
|
|----------|--------|-------------|----------|
|
||||||
| `/api/node/status` | GET | System resource information and API endpoint registry | System metrics and API catalog |
|
| `/api/node/status` | GET | System resource information | System metrics |
|
||||||
| `/api/node/endpoints` | GET | API endpoints and parameters | Detailed endpoint specifications |
|
| `/api/node/endpoints` | GET | API endpoints and parameters | Detailed endpoint specifications |
|
||||||
| `/api/cluster/members` | GET | Cluster membership and node health information | Cluster topology and health status |
|
| `/api/cluster/members` | GET | Cluster membership and node health information | Cluster topology and health status |
|
||||||
| `/api/node/update` | POST | Handle firmware updates via OTA | Update progress and status |
|
| `/api/node/update` | POST | Handle firmware updates via OTA | Update progress and status |
|
||||||
| `/api/node/restart` | POST | Trigger system restart | Restart confirmation |
|
| `/api/node/restart` | POST | Trigger system restart | Restart confirmation |
|
||||||
|
|
||||||
|
### Monitoring API
|
||||||
|
|
||||||
|
| Endpoint | Method | Description | Response |
|
||||||
|
|----------|--------|-------------|----------|
|
||||||
|
| `/api/monitoring/resources` | GET | CPU, memory, filesystem, and uptime | System resource metrics |
|
||||||
|
|
||||||
### Network Management API
|
### Network Management API
|
||||||
|
|
||||||
| Endpoint | Method | Description | Response |
|
| Endpoint | Method | Description | Response |
|
||||||
@@ -140,7 +146,7 @@ Controls the execution state of individual tasks. Supports enabling, disabling,
|
|||||||
|
|
||||||
#### GET /api/node/status
|
#### GET /api/node/status
|
||||||
|
|
||||||
Returns comprehensive system resource information including memory usage, chip details, and a registry of all available API endpoints.
|
Returns comprehensive system resource information including memory usage and chip details. For a list of available API endpoints, use `/api/node/endpoints`.
|
||||||
|
|
||||||
**Response Fields:**
|
**Response Fields:**
|
||||||
- `freeHeap`: Available RAM in bytes
|
- `freeHeap`: Available RAM in bytes
|
||||||
@@ -168,7 +174,7 @@ Returns comprehensive system resource information including memory usage, chip d
|
|||||||
|
|
||||||
#### GET /api/node/endpoints
|
#### GET /api/node/endpoints
|
||||||
|
|
||||||
Returns detailed information about all available API endpoints, including their parameters, types, and validation rules.
|
Returns detailed information about all available API endpoints, including their parameters, types, and validation rules. Methods are returned as strings (e.g., "GET", "POST").
|
||||||
|
|
||||||
**Response Fields:**
|
**Response Fields:**
|
||||||
- `endpoints[]`: Array of endpoint capability objects
|
- `endpoints[]`: Array of endpoint capability objects
|
||||||
@@ -236,6 +242,54 @@ Initiates an over-the-air firmware update. The firmware file should be uploaded
|
|||||||
|
|
||||||
Triggers a system restart. The response will be sent before the restart occurs.
|
Triggers a system restart. The response will be sent before the restart occurs.
|
||||||
|
|
||||||
|
### Monitoring
|
||||||
|
|
||||||
|
#### GET /api/monitoring/resources
|
||||||
|
|
||||||
|
Returns real-time system resource metrics.
|
||||||
|
|
||||||
|
**Response Fields:**
|
||||||
|
- `cpu.current_usage`: Current CPU usage percent
|
||||||
|
- `cpu.average_usage`: Average CPU usage percent
|
||||||
|
- `cpu.max_usage`: Max observed CPU usage
|
||||||
|
- `cpu.min_usage`: Min observed CPU usage
|
||||||
|
- `cpu.measurement_count`: Number of measurements
|
||||||
|
- `cpu.is_measuring`: Whether measurement is active
|
||||||
|
- `memory.free_heap`: Free heap bytes
|
||||||
|
- `memory.total_heap`: Total heap bytes (approximate)
|
||||||
|
- `memory.heap_fragmentation`: Fragmentation percent (0 on ESP8266)
|
||||||
|
- `filesystem.total_bytes`: LittleFS total bytes
|
||||||
|
- `filesystem.used_bytes`: Used bytes
|
||||||
|
- `filesystem.free_bytes`: Free bytes
|
||||||
|
- `filesystem.usage_percent`: Usage percent
|
||||||
|
- `system.uptime_ms`: Uptime in milliseconds
|
||||||
|
Example Response:
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"cpu": {
|
||||||
|
"current_usage": 3.5,
|
||||||
|
"average_usage": 2.1,
|
||||||
|
"max_usage": 15.2,
|
||||||
|
"min_usage": 0.0,
|
||||||
|
"measurement_count": 120,
|
||||||
|
"is_measuring": true
|
||||||
|
},
|
||||||
|
"memory": {
|
||||||
|
"free_heap": 48748,
|
||||||
|
"total_heap": 81920,
|
||||||
|
"heap_fragmentation": 0
|
||||||
|
},
|
||||||
|
"filesystem": {
|
||||||
|
"total_bytes": 65536,
|
||||||
|
"used_bytes": 10240,
|
||||||
|
"free_bytes": 55296,
|
||||||
|
"usage_percent": 15.6
|
||||||
|
},
|
||||||
|
"system": {
|
||||||
|
"uptime_ms": 123456
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
### Network Management
|
### Network Management
|
||||||
|
|
||||||
#### GET /api/network/status
|
#### GET /api/network/status
|
||||||
|
|||||||
@@ -25,9 +25,9 @@ The system architecture consists of several key components working together:
|
|||||||
- **Service Registry**: Track available services across the cluster
|
- **Service Registry**: Track available services across the cluster
|
||||||
|
|
||||||
### Task Scheduler
|
### Task Scheduler
|
||||||
- **Cooperative Multitasking**: Background task management system
|
- **Cooperative Multitasking**: Background task management system (`TaskManager`)
|
||||||
- **Task Lifecycle Management**: Automatic task execution and monitoring
|
- **Task Lifecycle Management**: Enable/disable tasks and set intervals at runtime
|
||||||
- **Resource Optimization**: Efficient task scheduling and execution
|
- **Execution Model**: Tasks run in `Spore::loop()` when their interval elapses
|
||||||
|
|
||||||
### Node Context
|
### Node Context
|
||||||
- **Central Context**: Shared resources and configuration
|
- **Central Context**: Shared resources and configuration
|
||||||
@@ -40,27 +40,75 @@ The cluster uses a UDP-based discovery protocol for automatic node detection:
|
|||||||
|
|
||||||
### Discovery Process
|
### Discovery Process
|
||||||
|
|
||||||
1. **Discovery Broadcast**: Nodes periodically send UDP packets on port 4210
|
1. **Discovery Broadcast**: Nodes periodically send UDP packets on port `udp_port` (default 4210)
|
||||||
2. **Response Handling**: Nodes respond with their hostname and IP address
|
2. **Response Handling**: Nodes respond with `CLUSTER_RESPONSE:<hostname>`
|
||||||
3. **Member Management**: Discovered nodes are automatically added to the cluster
|
3. **Member Management**: Discovered nodes are added/updated in the cluster
|
||||||
4. **Health Monitoring**: Continuous status checking via HTTP API calls
|
4. **Node Info via UDP**: Heartbeat triggers peers to send `CLUSTER_NODE_INFO:<hostname>:<json>`
|
||||||
|
|
||||||
### Protocol Details
|
### Protocol Details
|
||||||
|
|
||||||
- **UDP Port**: 4210 (configurable)
|
- **UDP Port**: 4210 (configurable via `Config.udp_port`)
|
||||||
- **Discovery Message**: `CLUSTER_DISCOVERY`
|
- **Discovery Message**: `CLUSTER_DISCOVERY`
|
||||||
- **Response Message**: `CLUSTER_RESPONSE`
|
- **Response Message**: `CLUSTER_RESPONSE`
|
||||||
|
- **Heartbeat Message**: `CLUSTER_HEARTBEAT:<hostname>`
|
||||||
|
- **Node Info Message**: `CLUSTER_NODE_INFO:<hostname>:<json>`
|
||||||
- **Broadcast Address**: 255.255.255.255
|
- **Broadcast Address**: 255.255.255.255
|
||||||
- **Discovery Interval**: 1 second (configurable)
|
- **Discovery Interval**: `Config.discovery_interval_ms` (default 1000 ms)
|
||||||
- **Listen Interval**: 100ms (configurable)
|
- **Listen Interval**: `Config.cluster_listen_interval_ms` (default 10 ms)
|
||||||
|
- **Heartbeat Interval**: `Config.heartbeat_interval_ms` (default 5000 ms)
|
||||||
|
|
||||||
|
### Message Formats
|
||||||
|
|
||||||
|
- **Discovery**: `CLUSTER_DISCOVERY`
|
||||||
|
- Sender: any node, broadcast to 255.255.255.255:`udp_port`
|
||||||
|
- Purpose: announce presence and solicit peer identification
|
||||||
|
- **Response**: `CLUSTER_RESPONSE:<hostname>`
|
||||||
|
- Sender: node receiving a discovery; unicast to requester IP
|
||||||
|
- Purpose: provide hostname so requester can register/update member
|
||||||
|
- **Heartbeat**: `CLUSTER_HEARTBEAT:<hostname>`
|
||||||
|
- Sender: each node, broadcast to 255.255.255.255:`udp_port` on interval
|
||||||
|
- Purpose: prompt peers to reply with their node info and keep liveness
|
||||||
|
- **Node Info**: `CLUSTER_NODE_INFO:<hostname>:<json>`
|
||||||
|
- Sender: node receiving a heartbeat; unicast to heartbeat sender IP
|
||||||
|
- JSON fields: freeHeap, chipId, sdkVersion, cpuFreqMHz, flashChipSize, optional labels
|
||||||
|
|
||||||
|
### Discovery Flow
|
||||||
|
|
||||||
|
1. **Sender broadcasts** `CLUSTER_DISCOVERY`
|
||||||
|
2. **Each receiver responds** with `CLUSTER_RESPONSE:<hostname>` to the sender IP
|
||||||
|
3. **Sender registers/updates** the node using hostname and source IP
|
||||||
|
|
||||||
|
### Heartbeat Flow
|
||||||
|
|
||||||
|
1. **A node broadcasts** `CLUSTER_HEARTBEAT:<hostname>`
|
||||||
|
2. **Each receiver replies** with `CLUSTER_NODE_INFO:<hostname>:<json>` to the heartbeat sender IP
|
||||||
|
3. **The sender**:
|
||||||
|
- Ensures the node exists or creates it with `hostname` and sender IP
|
||||||
|
- Parses JSON and updates resources, labels, `status = ACTIVE`, `lastSeen = now`
|
||||||
|
- Sets `latency = now - lastHeartbeatSentAt` (per-node, measured at heartbeat origin)
|
||||||
|
|
||||||
|
### Listener Behavior
|
||||||
|
|
||||||
|
The `cluster_listen` task parses one UDP packet per run and dispatches by prefix to:
|
||||||
|
- **Discovery** → send `CLUSTER_RESPONSE`
|
||||||
|
- **Heartbeat** → send `CLUSTER_NODE_INFO` JSON
|
||||||
|
- **Response** → add/update node using provided hostname and source IP
|
||||||
|
- **Node Info** → update resources/status/labels and record latency
|
||||||
|
|
||||||
|
### Timing and Intervals
|
||||||
|
|
||||||
|
- **UDP Port**: `Config.udp_port` (default 4210)
|
||||||
|
- **Discovery Interval**: `Config.discovery_interval_ms` (default 1000 ms)
|
||||||
|
- **Listen Interval**: `Config.cluster_listen_interval_ms` (default 10 ms)
|
||||||
|
- **Heartbeat Interval**: `Config.heartbeat_interval_ms` (default 5000 ms)
|
||||||
|
|
||||||
### Node Status Categories
|
### Node Status Categories
|
||||||
|
|
||||||
Nodes are automatically categorized by their activity:
|
Nodes are automatically categorized by their activity:
|
||||||
|
|
||||||
- **ACTIVE**: Responding within 10 seconds
|
- **ACTIVE**: time since `lastSeen` < `node_inactive_threshold_ms` (default 10s)
|
||||||
- **INACTIVE**: No response for 10-60 seconds
|
- **INACTIVE**: time since `lastSeen` ≥ `node_inactive_threshold_ms` and < `node_dead_threshold_ms` (default 120s)
|
||||||
- **DEAD**: No response for over 60 seconds
|
- **DEAD**: time since `lastSeen` ≥ `node_dead_threshold_ms`
|
||||||
|
|
||||||
## Task Scheduling System
|
## Task Scheduling System
|
||||||
|
|
||||||
@@ -68,14 +116,14 @@ The system runs several background tasks at different intervals:
|
|||||||
|
|
||||||
### Core System Tasks
|
### Core System Tasks
|
||||||
|
|
||||||
| Task | Interval | Purpose |
|
| Task | Interval (default) | Purpose |
|
||||||
|------|----------|---------|
|
|------|--------------------|---------|
|
||||||
| **Discovery Send** | 1 second | Send UDP discovery packets |
|
| `cluster_discovery` | 1000 ms | Send UDP discovery packets |
|
||||||
| **Discovery Listen** | 100ms | Listen for discovery responses |
|
| `cluster_listen` | 10 ms | Listen for discovery/heartbeat/node-info |
|
||||||
| **Status Updates** | 1 second | Monitor cluster member health |
|
| `status_update` | 1000 ms | Update node status categories, purge dead |
|
||||||
| **Heartbeat** | 2 seconds | Maintain cluster connectivity |
|
| `heartbeat` | 5000 ms | Broadcast heartbeat and update local resources |
|
||||||
| **Member Info** | 10 seconds | Update detailed node information |
|
| `cluster_update_members_info` | 10000 ms | Reserved; no-op (info via UDP) |
|
||||||
| **Debug Output** | 5 seconds | Print cluster status |
|
| `print_members` | 5000 ms | Log current member list |
|
||||||
|
|
||||||
### Task Management Features
|
### Task Management Features
|
||||||
|
|
||||||
@@ -112,10 +160,7 @@ ctx.fire("cluster_updated", &clusterData);
|
|||||||
|
|
||||||
### Available Events
|
### Available Events
|
||||||
|
|
||||||
- **`node_discovered`**: New node added to cluster
|
- **`node_discovered`**: New node added or local node refreshed
|
||||||
- **`cluster_updated`**: Cluster membership changed
|
|
||||||
- **`resource_update`**: Node resources updated
|
|
||||||
- **`health_check`**: Node health status changed
|
|
||||||
|
|
||||||
## Resource Monitoring
|
## Resource Monitoring
|
||||||
|
|
||||||
@@ -155,10 +200,8 @@ The system includes automatic WiFi fallback for robust operation:
|
|||||||
|
|
||||||
### Configuration
|
### Configuration
|
||||||
|
|
||||||
- **SSID Format**: `SPORE_<MAC_LAST_4>`
|
- **Hostname**: Derived from MAC (`esp-<mac>`) and assigned to `ctx.hostname`
|
||||||
- **Password**: Configurable fallback password
|
- **AP Mode**: If STA connection fails, device switches to AP mode with configured SSID/password
|
||||||
- **IP Range**: 192.168.4.x subnet
|
|
||||||
- **Gateway**: 192.168.4.1
|
|
||||||
|
|
||||||
## Cluster Topology
|
## Cluster Topology
|
||||||
|
|
||||||
@@ -170,32 +213,30 @@ The system includes automatic WiFi fallback for robust operation:
|
|||||||
|
|
||||||
### Network Architecture
|
### Network Architecture
|
||||||
|
|
||||||
- **Mesh-like Structure**: Nodes can communicate with each other
|
- UDP broadcast-based discovery and heartbeats on local subnet
|
||||||
- **Dynamic Routing**: Automatic path discovery between nodes
|
- Optional HTTP polling (disabled by default; node info exchanged via UDP)
|
||||||
- **Load Distribution**: Tasks distributed across available nodes
|
|
||||||
- **Fault Tolerance**: Automatic failover and recovery
|
|
||||||
|
|
||||||
## Data Flow
|
## Data Flow
|
||||||
|
|
||||||
### Node Discovery
|
### Node Discovery
|
||||||
1. **UDP Broadcast**: Nodes broadcast discovery packets on port 4210
|
1. **UDP Broadcast**: Nodes broadcast discovery packets on port 4210
|
||||||
2. **UDP Response**: Receiving nodes responds with hostname
|
2. **UDP Response**: Receiving nodes respond with hostname
|
||||||
3. **Registration**: Discovered nodes are added to local cluster member list
|
3. **Registration**: Discovered nodes are added to local cluster member list
|
||||||
|
|
||||||
### Health Monitoring
|
### Health Monitoring
|
||||||
1. **Periodic Checks**: Cluster manager polls member nodes every 1 second
|
1. **Periodic Checks**: Cluster manager updates node status categories
|
||||||
2. **Status Collection**: Each node returns resource usage and health metrics
|
2. **Status Collection**: Each node updates resources via UDP node-info messages
|
||||||
|
|
||||||
### Task Management
|
### Task Management
|
||||||
1. **Scheduling**: TaskScheduler executes registered tasks at configured intervals
|
1. **Scheduling**: `TaskManager` executes registered tasks at configured intervals
|
||||||
2. **Execution**: Tasks run cooperatively, yielding control to other tasks
|
2. **Execution**: Tasks run cooperatively in the main loop without preemption
|
||||||
3. **Monitoring**: Task status and results are exposed via REST API endpoints
|
3. **Monitoring**: Task status is exposed via REST (`/api/tasks/status`)
|
||||||
|
|
||||||
## Performance Characteristics
|
## Performance Characteristics
|
||||||
|
|
||||||
### Memory Usage
|
### Memory Usage
|
||||||
|
|
||||||
- **Base System**: ~15-20KB RAM
|
- **Base System**: ~15-20KB RAM (device dependent)
|
||||||
- **Per Task**: ~100-200 bytes per task
|
- **Per Task**: ~100-200 bytes per task
|
||||||
- **Cluster Members**: ~50-100 bytes per member
|
- **Cluster Members**: ~50-100 bytes per member
|
||||||
- **API Endpoints**: ~20-30 bytes per endpoint
|
- **API Endpoints**: ~20-30 bytes per endpoint
|
||||||
@@ -219,7 +260,7 @@ The system includes automatic WiFi fallback for robust operation:
|
|||||||
### Current Implementation
|
### Current Implementation
|
||||||
|
|
||||||
- **Network Access**: Local network only (no internet exposure)
|
- **Network Access**: Local network only (no internet exposure)
|
||||||
- **Authentication**: None currently implemented
|
- **Authentication**: None currently implemented; LAN-only access assumed
|
||||||
- **Data Validation**: Basic input validation
|
- **Data Validation**: Basic input validation
|
||||||
- **Resource Limits**: Memory and processing constraints
|
- **Resource Limits**: Memory and processing constraints
|
||||||
|
|
||||||
|
|||||||
@@ -20,19 +20,27 @@
|
|||||||
|
|
||||||
```
|
```
|
||||||
spore/
|
spore/
|
||||||
├── src/ # Source code
|
├── src/ # Source code (framework under src/spore)
|
||||||
│ ├── main.cpp # Main application entry point
|
│ └── spore/
|
||||||
│ ├── ApiServer.cpp # HTTP API server implementation
|
│ ├── Spore.cpp # Framework lifecycle (setup/begin/loop)
|
||||||
│ ├── ClusterManager.cpp # Cluster management logic
|
│ ├── core/ # Core components
|
||||||
│ ├── NetworkManager.cpp # WiFi and network handling
|
│ │ ├── ApiServer.cpp # HTTP API server implementation
|
||||||
│ ├── TaskManager.cpp # Background task management
|
│ │ ├── ClusterManager.cpp # Cluster management logic
|
||||||
│ └── NodeContext.cpp # Central context and events
|
│ │ ├── NetworkManager.cpp # WiFi and network handling
|
||||||
|
│ │ ├── TaskManager.cpp # Background task management
|
||||||
|
│ │ └── NodeContext.cpp # Central context and events
|
||||||
|
│ ├── services/ # Built-in services
|
||||||
|
│ │ ├── NodeService.cpp
|
||||||
|
│ │ ├── NetworkService.cpp
|
||||||
|
│ │ ├── ClusterService.cpp
|
||||||
|
│ │ ├── TaskService.cpp
|
||||||
|
│ │ ├── StaticFileService.cpp
|
||||||
|
│ │ └── MonitoringService.cpp
|
||||||
|
│ └── types/ # Shared types
|
||||||
├── include/ # Header files
|
├── include/ # Header files
|
||||||
├── lib/ # Library files
|
├── examples/ # Example apps per env (base, relay, neopattern)
|
||||||
├── docs/ # Documentation
|
├── docs/ # Documentation
|
||||||
├── api/ # OpenAPI specification
|
├── api/ # OpenAPI specification
|
||||||
├── examples/ # Example code
|
|
||||||
├── test/ # Test files
|
|
||||||
├── platformio.ini # PlatformIO configuration
|
├── platformio.ini # PlatformIO configuration
|
||||||
└── ctl.sh # Build and deployment scripts
|
└── ctl.sh # Build and deployment scripts
|
||||||
```
|
```
|
||||||
@@ -41,36 +49,70 @@ spore/
|
|||||||
|
|
||||||
### Framework and Board
|
### Framework and Board
|
||||||
|
|
||||||
The project uses PlatformIO with the following configuration:
|
The project uses PlatformIO with the following configuration (excerpt):
|
||||||
|
|
||||||
```ini
|
```ini
|
||||||
[env:esp01_1m]
|
[platformio]
|
||||||
|
default_envs = base
|
||||||
|
src_dir = .
|
||||||
|
data_dir = ${PROJECT_DIR}/examples/${PIOENV}/data
|
||||||
|
|
||||||
|
[common]
|
||||||
|
monitor_speed = 115200
|
||||||
|
lib_deps =
|
||||||
|
esp32async/ESPAsyncWebServer@^3.8.0
|
||||||
|
bblanchon/ArduinoJson@^7.4.2
|
||||||
|
|
||||||
|
[env:base]
|
||||||
platform = platformio/espressif8266@^4.2.1
|
platform = platformio/espressif8266@^4.2.1
|
||||||
board = esp01_1m
|
board = esp01_1m
|
||||||
framework = arduino
|
framework = arduino
|
||||||
upload_speed = 115200
|
upload_speed = 115200
|
||||||
flash_mode = dout
|
monitor_speed = 115200
|
||||||
|
board_build.f_cpu = 80000000L
|
||||||
|
board_build.flash_mode = qio
|
||||||
|
board_build.filesystem = littlefs
|
||||||
|
; note: custom partition tables are not applied on ESP8266, so the flash layout is selected via the ldscript
|
||||||
|
board_build.ldscript = eagle.flash.1m64.ld
|
||||||
|
lib_deps = ${common.lib_deps}
|
||||||
|
build_src_filter =
|
||||||
|
+<examples/base/*.cpp>
|
||||||
|
+<src/spore/*.cpp>
|
||||||
|
+<src/spore/core/*.cpp>
|
||||||
|
+<src/spore/services/*.cpp>
|
||||||
|
+<src/spore/types/*.cpp>
|
||||||
|
+<src/spore/util/*.cpp>
|
||||||
|
+<src/internal/*.cpp>
|
||||||
|
|
||||||
|
[env:d1_mini]
|
||||||
|
platform = platformio/espressif8266@^4.2.1
|
||||||
|
board = d1_mini
|
||||||
|
framework = arduino
|
||||||
|
upload_speed = 115200
|
||||||
|
monitor_speed = 115200
|
||||||
|
board_build.filesystem = littlefs
|
||||||
|
board_build.flash_mode = dio ; D1 Mini uses DIO on 4 MB flash
|
||||||
|
board_build.flash_size = 4M
|
||||||
|
board_build.ldscript = eagle.flash.4m1m.ld
|
||||||
|
lib_deps = ${common.lib_deps}
|
||||||
|
build_src_filter =
|
||||||
|
+<examples/base/*.cpp>
|
||||||
|
+<src/spore/*.cpp>
|
||||||
|
+<src/spore/core/*.cpp>
|
||||||
|
+<src/spore/services/*.cpp>
|
||||||
|
+<src/spore/types/*.cpp>
|
||||||
|
+<src/spore/util/*.cpp>
|
||||||
|
+<src/internal/*.cpp>
|
||||||
```
|
```
|
||||||
|
|
||||||
### Key Configuration Details
|
|
||||||
|
|
||||||
- **Framework**: Arduino
|
|
||||||
- **Board**: ESP-01 with 1MB flash
|
|
||||||
- **Upload Speed**: 115200 baud
|
|
||||||
- **Flash Mode**: DOUT (required for ESP-01S)
|
|
||||||
- **Build Type**: Release (optimized for production)
|
|
||||||
|
|
||||||
### Dependencies
|
### Dependencies
|
||||||
|
|
||||||
The project requires the following libraries:
|
The project requires the following libraries (resolved via PlatformIO):
|
||||||
|
|
||||||
```ini
|
```ini
|
||||||
lib_deps =
|
lib_deps =
|
||||||
esp32async/ESPAsyncWebServer@^3.8.0
|
esp32async/ESPAsyncWebServer@^3.8.0
|
||||||
bblanchon/ArduinoJson@^7.4.2
|
bblanchon/ArduinoJson@^7.4.2
|
||||||
arkhipenko/TaskScheduler@^3.8.5
|
|
||||||
ESP8266HTTPClient@1.2
|
|
||||||
ESP8266WiFi@1.0
|
|
||||||
```
|
```
|
||||||
|
|
||||||
### Filesystem, Linker Scripts, and Flash Layout
|
### Filesystem, Linker Scripts, and Flash Layout
|
||||||
@@ -103,7 +145,6 @@ Notes:
|
|||||||
- If you need a different FS size, select an appropriate ldscript variant and keep `board_build.filesystem = littlefs`.
|
- If you need a different FS size, select an appropriate ldscript variant and keep `board_build.filesystem = littlefs`.
|
||||||
- On ESP8266, custom partition CSVs are not used for layout; the linker script defines the flash map. This project removed prior `board_build.partitions` usage in favor of explicit `board_build.ldscript` entries per environment.
|
- On ESP8266, custom partition CSVs are not used for layout; the linker script defines the flash map. This project removed prior `board_build.partitions` usage in favor of explicit `board_build.ldscript` entries per environment.
|
||||||
|
|
||||||
|
|
||||||
## Building
|
## Building
|
||||||
|
|
||||||
### Basic Build Commands
|
### Basic Build Commands
|
||||||
@@ -308,7 +349,7 @@ export API_NODE=192.168.1.100
|
|||||||
Key configuration files:
|
Key configuration files:
|
||||||
|
|
||||||
- **`platformio.ini`**: Build and upload configuration
|
- **`platformio.ini`**: Build and upload configuration
|
||||||
- **`src/Config.cpp`**: Application configuration
|
- **`src/spore/types/Config.cpp`**: Default runtime configuration
|
||||||
- **`.env`**: Environment variables
|
- **`.env`**: Environment variables
|
||||||
- **`ctl.sh`**: Build and deployment scripts
|
- **`ctl.sh`**: Build and deployment scripts
|
||||||
|
|
||||||
|
|||||||
79
docs/MonitoringService.md
Normal file
79
docs/MonitoringService.md
Normal file
@@ -0,0 +1,79 @@
|
|||||||
|
# Monitoring Service
|
||||||
|
|
||||||
|
Exposes system resource metrics via HTTP for observability.
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
- **Service name**: `MonitoringService`
|
||||||
|
- **Endpoint**: `GET /api/monitoring/resources`
|
||||||
|
- **Metrics**: CPU usage, memory, filesystem, uptime
|
||||||
|
|
||||||
|
## Endpoint
|
||||||
|
|
||||||
|
### GET /api/monitoring/resources
|
||||||
|
|
||||||
|
Returns real-time system resource metrics.
|
||||||
|
|
||||||
|
Response fields:
|
||||||
|
- `cpu.current_usage`: Current CPU usage percent
|
||||||
|
- `cpu.average_usage`: Average CPU usage percent
|
||||||
|
- `cpu.max_usage`: Max observed CPU usage
|
||||||
|
- `cpu.min_usage`: Min observed CPU usage
|
||||||
|
- `cpu.measurement_count`: Number of measurements
|
||||||
|
- `cpu.is_measuring`: Whether measurement is active
|
||||||
|
- `memory.free_heap`: Free heap bytes
|
||||||
|
- `memory.total_heap`: Total heap bytes (approximate)
|
||||||
|
- `memory.min_free_heap`: Minimum free heap (0 on ESP8266)
|
||||||
|
- `memory.max_alloc_heap`: Max allocatable heap (0 on ESP8266)
|
||||||
|
- `memory.heap_fragmentation`: Fragmentation percent (0 on ESP8266)
|
||||||
|
- `filesystem.total_bytes`: LittleFS total bytes
|
||||||
|
- `filesystem.used_bytes`: Used bytes
|
||||||
|
- `filesystem.free_bytes`: Free bytes
|
||||||
|
- `filesystem.usage_percent`: Usage percent
|
||||||
|
- `system.uptime_ms`: Uptime in milliseconds
|
||||||
|
- `system.uptime_seconds`: Uptime in seconds
|
||||||
|
- `system.uptime_formatted`: Human-readable uptime
|
||||||
|
|
||||||
|
Example:
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"cpu": {
|
||||||
|
"current_usage": 3.5,
|
||||||
|
"average_usage": 2.1,
|
||||||
|
"max_usage": 15.2,
|
||||||
|
"min_usage": 0.0,
|
||||||
|
"measurement_count": 120,
|
||||||
|
"is_measuring": true
|
||||||
|
},
|
||||||
|
"memory": {
|
||||||
|
"free_heap": 48748,
|
||||||
|
"total_heap": 81920,
|
||||||
|
"min_free_heap": 0,
|
||||||
|
"max_alloc_heap": 0,
|
||||||
|
"heap_fragmentation": 0,
|
||||||
|
"heap_usage_percent": 40.4
|
||||||
|
},
|
||||||
|
"filesystem": {
|
||||||
|
"total_bytes": 65536,
|
||||||
|
"used_bytes": 10240,
|
||||||
|
"free_bytes": 55296,
|
||||||
|
"usage_percent": 15.6
|
||||||
|
},
|
||||||
|
"system": {
|
||||||
|
"uptime_ms": 123456,
|
||||||
|
"uptime_seconds": 123,
|
||||||
|
"uptime_formatted": "0h 2m 3s"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Implementation Notes
|
||||||
|
|
||||||
|
- `MonitoringService` reads from `CpuUsage` and ESP8266 SDK APIs.
|
||||||
|
- Filesystem metrics are gathered from LittleFS.
|
||||||
|
- CPU measurement is bracketed by `Spore::loop()` calling `cpuUsage.startMeasurement()` and `cpuUsage.endMeasurement()`.
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
- If `filesystem.total_bytes` is zero, ensure LittleFS is enabled in `platformio.ini` and an FS image is uploaded.
|
||||||
|
- CPU usage values remain zero until the main loop runs and CPU measurement is started.
|
||||||
@@ -15,15 +15,8 @@ Complete API reference with detailed endpoint documentation, examples, and integ
|
|||||||
- Task management workflows
|
- Task management workflows
|
||||||
- Cluster monitoring examples
|
- Cluster monitoring examples
|
||||||
|
|
||||||
### 📖 [TaskManager.md](./TaskManager.md)
|
### 📖 [MonitoringService.md](./MonitoringService.md)
|
||||||
Comprehensive guide to the TaskManager system for background task management.
|
System resource monitoring API for CPU, memory, filesystem, and uptime.
|
||||||
|
|
||||||
**Includes:**
|
|
||||||
- Basic usage examples
|
|
||||||
- Advanced binding techniques
|
|
||||||
- Task status monitoring
|
|
||||||
- API integration details
|
|
||||||
- Performance considerations
|
|
||||||
|
|
||||||
### 📖 [TaskManagement.md](./TaskManagement.md)
|
### 📖 [TaskManagement.md](./TaskManagement.md)
|
||||||
Complete guide to the task management system with examples and best practices.
|
Complete guide to the task management system with examples and best practices.
|
||||||
|
|||||||
@@ -319,18 +319,18 @@ curl -X POST http://192.168.1.100/api/tasks/control \
|
|||||||
### Before (with wrapper functions):
|
### Before (with wrapper functions):
|
||||||
```cpp
|
```cpp
|
||||||
void discoverySendTask() { cluster.sendDiscovery(); }
|
void discoverySendTask() { cluster.sendDiscovery(); }
|
||||||
void discoveryListenTask() { cluster.listenForDiscovery(); }
|
void clusterListenTask() { cluster.listen(); }
|
||||||
|
|
||||||
taskManager.registerTask("discovery_send", interval, discoverySendTask);
|
taskManager.registerTask("discovery_send", interval, discoverySendTask);
|
||||||
taskManager.registerTask("discovery_listen", interval, discoveryListenTask);
|
taskManager.registerTask("cluster_listen", interval, clusterListenTask);
|
||||||
```
|
```
|
||||||
|
|
||||||
### After (with std::bind):
|
### After (with std::bind):
|
||||||
```cpp
|
```cpp
|
||||||
taskManager.registerTask("discovery_send", interval,
|
taskManager.registerTask("discovery_send", interval,
|
||||||
std::bind(&ClusterManager::sendDiscovery, &cluster));
|
std::bind(&ClusterManager::sendDiscovery, &cluster));
|
||||||
taskManager.registerTask("discovery_listen", interval,
|
taskManager.registerTask("cluster_listen", interval,
|
||||||
std::bind(&ClusterManager::listenForDiscovery, &cluster));
|
std::bind(&ClusterManager::listen, &cluster));
|
||||||
```
|
```
|
||||||
|
|
||||||
## Compatibility
|
## Compatibility
|
||||||
|
|||||||
@@ -7,13 +7,15 @@
|
|||||||
#include <ArduinoJson.h>
|
#include <ArduinoJson.h>
|
||||||
#include <ESP8266HTTPClient.h>
|
#include <ESP8266HTTPClient.h>
|
||||||
#include <map>
|
#include <map>
|
||||||
|
#include <vector>
|
||||||
|
#include <functional>
|
||||||
|
|
||||||
class ClusterManager {
|
class ClusterManager {
|
||||||
public:
|
public:
|
||||||
ClusterManager(NodeContext& ctx, TaskManager& taskMgr);
|
ClusterManager(NodeContext& ctx, TaskManager& taskMgr);
|
||||||
void registerTasks();
|
void registerTasks();
|
||||||
void sendDiscovery();
|
void sendDiscovery();
|
||||||
void listenForDiscovery();
|
void listen();
|
||||||
void addOrUpdateNode(const String& nodeHost, IPAddress nodeIP);
|
void addOrUpdateNode(const String& nodeHost, IPAddress nodeIP);
|
||||||
void updateAllNodeStatuses();
|
void updateAllNodeStatuses();
|
||||||
void removeDeadNodes();
|
void removeDeadNodes();
|
||||||
@@ -26,4 +28,21 @@ public:
|
|||||||
private:
|
private:
|
||||||
NodeContext& ctx;
|
NodeContext& ctx;
|
||||||
TaskManager& taskManager;
|
TaskManager& taskManager;
|
||||||
|
struct MessageHandler {
|
||||||
|
bool (*predicate)(const char*);
|
||||||
|
std::function<void(const char*)> handle;
|
||||||
|
const char* name;
|
||||||
|
};
|
||||||
|
void initMessageHandlers();
|
||||||
|
void handleIncomingMessage(const char* incoming);
|
||||||
|
static bool isDiscoveryMsg(const char* msg);
|
||||||
|
static bool isHeartbeatMsg(const char* msg);
|
||||||
|
static bool isResponseMsg(const char* msg);
|
||||||
|
static bool isNodeInfoMsg(const char* msg);
|
||||||
|
void onDiscovery(const char* msg);
|
||||||
|
void onHeartbeat(const char* msg);
|
||||||
|
void onResponse(const char* msg);
|
||||||
|
void onNodeInfo(const char* msg);
|
||||||
|
unsigned long lastHeartbeatSentAt = 0;
|
||||||
|
std::vector<MessageHandler> messageHandlers;
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -7,8 +7,11 @@
|
|||||||
/// Wire-level constants shared by every node in the cluster.
namespace ClusterProtocol {
    // Message tags. Each UDP datagram starts with one of these; everything
    // except DISCOVERY_MSG is followed by ":" and a payload.
    constexpr const char* DISCOVERY_MSG = "CLUSTER_DISCOVERY";
    constexpr const char* RESPONSE_MSG = "CLUSTER_RESPONSE";
    constexpr const char* HEARTBEAT_MSG = "CLUSTER_HEARTBEAT";
    constexpr const char* NODE_INFO_MSG = "CLUSTER_NODE_INFO";

    // UDP transport parameters.
    constexpr uint16_t UDP_PORT = 4210;
    // Receive buffer size in bytes; sized to hold a NODE_INFO message
    // (tag, hostname and the node-info JSON) in a single datagram.
    constexpr size_t UDP_BUF_SIZE = 512;

    // HTTP API path for node status.
    constexpr const char* API_NODE_STATUS = "/api/node/status";
}
|
||||||
|
|
||||||
|
|||||||
@@ -15,6 +15,7 @@ public:
|
|||||||
// Cluster Configuration
|
// Cluster Configuration
|
||||||
unsigned long discovery_interval_ms;
|
unsigned long discovery_interval_ms;
|
||||||
unsigned long heartbeat_interval_ms;
|
unsigned long heartbeat_interval_ms;
|
||||||
|
unsigned long cluster_listen_interval_ms;
|
||||||
unsigned long status_update_interval_ms;
|
unsigned long status_update_interval_ms;
|
||||||
unsigned long member_info_update_interval_ms;
|
unsigned long member_info_update_interval_ms;
|
||||||
unsigned long print_interval_ms;
|
unsigned long print_interval_ms;
|
||||||
|
|||||||
@@ -17,7 +17,7 @@ struct NodeInfo {
|
|||||||
uint32_t cpuFreqMHz = 0;
|
uint32_t cpuFreqMHz = 0;
|
||||||
uint32_t flashChipSize = 0;
|
uint32_t flashChipSize = 0;
|
||||||
} resources;
|
} resources;
|
||||||
unsigned long latency = 0; // ms since lastSeen
|
unsigned long latency = 0; // ms from heartbeat broadcast to NODE_INFO receipt
|
||||||
std::vector<EndpointInfo> endpoints; // List of registered endpoints
|
std::vector<EndpointInfo> endpoints; // List of registered endpoints
|
||||||
std::map<String, String> labels; // Arbitrary node labels (key -> value)
|
std::map<String, String> labels; // Arbitrary node labels (key -> value)
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -10,15 +10,16 @@ ClusterManager::ClusterManager(NodeContext& ctx, TaskManager& taskMgr) : ctx(ctx
|
|||||||
});
|
});
|
||||||
// Register tasks
|
// Register tasks
|
||||||
registerTasks();
|
registerTasks();
|
||||||
|
initMessageHandlers();
|
||||||
}
|
}
|
||||||
|
|
||||||
void ClusterManager::registerTasks() {
|
void ClusterManager::registerTasks() {
|
||||||
taskManager.registerTask("discovery_send", ctx.config.discovery_interval_ms, [this]() { sendDiscovery(); });
|
taskManager.registerTask("cluster_discovery", ctx.config.discovery_interval_ms, [this]() { sendDiscovery(); });
|
||||||
taskManager.registerTask("discovery_listen", ctx.config.discovery_interval_ms / 10, [this]() { listenForDiscovery(); });
|
taskManager.registerTask("cluster_listen", ctx.config.cluster_listen_interval_ms, [this]() { listen(); });
|
||||||
taskManager.registerTask("status_update", ctx.config.status_update_interval_ms, [this]() { updateAllNodeStatuses(); removeDeadNodes(); });
|
taskManager.registerTask("status_update", ctx.config.status_update_interval_ms, [this]() { updateAllNodeStatuses(); removeDeadNodes(); });
|
||||||
taskManager.registerTask("print_members", ctx.config.print_interval_ms, [this]() { printMemberList(); });
|
taskManager.registerTask("print_members", ctx.config.print_interval_ms, [this]() { printMemberList(); });
|
||||||
taskManager.registerTask("heartbeat", ctx.config.heartbeat_interval_ms, [this]() { heartbeatTaskCallback(); });
|
taskManager.registerTask("heartbeat", ctx.config.heartbeat_interval_ms, [this]() { heartbeatTaskCallback(); });
|
||||||
taskManager.registerTask("update_members_info", ctx.config.member_info_update_interval_ms, [this]() { updateAllMembersInfoTaskCallback(); });
|
taskManager.registerTask("cluster_update_members_info", ctx.config.member_info_update_interval_ms, [this]() { updateAllMembersInfoTaskCallback(); });
|
||||||
LOG_INFO("ClusterManager", "Registered all cluster tasks");
|
LOG_INFO("ClusterManager", "Registered all cluster tasks");
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -29,26 +30,146 @@ void ClusterManager::sendDiscovery() {
|
|||||||
ctx.udp->endPacket();
|
ctx.udp->endPacket();
|
||||||
}
|
}
|
||||||
|
|
||||||
void ClusterManager::listenForDiscovery() {
|
void ClusterManager::listen() {
|
||||||
int packetSize = ctx.udp->parsePacket();
|
int packetSize = ctx.udp->parsePacket();
|
||||||
if (packetSize) {
|
if (!packetSize) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
char incoming[ClusterProtocol::UDP_BUF_SIZE];
|
char incoming[ClusterProtocol::UDP_BUF_SIZE];
|
||||||
int len = ctx.udp->read(incoming, ClusterProtocol::UDP_BUF_SIZE);
|
int len = ctx.udp->read(incoming, ClusterProtocol::UDP_BUF_SIZE);
|
||||||
if (len > 0) {
|
if (len <= 0) {
|
||||||
incoming[len] = 0;
|
return;
|
||||||
}
|
}
|
||||||
//LOG_DEBUG(ctx, "UDP", "Packet received: " + String(incoming));
|
incoming[len] = 0;
|
||||||
if (strcmp(incoming, ClusterProtocol::DISCOVERY_MSG) == 0) {
|
handleIncomingMessage(incoming);
|
||||||
//LOG_DEBUG(ctx, "UDP", "Discovery request from: " + ctx.udp->remoteIP().toString());
|
}
|
||||||
|
|
||||||
|
void ClusterManager::initMessageHandlers() {
|
||||||
|
messageHandlers.clear();
|
||||||
|
messageHandlers.push_back({ &ClusterManager::isDiscoveryMsg, [this](const char* msg){ this->onDiscovery(msg); }, "DISCOVERY" });
|
||||||
|
messageHandlers.push_back({ &ClusterManager::isHeartbeatMsg, [this](const char* msg){ this->onHeartbeat(msg); }, "HEARTBEAT" });
|
||||||
|
messageHandlers.push_back({ &ClusterManager::isResponseMsg, [this](const char* msg){ this->onResponse(msg); }, "RESPONSE" });
|
||||||
|
messageHandlers.push_back({ &ClusterManager::isNodeInfoMsg, [this](const char* msg){ this->onNodeInfo(msg); }, "NODE_INFO" });
|
||||||
|
}
|
||||||
|
|
||||||
|
void ClusterManager::handleIncomingMessage(const char* incoming) {
|
||||||
|
for (const auto& h : messageHandlers) {
|
||||||
|
if (h.predicate(incoming)) {
|
||||||
|
h.handle(incoming);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
bool ClusterManager::isDiscoveryMsg(const char* msg) {
|
||||||
|
return strcmp(msg, ClusterProtocol::DISCOVERY_MSG) == 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool ClusterManager::isHeartbeatMsg(const char* msg) {
|
||||||
|
return strncmp(msg, ClusterProtocol::HEARTBEAT_MSG, strlen(ClusterProtocol::HEARTBEAT_MSG)) == 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool ClusterManager::isResponseMsg(const char* msg) {
|
||||||
|
return strncmp(msg, ClusterProtocol::RESPONSE_MSG, strlen(ClusterProtocol::RESPONSE_MSG)) == 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool ClusterManager::isNodeInfoMsg(const char* msg) {
|
||||||
|
return strncmp(msg, ClusterProtocol::NODE_INFO_MSG, strlen(ClusterProtocol::NODE_INFO_MSG)) == 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
void ClusterManager::onDiscovery(const char* /*msg*/) {
|
||||||
ctx.udp->beginPacket(ctx.udp->remoteIP(), ctx.config.udp_port);
|
ctx.udp->beginPacket(ctx.udp->remoteIP(), ctx.config.udp_port);
|
||||||
String response = String(ClusterProtocol::RESPONSE_MSG) + ":" + ctx.hostname;
|
String response = String(ClusterProtocol::RESPONSE_MSG) + ":" + ctx.hostname;
|
||||||
ctx.udp->write(response.c_str());
|
ctx.udp->write(response.c_str());
|
||||||
ctx.udp->endPacket();
|
ctx.udp->endPacket();
|
||||||
//LOG_DEBUG(ctx, "UDP", "Sent response with hostname: " + ctx.hostname);
|
}
|
||||||
} else if (strncmp(incoming, ClusterProtocol::RESPONSE_MSG, strlen(ClusterProtocol::RESPONSE_MSG)) == 0) {
|
|
||||||
char* hostPtr = incoming + strlen(ClusterProtocol::RESPONSE_MSG) + 1;
|
void ClusterManager::onHeartbeat(const char* /*msg*/) {
|
||||||
|
JsonDocument doc;
|
||||||
|
doc["freeHeap"] = ESP.getFreeHeap();
|
||||||
|
doc["chipId"] = ESP.getChipId();
|
||||||
|
doc["sdkVersion"] = ESP.getSdkVersion();
|
||||||
|
doc["cpuFreqMHz"] = ESP.getCpuFreqMHz();
|
||||||
|
doc["flashChipSize"] = ESP.getFlashChipSize();
|
||||||
|
|
||||||
|
if (ctx.memberList) {
|
||||||
|
auto it = ctx.memberList->find(ctx.hostname);
|
||||||
|
if (it != ctx.memberList->end()) {
|
||||||
|
JsonObject labelsObj = doc["labels"].to<JsonObject>();
|
||||||
|
for (const auto& kv : it->second.labels) {
|
||||||
|
labelsObj[kv.first.c_str()] = kv.second;
|
||||||
|
}
|
||||||
|
} else if (!ctx.self.labels.empty()) {
|
||||||
|
JsonObject labelsObj = doc["labels"].to<JsonObject>();
|
||||||
|
for (const auto& kv : ctx.self.labels) {
|
||||||
|
labelsObj[kv.first.c_str()] = kv.second;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
String json;
|
||||||
|
serializeJson(doc, json);
|
||||||
|
|
||||||
|
ctx.udp->beginPacket(ctx.udp->remoteIP(), ctx.config.udp_port);
|
||||||
|
String msg = String(ClusterProtocol::NODE_INFO_MSG) + ":" + ctx.hostname + ":" + json;
|
||||||
|
ctx.udp->write(msg.c_str());
|
||||||
|
ctx.udp->endPacket();
|
||||||
|
}
|
||||||
|
|
||||||
|
void ClusterManager::onResponse(const char* msg) {
|
||||||
|
char* hostPtr = const_cast<char*>(msg) + strlen(ClusterProtocol::RESPONSE_MSG) + 1;
|
||||||
String nodeHost = String(hostPtr);
|
String nodeHost = String(hostPtr);
|
||||||
addOrUpdateNode(nodeHost, ctx.udp->remoteIP());
|
addOrUpdateNode(nodeHost, ctx.udp->remoteIP());
|
||||||
|
}
|
||||||
|
|
||||||
|
void ClusterManager::onNodeInfo(const char* msg) {
|
||||||
|
char* p = const_cast<char*>(msg) + strlen(ClusterProtocol::NODE_INFO_MSG) + 1;
|
||||||
|
char* hostEnd = strchr(p, ':');
|
||||||
|
if (hostEnd) {
|
||||||
|
*hostEnd = '\0';
|
||||||
|
const char* hostCStr = p;
|
||||||
|
const char* jsonCStr = hostEnd + 1;
|
||||||
|
|
||||||
|
String nodeHost = String(hostCStr);
|
||||||
|
IPAddress senderIP = ctx.udp->remoteIP();
|
||||||
|
|
||||||
|
addOrUpdateNode(nodeHost, senderIP);
|
||||||
|
|
||||||
|
JsonDocument doc;
|
||||||
|
DeserializationError err = deserializeJson(doc, jsonCStr);
|
||||||
|
if (!err) {
|
||||||
|
auto& memberList = *ctx.memberList;
|
||||||
|
auto it = memberList.find(nodeHost);
|
||||||
|
if (it != memberList.end()) {
|
||||||
|
NodeInfo& node = it->second;
|
||||||
|
node.resources.freeHeap = doc["freeHeap"] | node.resources.freeHeap;
|
||||||
|
node.resources.chipId = doc["chipId"] | node.resources.chipId;
|
||||||
|
{
|
||||||
|
const char* sdk = doc["sdkVersion"] | node.resources.sdkVersion.c_str();
|
||||||
|
node.resources.sdkVersion = sdk ? String(sdk) : node.resources.sdkVersion;
|
||||||
|
}
|
||||||
|
node.resources.cpuFreqMHz = doc["cpuFreqMHz"] | node.resources.cpuFreqMHz;
|
||||||
|
node.resources.flashChipSize = doc["flashChipSize"] | node.resources.flashChipSize;
|
||||||
|
node.status = NodeInfo::ACTIVE;
|
||||||
|
unsigned long now = millis();
|
||||||
|
node.lastSeen = now;
|
||||||
|
if (lastHeartbeatSentAt != 0) {
|
||||||
|
node.latency = now - lastHeartbeatSentAt;
|
||||||
|
}
|
||||||
|
|
||||||
|
node.labels.clear();
|
||||||
|
if (doc["labels"].is<JsonObject>()) {
|
||||||
|
JsonObject labelsObj = doc["labels"].as<JsonObject>();
|
||||||
|
for (JsonPair kvp : labelsObj) {
|
||||||
|
const char* key = kvp.key().c_str();
|
||||||
|
const char* value = labelsObj[kvp.key()];
|
||||||
|
node.labels[key] = value;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
LOG_DEBUG("Cluster", String("Failed to parse NODE_INFO JSON from ") + senderIP.toString());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -194,31 +315,18 @@ void ClusterManager::heartbeatTaskCallback() {
|
|||||||
updateLocalNodeResources();
|
updateLocalNodeResources();
|
||||||
ctx.fire("node_discovered", &node);
|
ctx.fire("node_discovered", &node);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Broadcast heartbeat so peers can respond with their node info
|
||||||
|
lastHeartbeatSentAt = millis();
|
||||||
|
ctx.udp->beginPacket("255.255.255.255", ctx.config.udp_port);
|
||||||
|
String hb = String(ClusterProtocol::HEARTBEAT_MSG) + ":" + ctx.hostname;
|
||||||
|
ctx.udp->write(hb.c_str());
|
||||||
|
ctx.udp->endPacket();
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Intentionally does nothing.
///
/// Member info is no longer fetched over HTTP; peers supply it over UDP in
/// response to heartbeats, which reduces network and memory usage. The
/// function is kept so existing callers still have something to invoke.
void ClusterManager::updateAllMembersInfoTaskCallback() {
    // No-op by design.
}
|
||||||
|
|
||||||
void ClusterManager::updateAllNodeStatuses() {
|
void ClusterManager::updateAllNodeStatuses() {
|
||||||
|
|||||||
@@ -10,10 +10,11 @@ Config::Config() {
|
|||||||
api_server_port = 80;
|
api_server_port = 80;
|
||||||
|
|
||||||
// Cluster Configuration
|
// Cluster Configuration
|
||||||
discovery_interval_ms = 1000;
|
discovery_interval_ms = 1000; // TODO retire this in favor of heartbeat_interval_ms
|
||||||
heartbeat_interval_ms = 2000;
|
cluster_listen_interval_ms = 10;
|
||||||
|
heartbeat_interval_ms = 5000;
|
||||||
status_update_interval_ms = 1000;
|
status_update_interval_ms = 1000;
|
||||||
member_info_update_interval_ms = 10000;
|
member_info_update_interval_ms = 10000; // TODO retire this in favor of heartbeat_interval_ms
|
||||||
print_interval_ms = 5000;
|
print_interval_ms = 5000;
|
||||||
|
|
||||||
// Node Status Thresholds
|
// Node Status Thresholds
|
||||||
|
|||||||
Reference in New Issue
Block a user