backend-service/internal/monitoring/monitoring.go
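// Package monitoring provides lightweight in-process observability for the
// backend service: an ErrorTracker that counts and time-windows error
// occurrences, a MetricsCollector for request counts and response times, and
// a Monitor that combines both into a health status report.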
package monitoring

import (
	"context"
	"sync"
	"time"

	"attune-heart-therapy/internal/errors"
	"attune-heart-therapy/internal/logger"
)

// ErrorTracker tracks and monitors application errors
type ErrorTracker struct {
	log           *logger.Logger
	errorCounts   map[errors.ErrorCode]int64
	errorTimes    map[errors.ErrorCode][]time.Time
	mu            sync.RWMutex
	windowSize    time.Duration
	maxWindowSize int
}

// NewErrorTracker creates a new error tracker
func NewErrorTracker() *ErrorTracker {
	return &ErrorTracker{
		log:           logger.New("error_tracker"),
		errorCounts:   make(map[errors.ErrorCode]int64),
		errorTimes:    make(map[errors.ErrorCode][]time.Time),
		windowSize:    time.Hour,
		maxWindowSize: 1000, // Keep last 1000 occurrences per error type
	}
}
// TrackError records an error occurrence
func (et *ErrorTracker) TrackError(ctx context.Context, appErr *errors.AppError, userID string, endpoint string) {
	et.mu.Lock()
	defer et.mu.Unlock()

	now := time.Now()

	// Increment error count
	et.errorCounts[appErr.Code]++

	// Add timestamp to error times
	if _, exists := et.errorTimes[appErr.Code]; !exists {
		et.errorTimes[appErr.Code] = make([]time.Time, 0)
	}
	et.errorTimes[appErr.Code] = append(et.errorTimes[appErr.Code], now)

	// Keep only recent errors within window
	et.cleanupOldErrors(appErr.Code, now)

	// Log error with tracking information
	fields := map[string]interface{}{
		"error_code":   appErr.Code,
		"error_count":  et.errorCounts[appErr.Code],
		"recent_count": len(et.errorTimes[appErr.Code]),
		"endpoint":     endpoint,
		"http_status":  appErr.HTTPStatus,
	}
	if userID != "" {
		fields["user_id"] = userID
	}
	if traceID := ctx.Value("trace_id"); traceID != nil {
		fields["trace_id"] = traceID
	}

	// Add error fields if available
	if appErr.Fields != nil {
		for k, v := range appErr.Fields {
			fields["error_"+k] = v
		}
	}

	// Check for error rate spikes
	if et.isErrorSpike(appErr.Code) {
		et.log.Error("Error rate spike detected", appErr.Cause, fields)
		et.alertOnErrorSpike(appErr.Code, fields)
	} else if appErr.HTTPStatus >= 500 {
		et.log.Error("Server error tracked", appErr.Cause, fields)
	} else {
		et.log.Warn("Client error tracked", fields)
	}
}
// cleanupOldErrors removes errors outside the time window
func (et *ErrorTracker) cleanupOldErrors(code errors.ErrorCode, now time.Time) {
	cutoff := now.Add(-et.windowSize)
	times := et.errorTimes[code]

	// Find the first index within the window. Defaulting start to len(times)
	// ensures that if every entry is older than the cutoff, all of them are
	// dropped rather than kept.
	start := len(times)
	for i, t := range times {
		if t.After(cutoff) {
			start = i
			break
		}
	}

	// Keep only recent errors
	et.errorTimes[code] = times[start:]

	// Limit to max window size
	if len(et.errorTimes[code]) > et.maxWindowSize {
		et.errorTimes[code] = et.errorTimes[code][len(et.errorTimes[code])-et.maxWindowSize:]
	}
}
// isErrorSpike checks if there's an unusual spike in error rate
func (et *ErrorTracker) isErrorSpike(code errors.ErrorCode) bool {
	times := et.errorTimes[code]
	if len(times) < 10 {
		return false
	}

	// Check if there are 10 or more errors in the last 5 minutes
	fiveMinutesAgo := time.Now().Add(-5 * time.Minute)
	recentCount := 0
	for _, t := range times {
		if t.After(fiveMinutesAgo) {
			recentCount++
		}
	}
	return recentCount >= 10
}

// alertOnErrorSpike handles error spike alerts
func (et *ErrorTracker) alertOnErrorSpike(code errors.ErrorCode, fields map[string]interface{}) {
	// In a production system, this would send alerts to monitoring systems
	// such as PagerDuty, Slack, or email notifications
	et.log.Error("ALERT: Error spike detected", nil, map[string]interface{}{
		"alert_type":   "error_spike",
		"error_code":   code,
		"recent_count": len(et.errorTimes[code]),
		"details":      fields,
	})
}
// GetErrorStats returns error statistics
func (et *ErrorTracker) GetErrorStats() map[string]interface{} {
	et.mu.RLock()
	defer et.mu.RUnlock()

	stats := map[string]interface{}{
		"total_errors":  make(map[errors.ErrorCode]int64),
		"recent_errors": make(map[errors.ErrorCode]int),
		"error_rates":   make(map[errors.ErrorCode]float64),
	}

	now := time.Now()
	oneHourAgo := now.Add(-time.Hour)

	for code, count := range et.errorCounts {
		stats["total_errors"].(map[errors.ErrorCode]int64)[code] = count

		// Count recent errors
		recentCount := 0
		if times, exists := et.errorTimes[code]; exists {
			for _, t := range times {
				if t.After(oneHourAgo) {
					recentCount++
				}
			}
		}
		stats["recent_errors"].(map[errors.ErrorCode]int)[code] = recentCount
		stats["error_rates"].(map[errors.ErrorCode]float64)[code] = float64(recentCount) / 60.0 // errors per minute over the last hour
	}
	return stats
}
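
// For illustration only (not in the original file): the map returned by
// GetErrorStats has the shape sketched below. The error code "USER_NOT_FOUND"
// is a hypothetical placeholder, not a code defined in this package.
//
//	{
//	  "total_errors":  {"USER_NOT_FOUND": 42},   // all-time count per code
//	  "recent_errors": {"USER_NOT_FOUND": 5},    // occurrences in the last hour
//	  "error_rates":   {"USER_NOT_FOUND": 0.083} // errors per minute over that hour
//	}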
// MetricsCollector collects application metrics
type MetricsCollector struct {
	log            *logger.Logger
	requestCounts  map[string]int64
	responseTimes  map[string][]time.Duration
	activeRequests int64
	mu             sync.RWMutex
}

// NewMetricsCollector creates a new metrics collector
func NewMetricsCollector() *MetricsCollector {
	return &MetricsCollector{
		log:           logger.New("metrics_collector"),
		requestCounts: make(map[string]int64),
		responseTimes: make(map[string][]time.Duration),
	}
}

// RecordRequest records a request metric
func (mc *MetricsCollector) RecordRequest(endpoint string, method string, statusCode int, duration time.Duration) {
	mc.mu.Lock()
	defer mc.mu.Unlock()

	key := method + " " + endpoint
	mc.requestCounts[key]++

	if _, exists := mc.responseTimes[key]; !exists {
		mc.responseTimes[key] = make([]time.Duration, 0)
	}
	mc.responseTimes[key] = append(mc.responseTimes[key], duration)

	// Keep only the last 1000 response times per endpoint
	if len(mc.responseTimes[key]) > 1000 {
		mc.responseTimes[key] = mc.responseTimes[key][len(mc.responseTimes[key])-1000:]
	}

	// Log slow requests
	if duration > 5*time.Second {
		mc.log.Warn("Slow request detected", map[string]interface{}{
			"endpoint":    endpoint,
			"method":      method,
			"status_code": statusCode,
			"duration_ms": duration.Milliseconds(),
		})
	}
}

// IncrementActiveRequests increments the active request counter
func (mc *MetricsCollector) IncrementActiveRequests() {
	mc.mu.Lock()
	defer mc.mu.Unlock()
	mc.activeRequests++
}

// DecrementActiveRequests decrements the active request counter
func (mc *MetricsCollector) DecrementActiveRequests() {
	mc.mu.Lock()
	defer mc.mu.Unlock()
	if mc.activeRequests > 0 {
		mc.activeRequests--
	}
}
// GetMetrics returns collected metrics
func (mc *MetricsCollector) GetMetrics() map[string]interface{} {
	mc.mu.RLock()
	defer mc.mu.RUnlock()

	metrics := map[string]interface{}{
		"active_requests":    mc.activeRequests,
		"request_counts":     make(map[string]int64),
		"avg_response_times": make(map[string]float64),
	}

	// Copy request counts
	for k, v := range mc.requestCounts {
		metrics["request_counts"].(map[string]int64)[k] = v
	}

	// Calculate average response times
	for endpoint, times := range mc.responseTimes {
		if len(times) > 0 {
			var total time.Duration
			for _, t := range times {
				total += t
			}
			avg := total / time.Duration(len(times))
			metrics["avg_response_times"].(map[string]float64)[endpoint] = float64(avg.Milliseconds())
		}
	}
	return metrics
}
// Monitor provides comprehensive application monitoring
type Monitor struct {
	ErrorTracker     *ErrorTracker
	MetricsCollector *MetricsCollector
	log              *logger.Logger
}

// NewMonitor creates a new application monitor
func NewMonitor() *Monitor {
	return &Monitor{
		ErrorTracker:     NewErrorTracker(),
		MetricsCollector: NewMetricsCollector(),
		log:              logger.New("monitor"),
	}
}

// GetHealthStatus returns comprehensive health status
func (m *Monitor) GetHealthStatus() map[string]interface{} {
	errorStats := m.ErrorTracker.GetErrorStats()
	metrics := m.MetricsCollector.GetMetrics()

	status := map[string]interface{}{
		"status":    "healthy",
		"timestamp": time.Now().UTC().Format(time.RFC3339),
		"errors":    errorStats,
		"metrics":   metrics,
	}

	// Determine overall health based on recent error volume
	if recentErrors, ok := errorStats["recent_errors"].(map[errors.ErrorCode]int); ok {
		totalRecentErrors := 0
		for _, count := range recentErrors {
			totalRecentErrors += count
		}
		if totalRecentErrors > 100 { // More than 100 errors in the last hour
			status["status"] = "degraded"
		}
		if totalRecentErrors > 500 { // More than 500 errors in the last hour
			status["status"] = "unhealthy"
		}
	}
	return status
}

// Global monitor instance
var globalMonitor = NewMonitor()

// GetGlobalMonitor returns the global monitor instance
func GetGlobalMonitor() *Monitor {
	return globalMonitor
}
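
// instrumentCall is a minimal usage sketch added for illustration; it is not
// part of the service's original API. It assumes errors.AppError exposes the
// Code and HTTPStatus fields used elsewhere in this file, and the fn callback
// shape is a hypothetical stand-in for whatever handler the caller wraps.
func instrumentCall(ctx context.Context, endpoint, method, userID string, fn func(context.Context) *errors.AppError) {
	mon := GetGlobalMonitor()

	// Track in-flight work for the duration of the call.
	mon.MetricsCollector.IncrementActiveRequests()
	defer mon.MetricsCollector.DecrementActiveRequests()

	start := time.Now()
	appErr := fn(ctx)

	// Record the outcome: errors feed the tracker, every call feeds the metrics.
	status := 200
	if appErr != nil {
		status = appErr.HTTPStatus
		mon.ErrorTracker.TrackError(ctx, appErr, userID, endpoint)
	}
	mon.MetricsCollector.RecordRequest(endpoint, method, status, time.Since(start))
}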