package monitoring

import (
	"context"
	"sync"
	"time"

	"attune-heart-therapy/internal/errors"
	"attune-heart-therapy/internal/logger"
)

// ErrorTracker tracks and monitors application errors
type ErrorTracker struct {
	log           *logger.Logger
	errorCounts   map[errors.ErrorCode]int64
	errorTimes    map[errors.ErrorCode][]time.Time
	mu            sync.RWMutex
	windowSize    time.Duration
	maxWindowSize int
}

// NewErrorTracker creates a new error tracker
func NewErrorTracker() *ErrorTracker {
	return &ErrorTracker{
		log:           logger.New("error_tracker"),
		errorCounts:   make(map[errors.ErrorCode]int64),
		errorTimes:    make(map[errors.ErrorCode][]time.Time),
		windowSize:    time.Hour,
		maxWindowSize: 1000, // Keep last 1000 occurrences per error type
	}
}

// TrackError records an error occurrence
func (et *ErrorTracker) TrackError(ctx context.Context, appErr *errors.AppError, userID string, endpoint string) {
	et.mu.Lock()
	defer et.mu.Unlock()

	now := time.Now()

	// Increment error count
	et.errorCounts[appErr.Code]++

	// Add timestamp to error times
	if _, exists := et.errorTimes[appErr.Code]; !exists {
		et.errorTimes[appErr.Code] = make([]time.Time, 0)
	}
	et.errorTimes[appErr.Code] = append(et.errorTimes[appErr.Code], now)

	// Keep only recent errors within window
	et.cleanupOldErrors(appErr.Code, now)

	// Log error with tracking information
	fields := map[string]interface{}{
		"error_code":   appErr.Code,
		"error_count":  et.errorCounts[appErr.Code],
		"recent_count": len(et.errorTimes[appErr.Code]),
		"endpoint":     endpoint,
		"http_status":  appErr.HTTPStatus,
	}

	if userID != "" {
		fields["user_id"] = userID
	}

	if traceID := ctx.Value("trace_id"); traceID != nil {
		fields["trace_id"] = traceID
	}

	// Add error fields if available
	if appErr.Fields != nil {
		for k, v := range appErr.Fields {
			fields["error_"+k] = v
		}
	}

	// Check for error rate spikes
	if et.isErrorSpike(appErr.Code) {
		et.log.Error("Error rate spike detected", appErr.Cause, fields)
		et.alertOnErrorSpike(appErr.Code, fields)
	} else if appErr.HTTPStatus >= 500 {
		et.log.Error("Server error tracked", appErr.Cause, fields)
	} else {
		et.log.Warn("Client error tracked", fields)
	}
}

// cleanupOldErrors removes errors outside the time window
func (et *ErrorTracker) cleanupOldErrors(code errors.ErrorCode, now time.Time) {
	cutoff := now.Add(-et.windowSize)
	times := et.errorTimes[code]

	// Find the first timestamp within the window; if none are recent,
	// drop everything
	start := len(times)
	for i, t := range times {
		if t.After(cutoff) {
			start = i
			break
		}
	}

	// Keep only recent errors
	et.errorTimes[code] = times[start:]

	// Limit to max window size
	if len(et.errorTimes[code]) > et.maxWindowSize {
		et.errorTimes[code] = et.errorTimes[code][len(et.errorTimes[code])-et.maxWindowSize:]
	}
}

// isErrorSpike checks if there's an unusual spike in error rate
func (et *ErrorTracker) isErrorSpike(code errors.ErrorCode) bool {
	times := et.errorTimes[code]
	if len(times) < 10 {
		return false
	}

	// Treat at least 10 errors in the last 5 minutes as a spike
	fiveMinutesAgo := time.Now().Add(-5 * time.Minute)
	recentCount := 0
	for _, t := range times {
		if t.After(fiveMinutesAgo) {
			recentCount++
		}
	}

	return recentCount >= 10
}

// alertOnErrorSpike handles error spike alerts
func (et *ErrorTracker) alertOnErrorSpike(code errors.ErrorCode, fields map[string]interface{}) {
	// In a production system, this would send alerts to monitoring systems
	// like PagerDuty, Slack, or email notifications
	et.log.Error("ALERT: Error spike detected", nil, map[string]interface{}{
		"alert_type":   "error_spike",
		"error_code":   code,
		"recent_count": len(et.errorTimes[code]),
		"details":      fields,
	})
}
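// trackHandlerError is an illustrative sketch, not part of the original file:
// it shows one way a handler layer might feed failures into the ErrorTracker.
// The errors.AppError fields relied on here (Code, HTTPStatus, Cause, Fields)
// are taken from their usage in TrackError above; the wrapper itself and its
// signature are hypothetical.
func trackHandlerError(ctx context.Context, et *ErrorTracker, err error, userID, endpoint string) {
	if appErr, ok := err.(*errors.AppError); ok {
		et.TrackError(ctx, appErr, userID, endpoint)
	}
}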
// GetErrorStats returns error statistics
func (et *ErrorTracker) GetErrorStats() map[string]interface{} {
	et.mu.RLock()
	defer et.mu.RUnlock()

	stats := map[string]interface{}{
		"total_errors":  make(map[errors.ErrorCode]int64),
		"recent_errors": make(map[errors.ErrorCode]int),
		"error_rates":   make(map[errors.ErrorCode]float64),
	}

	now := time.Now()
	oneHourAgo := now.Add(-time.Hour)

	for code, count := range et.errorCounts {
		stats["total_errors"].(map[errors.ErrorCode]int64)[code] = count

		// Count recent errors
		recentCount := 0
		if times, exists := et.errorTimes[code]; exists {
			for _, t := range times {
				if t.After(oneHourAgo) {
					recentCount++
				}
			}
		}
		stats["recent_errors"].(map[errors.ErrorCode]int)[code] = recentCount
		stats["error_rates"].(map[errors.ErrorCode]float64)[code] = float64(recentCount) / 60.0 // average errors per minute over the last hour
	}

	return stats
}

// MetricsCollector collects application metrics
type MetricsCollector struct {
	log            *logger.Logger
	requestCounts  map[string]int64
	responseTimes  map[string][]time.Duration
	activeRequests int64
	mu             sync.RWMutex
}

// NewMetricsCollector creates a new metrics collector
func NewMetricsCollector() *MetricsCollector {
	return &MetricsCollector{
		log:           logger.New("metrics_collector"),
		requestCounts: make(map[string]int64),
		responseTimes: make(map[string][]time.Duration),
	}
}

// RecordRequest records a request metric
func (mc *MetricsCollector) RecordRequest(endpoint string, method string, statusCode int, duration time.Duration) {
	mc.mu.Lock()
	defer mc.mu.Unlock()

	key := method + " " + endpoint
	mc.requestCounts[key]++

	if _, exists := mc.responseTimes[key]; !exists {
		mc.responseTimes[key] = make([]time.Duration, 0)
	}
	mc.responseTimes[key] = append(mc.responseTimes[key], duration)

	// Keep only last 1000 response times per endpoint
	if len(mc.responseTimes[key]) > 1000 {
		mc.responseTimes[key] = mc.responseTimes[key][len(mc.responseTimes[key])-1000:]
	}

	// Log slow requests
	if duration > 5*time.Second {
		mc.log.Warn("Slow request detected", map[string]interface{}{
			"endpoint":    endpoint,
			"method":      method,
			"status_code": statusCode,
			"duration_ms": duration.Milliseconds(),
		})
	}
}

// IncrementActiveRequests increments the active request counter
func (mc *MetricsCollector) IncrementActiveRequests() {
	mc.mu.Lock()
	defer mc.mu.Unlock()
	mc.activeRequests++
}

// DecrementActiveRequests decrements the active request counter
func (mc *MetricsCollector) DecrementActiveRequests() {
	mc.mu.Lock()
	defer mc.mu.Unlock()
	if mc.activeRequests > 0 {
		mc.activeRequests--
	}
}

// GetMetrics returns collected metrics
func (mc *MetricsCollector) GetMetrics() map[string]interface{} {
	mc.mu.RLock()
	defer mc.mu.RUnlock()

	metrics := map[string]interface{}{
		"active_requests":    mc.activeRequests,
		"request_counts":     make(map[string]int64),
		"avg_response_times": make(map[string]float64),
	}

	// Copy request counts
	for k, v := range mc.requestCounts {
		metrics["request_counts"].(map[string]int64)[k] = v
	}

	// Calculate average response times
	for endpoint, times := range mc.responseTimes {
		if len(times) > 0 {
			var total time.Duration
			for _, t := range times {
				total += t
			}
			avg := total / time.Duration(len(times))
			metrics["avg_response_times"].(map[string]float64)[endpoint] = float64(avg.Milliseconds())
		}
	}

	return metrics
}
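// timeRequest is an illustrative sketch (hypothetical helper, not part of the
// original file) of a deferred-timing pattern around RecordRequest: the caller
// grabs a callback before handling a request and invokes it with the final
// status code when done.
//
//	done := timeRequest(mc, "GET", "/api/profile")
//	// ... handle the request ...
//	done(200)
func timeRequest(mc *MetricsCollector, method, endpoint string) func(statusCode int) {
	mc.IncrementActiveRequests()
	start := time.Now()
	return func(statusCode int) {
		mc.DecrementActiveRequests()
		mc.RecordRequest(endpoint, method, statusCode, time.Since(start))
	}
}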
// Monitor provides comprehensive application monitoring
type Monitor struct {
	ErrorTracker     *ErrorTracker
	MetricsCollector *MetricsCollector
	log              *logger.Logger
}

// NewMonitor creates a new application monitor
func NewMonitor() *Monitor {
	return &Monitor{
		ErrorTracker:     NewErrorTracker(),
		MetricsCollector: NewMetricsCollector(),
		log:              logger.New("monitor"),
	}
}

// GetHealthStatus returns comprehensive health status
func (m *Monitor) GetHealthStatus() map[string]interface{} {
	errorStats := m.ErrorTracker.GetErrorStats()
	metrics := m.MetricsCollector.GetMetrics()

	status := map[string]interface{}{
		"status":    "healthy",
		"timestamp": time.Now().UTC().Format(time.RFC3339),
		"errors":    errorStats,
		"metrics":   metrics,
	}

	// Determine overall health based on error rates
	if recentErrors, ok := errorStats["recent_errors"].(map[errors.ErrorCode]int); ok {
		totalRecentErrors := 0
		for _, count := range recentErrors {
			totalRecentErrors += count
		}

		if totalRecentErrors > 100 { // More than 100 errors in the last hour
			status["status"] = "degraded"
		}
		if totalRecentErrors > 500 { // More than 500 errors in the last hour
			status["status"] = "unhealthy"
		}
	}

	return status
}

// Global monitor instance
var globalMonitor = NewMonitor()

// GetGlobalMonitor returns the global monitor instance
func GetGlobalMonitor() *Monitor {
	return globalMonitor
}
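// logHealthSnapshot is an illustrative sketch, not part of the original file:
// it shows how a caller might poll the global monitor and surface a non-healthy
// status through the package's logger. The Warn call mirrors the
// (message, fields) shape used elsewhere in this file; the helper itself is
// hypothetical.
func logHealthSnapshot() {
	status := GetGlobalMonitor().GetHealthStatus()
	if s, ok := status["status"].(string); ok && s != "healthy" {
		globalMonitor.log.Warn("Service health check not healthy", status)
	}
}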