backend-service/internal/monitoring/monitoring.go
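// Package monitoring provides lightweight in-process observability for the
// backend service: an ErrorTracker that counts and time-windows error
// occurrences, a MetricsCollector for request counts and response times, and
// a Monitor that combines both into a health status report.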
package monitoring

import (
	"context"
	"sync"
	"time"

	"attune-heart-therapy/internal/errors"
	"attune-heart-therapy/internal/logger"
)

// ErrorTracker tracks and monitors application errors
type ErrorTracker struct {
	log           *logger.Logger
	errorCounts   map[errors.ErrorCode]int64
	errorTimes    map[errors.ErrorCode][]time.Time
	mu            sync.RWMutex
	windowSize    time.Duration
	maxWindowSize int
}

// NewErrorTracker creates a new error tracker
func NewErrorTracker() *ErrorTracker {
	return &ErrorTracker{
		log:           logger.New("error_tracker"),
		errorCounts:   make(map[errors.ErrorCode]int64),
		errorTimes:    make(map[errors.ErrorCode][]time.Time),
		windowSize:    time.Hour,
		maxWindowSize: 1000, // Keep last 1000 occurrences per error type
	}
}
// TrackError records an error occurrence
func (et *ErrorTracker) TrackError(ctx context.Context, appErr *errors.AppError, userID string, endpoint string) {
	et.mu.Lock()
	defer et.mu.Unlock()

	now := time.Now()

	// Increment error count
	et.errorCounts[appErr.Code]++

	// Add timestamp to error times
	if _, exists := et.errorTimes[appErr.Code]; !exists {
		et.errorTimes[appErr.Code] = make([]time.Time, 0)
	}
	et.errorTimes[appErr.Code] = append(et.errorTimes[appErr.Code], now)

	// Keep only recent errors within window
	et.cleanupOldErrors(appErr.Code, now)

	// Log error with tracking information
	fields := map[string]interface{}{
		"error_code":   appErr.Code,
		"error_count":  et.errorCounts[appErr.Code],
		"recent_count": len(et.errorTimes[appErr.Code]),
		"endpoint":     endpoint,
		"http_status":  appErr.HTTPStatus,
	}
	if userID != "" {
		fields["user_id"] = userID
	}
	if traceID := ctx.Value("trace_id"); traceID != nil {
		fields["trace_id"] = traceID
	}

	// Add error fields if available
	if appErr.Fields != nil {
		for k, v := range appErr.Fields {
			fields["error_"+k] = v
		}
	}

	// Check for error rate spikes
	if et.isErrorSpike(appErr.Code) {
		et.log.Error("Error rate spike detected", appErr.Cause, fields)
		et.alertOnErrorSpike(appErr.Code, fields)
	} else if appErr.HTTPStatus >= 500 {
		et.log.Error("Server error tracked", appErr.Cause, fields)
	} else {
		et.log.Warn("Client error tracked", fields)
	}
}
// cleanupOldErrors removes errors outside the time window
func (et *ErrorTracker) cleanupOldErrors(code errors.ErrorCode, now time.Time) {
	cutoff := now.Add(-et.windowSize)
	times := et.errorTimes[code]

	// Find the first index within the window. Defaulting start to len(times)
	// ensures that if every entry is older than the cutoff, all of them are
	// dropped rather than kept.
	start := len(times)
	for i, t := range times {
		if t.After(cutoff) {
			start = i
			break
		}
	}

	// Keep only recent errors
	et.errorTimes[code] = times[start:]

	// Limit to max window size
	if len(et.errorTimes[code]) > et.maxWindowSize {
		et.errorTimes[code] = et.errorTimes[code][len(et.errorTimes[code])-et.maxWindowSize:]
	}
}
// isErrorSpike checks if there's an unusual spike in error rate
func (et *ErrorTracker) isErrorSpike(code errors.ErrorCode) bool {
	times := et.errorTimes[code]
	if len(times) < 10 {
		return false
	}

	// Check if there are 10 or more errors in the last 5 minutes
	fiveMinutesAgo := time.Now().Add(-5 * time.Minute)
	recentCount := 0
	for _, t := range times {
		if t.After(fiveMinutesAgo) {
			recentCount++
		}
	}
	return recentCount >= 10
}

// alertOnErrorSpike handles error spike alerts
func (et *ErrorTracker) alertOnErrorSpike(code errors.ErrorCode, fields map[string]interface{}) {
	// In a production system, this would send alerts to monitoring systems
	// such as PagerDuty, Slack, or email notifications
	et.log.Error("ALERT: Error spike detected", nil, map[string]interface{}{
		"alert_type":   "error_spike",
		"error_code":   code,
		"recent_count": len(et.errorTimes[code]),
		"details":      fields,
	})
}
// GetErrorStats returns error statistics
func (et *ErrorTracker) GetErrorStats() map[string]interface{} {
	et.mu.RLock()
	defer et.mu.RUnlock()

	stats := map[string]interface{}{
		"total_errors":  make(map[errors.ErrorCode]int64),
		"recent_errors": make(map[errors.ErrorCode]int),
		"error_rates":   make(map[errors.ErrorCode]float64),
	}

	now := time.Now()
	oneHourAgo := now.Add(-time.Hour)

	for code, count := range et.errorCounts {
		stats["total_errors"].(map[errors.ErrorCode]int64)[code] = count

		// Count recent errors
		recentCount := 0
		if times, exists := et.errorTimes[code]; exists {
			for _, t := range times {
				if t.After(oneHourAgo) {
					recentCount++
				}
			}
		}
		stats["recent_errors"].(map[errors.ErrorCode]int)[code] = recentCount
		stats["error_rates"].(map[errors.ErrorCode]float64)[code] = float64(recentCount) / 60.0 // errors per minute over the last hour
	}
	return stats
}
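
// For illustration only (not in the original file): the map returned by
// GetErrorStats has the shape sketched below. The error code "USER_NOT_FOUND"
// is a hypothetical placeholder, not a code defined in this package.
//
//	{
//	  "total_errors":  {"USER_NOT_FOUND": 42},   // all-time count per code
//	  "recent_errors": {"USER_NOT_FOUND": 5},    // occurrences in the last hour
//	  "error_rates":   {"USER_NOT_FOUND": 0.083} // errors per minute over that hour
//	}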
// MetricsCollector collects application metrics
type MetricsCollector struct {
	log            *logger.Logger
	requestCounts  map[string]int64
	responseTimes  map[string][]time.Duration
	activeRequests int64
	mu             sync.RWMutex
}

// NewMetricsCollector creates a new metrics collector
func NewMetricsCollector() *MetricsCollector {
	return &MetricsCollector{
		log:           logger.New("metrics_collector"),
		requestCounts: make(map[string]int64),
		responseTimes: make(map[string][]time.Duration),
	}
}

// RecordRequest records a request metric
func (mc *MetricsCollector) RecordRequest(endpoint string, method string, statusCode int, duration time.Duration) {
	mc.mu.Lock()
	defer mc.mu.Unlock()

	key := method + " " + endpoint
	mc.requestCounts[key]++

	if _, exists := mc.responseTimes[key]; !exists {
		mc.responseTimes[key] = make([]time.Duration, 0)
	}
	mc.responseTimes[key] = append(mc.responseTimes[key], duration)

	// Keep only the last 1000 response times per endpoint
	if len(mc.responseTimes[key]) > 1000 {
		mc.responseTimes[key] = mc.responseTimes[key][len(mc.responseTimes[key])-1000:]
	}

	// Log slow requests
	if duration > 5*time.Second {
		mc.log.Warn("Slow request detected", map[string]interface{}{
			"endpoint":    endpoint,
			"method":      method,
			"status_code": statusCode,
			"duration_ms": duration.Milliseconds(),
		})
	}
}

// IncrementActiveRequests increments the active request counter
func (mc *MetricsCollector) IncrementActiveRequests() {
	mc.mu.Lock()
	defer mc.mu.Unlock()
	mc.activeRequests++
}

// DecrementActiveRequests decrements the active request counter
func (mc *MetricsCollector) DecrementActiveRequests() {
	mc.mu.Lock()
	defer mc.mu.Unlock()
	if mc.activeRequests > 0 {
		mc.activeRequests--
	}
}
// GetMetrics returns collected metrics
func (mc *MetricsCollector) GetMetrics() map[string]interface{} {
	mc.mu.RLock()
	defer mc.mu.RUnlock()

	metrics := map[string]interface{}{
		"active_requests":    mc.activeRequests,
		"request_counts":     make(map[string]int64),
		"avg_response_times": make(map[string]float64),
	}

	// Copy request counts
	for k, v := range mc.requestCounts {
		metrics["request_counts"].(map[string]int64)[k] = v
	}

	// Calculate average response times
	for endpoint, times := range mc.responseTimes {
		if len(times) > 0 {
			var total time.Duration
			for _, t := range times {
				total += t
			}
			avg := total / time.Duration(len(times))
			metrics["avg_response_times"].(map[string]float64)[endpoint] = float64(avg.Milliseconds())
		}
	}
	return metrics
}
// Monitor provides comprehensive application monitoring
type Monitor struct {
	ErrorTracker     *ErrorTracker
	MetricsCollector *MetricsCollector
	log              *logger.Logger
}

// NewMonitor creates a new application monitor
func NewMonitor() *Monitor {
	return &Monitor{
		ErrorTracker:     NewErrorTracker(),
		MetricsCollector: NewMetricsCollector(),
		log:              logger.New("monitor"),
	}
}

// GetHealthStatus returns comprehensive health status
func (m *Monitor) GetHealthStatus() map[string]interface{} {
	errorStats := m.ErrorTracker.GetErrorStats()
	metrics := m.MetricsCollector.GetMetrics()

	status := map[string]interface{}{
		"status":    "healthy",
		"timestamp": time.Now().UTC().Format(time.RFC3339),
		"errors":    errorStats,
		"metrics":   metrics,
	}

	// Determine overall health based on recent error volume
	if recentErrors, ok := errorStats["recent_errors"].(map[errors.ErrorCode]int); ok {
		totalRecentErrors := 0
		for _, count := range recentErrors {
			totalRecentErrors += count
		}
		if totalRecentErrors > 100 { // More than 100 errors in the last hour
			status["status"] = "degraded"
		}
		if totalRecentErrors > 500 { // More than 500 errors in the last hour
			status["status"] = "unhealthy"
		}
	}
	return status
}

// Global monitor instance
var globalMonitor = NewMonitor()

// GetGlobalMonitor returns the global monitor instance
func GetGlobalMonitor() *Monitor {
	return globalMonitor
}
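
// instrumentCall is a minimal usage sketch added for illustration; it is not
// part of the service's original API. It assumes errors.AppError exposes the
// Code and HTTPStatus fields used elsewhere in this file, and the fn callback
// shape is a hypothetical stand-in for whatever handler the caller wraps.
func instrumentCall(ctx context.Context, endpoint, method, userID string, fn func(context.Context) *errors.AppError) {
	mon := GetGlobalMonitor()

	// Track in-flight work for the duration of the call.
	mon.MetricsCollector.IncrementActiveRequests()
	defer mon.MetricsCollector.DecrementActiveRequests()

	start := time.Now()
	appErr := fn(ctx)

	// Record the outcome: errors feed the tracker, every call feeds the metrics.
	status := 200
	if appErr != nil {
		status = appErr.HTTPStatus
		mon.ErrorTracker.TrackError(ctx, appErr, userID, endpoint)
	}
	mon.MetricsCollector.RecordRequest(endpoint, method, status, time.Since(start))
}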