logs go brrrrr

- fix deadlock in waitgroups
- batch inserts
- fix request_time mapping
This commit is contained in:
Neil Hanlon 2025-01-04 17:56:51 -05:00
parent 649b754be6
commit 77c266c43e
Signed by: neil
GPG key ID: 705BC21EC3C70F34
4 changed files with 243 additions and 122 deletions

2
.gitignore vendored
View file

@ -1,3 +1,3 @@
fastly_log_processor_errors.log process_fastly_logs_errors.log
logs.db logs.db
logs.db-journal logs.db-journal

View file

@ -29,6 +29,7 @@ func CreateLogTable() error {
service_id TEXT, service_id TEXT,
client_ip TEXT, client_ip TEXT,
request_method TEXT, request_method TEXT,
request_time DATETIME,
request_url TEXT, request_url TEXT,
protocol TEXT, protocol TEXT,
response_status INTEGER, response_status INTEGER,

358
main.go
View file

@ -2,187 +2,305 @@ package main
import ( import (
"bufio" "bufio"
"context"
"fmt" "fmt"
"io"
"log" "log"
"os" "os"
"path/filepath" "path/filepath"
"sync" "sync"
"io"
"context"
"github.com/urfave/cli/v3" "github.com/urfave/cli/v3"
"git.resf.org/infrastructure/process-fastly-logs/db" "git.resf.org/infrastructure/process-fastly-logs/db"
"git.resf.org/infrastructure/process-fastly-logs/parser"
"git.resf.org/infrastructure/process-fastly-logs/models" "git.resf.org/infrastructure/process-fastly-logs/models"
"git.resf.org/infrastructure/process-fastly-logs/parser"
) )
func init() { func init() {
logFile, err := os.OpenFile("process_fastly_logs_errors.log", os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) logFile, err := os.OpenFile("process_fastly_logs_errors.log", os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666)
if err != nil { if err != nil {
log.Println("Failed to open error log file:", err) log.Println("Failed to open error log file:", err)
os.Exit(1) os.Exit(1)
} }
log.SetOutput(io.MultiWriter(os.Stdout, logFile)) log.SetOutput(io.MultiWriter(os.Stdout, logFile))
log.SetFlags(log.Ldate | log.Ltime | log.Lshortfile) log.SetFlags(log.Ldate | log.Ltime | log.Lshortfile)
} }
func main() { func main() {
cwd, err := os.Getwd() cwd, err := os.Getwd()
if err != nil { if err != nil {
log.Println(err) log.Println(err)
} }
cmd := &cli.Command{ cmd := &cli.Command{
Flags: []cli.Flag{ Flags: []cli.Flag{
&cli.StringFlag{ &cli.StringFlag{
Name: "logDir", Name: "logDir",
Value: cwd, Value: cwd,
Usage: "directory containing logs to process", Usage: "directory containing logs to process",
}, },
&cli.StringFlag{ &cli.StringFlag{
Name: "outDir", Name: "outDir",
Value: cwd, Value: cwd,
Usage: "directory to output database and program logs", Usage: "directory to output database and program logs",
}, },
}, },
Action: fastlyLogProcessor, Action: fastlyLogProcessor,
} }
if err := cmd.Run(context.Background(), os.Args); err != nil { if err := cmd.Run(context.Background(), os.Args); err != nil {
log.Fatal(err) log.Fatal(err)
} }
} }
// TODO: // TODO:
// - Anonymize requests // - Anonymize requests
// - Spit out in format Brian can use // - Spit out in format Brian can use
// - Profile? lol // - Profile? lol
// - More tests // - More tests
func fastlyLogProcessor(ctx context.Context, cmd *cli.Command) error { func fastlyLogProcessor(ctx context.Context, cmd *cli.Command) error {
if cmd.NArg() < 1 { if cmd.NArg() < 1 {
return cli.Exit("Must pass at least one directory to process logs of", 1) return cli.Exit("Must pass at least one directory to process logs of", 1)
} }
month := cmd.Args().Get(0) month := cmd.Args().Get(0)
logDir := cmd.String("logDir") logDir := cmd.String("logDir")
outDir := cmd.String("outDir") outDir := cmd.String("outDir")
// Initialize the database // Initialize the database
err := db.InitDB(fmt.Sprintf(filepath.Join(outDir, "logs.db"))) err := db.InitDB(fmt.Sprintf(filepath.Join(outDir, "logs.db")))
if err != nil { if err != nil {
log.Fatal(err) log.Fatal(err)
} }
defer db.DB.Close() defer db.DB.Close()
// Create logs table if not exists // Create logs table if not exists
err = db.CreateLogTable() err = db.CreateLogTable()
if err != nil { if err != nil {
log.Fatal(err) log.Fatal(err)
} }
walkLogDir := fmt.Sprintf(filepath.Join(logDir, month)) walkLogDir := fmt.Sprintf(filepath.Join(logDir, month))
// Create a WaitGroup // Create a WaitGroup
var wg sync.WaitGroup var wg sync.WaitGroup
// Define a semaphore channel to limit concurrency // Define a semaphore channel to limit concurrency
maxGoroutines := 10 // Adjust based on your system capabilities maxGoroutines := 10 // Adjust based on your system capabilities
semaphore := make(chan struct{}, maxGoroutines) semaphore := make(chan struct{}, maxGoroutines)
// Walk through the directory and process each log file concurrently // Walk through the directory and process each log file concurrently
err = filepath.Walk(walkLogDir, func(path string, info os.FileInfo, err error) error { err = filepath.Walk(walkLogDir, func(path string, info os.FileInfo, err error) error {
if err != nil { if err != nil {
return err return err
} }
if !info.IsDir() && filepath.Ext(path) == ".log" { if !info.IsDir() && filepath.Ext(path) == ".log" {
wg.Add(1) wg.Add(1)
// Acquire a token // Acquire a token
semaphore <- struct{}{} semaphore <- struct{}{}
go func(p string) { go func(p string) {
defer wg.Done() defer wg.Done()
defer func() { <-semaphore }() // Release the token defer func() { <-semaphore }() // Release the token
err := processLogFile(p) err := processLogFile(p)
if err != nil { if err != nil {
log.Printf("Error processing file %s: %v\n", p, err) log.Printf("Error processing file %s: %v\n", p, err)
} }
}(path) }(path)
} }
return nil return nil
}) })
if err != nil { if err != nil {
return err return err
} }
// Wait for all Goroutines to finish // Wait for all Goroutines to finish
wg.Wait() wg.Wait()
return nil return nil
} }
func processLogFile(filePath string) error { func processLogFile(filePath string) error {
file, err := os.Open(filePath) file, err := os.Open(filePath)
if err != nil { if err != nil {
return err return err
} }
defer file.Close() defer file.Close()
// Create channels // Create channels
linesChan := make(chan string, 100) linesChan := make(chan string, 1000)
entriesChan := make(chan *models.LogEntry, 100) entriesChan := make(chan *models.LogEntry, 1000)
errorsChan := make(chan error, 100) errorsChan := make(chan error, 1000)
var wg sync.WaitGroup var wgParsers sync.WaitGroup
// Start Goroutines to parse lines // Start Goroutines to parse lines
numParsers := 5 // Adjust as needed numParsers := 100 // Adjust as needed
for i := 0; i < numParsers; i++ { for i := 0; i < numParsers; i++ {
wg.Add(1) wgParsers.Add(1)
go func() { go func() {
defer wg.Done() defer wgParsers.Done()
for line := range linesChan { for line := range linesChan {
entry, err := parser.ParseLogLine(line) log.Println("parsing line")
if err != nil { entry, err := parser.ParseLogLine(line)
errorsChan <- fmt.Errorf("Error parsing line: %v", err) if err != nil {
continue errorsChan <- fmt.Errorf("Error parsing line: %v", err)
} continue
entriesChan <- entry }
} entriesChan <- entry
}() }
} }()
}
// Start a Goroutine to save entries to the database // Start a Goroutine to save entries to the database
wg.Add(1) var wgSaver sync.WaitGroup
wgSaver.Add(1)
go func() { go func() {
defer wg.Done() defer wgSaver.Done()
const batchSize = 1000 // Adjust the batch size as needed
var batch []*models.LogEntry
batch = make([]*models.LogEntry, 0, batchSize)
for entry := range entriesChan { for entry := range entriesChan {
err := entry.Save() batch = append(batch, entry)
if err != nil { if len(batch) >= batchSize {
errorsChan <- fmt.Errorf("Error saving entry: %v", err) if err := saveBatch(batch); err != nil {
errorsChan <- fmt.Errorf("Error saving batch: %v", err)
}
batch = batch[:0] // Reset the batch
}
}
// Save any remaining entries in the batch
if len(batch) > 0 {
if err := saveBatch(batch); err != nil {
errorsChan <- fmt.Errorf("Error saving batch: %v", err)
} }
} }
}() }()
// Read lines from the file and send to linesChan // Error handling
scanner := bufio.NewScanner(file) var errorList []error
for scanner.Scan() { var errorMutex sync.Mutex
linesChan <- scanner.Text() var wgErrors sync.WaitGroup
}
close(linesChan)
// Wait for parsing and saving to finish wgErrors.Add(1)
wg.Wait() go func() {
close(entriesChan) defer wgErrors.Done()
close(errorsChan)
if len(errorsChan) > 0 {
log.Printf("Encountered errors while processing file %s:\n", filePath)
for err := range errorsChan { for err := range errorsChan {
log.Println("handling error")
errorMutex.Lock()
errorList = append(errorList, err)
errorMutex.Unlock()
}
}()
// Read lines from the file and send to linesChan
scanner := bufio.NewScanner(file)
for scanner.Scan() {
linesChan <- scanner.Text()
}
close(linesChan) // Close linesChan after all lines have been sent
// Wait for parsing Goroutines to finish
wgParsers.Wait()
// Close entriesChan since no more entries will be sent
close(entriesChan)
// Wait for the saving Goroutine to finish
wgSaver.Wait()
// Close errorsChan after all senders have finished
close(errorsChan)
// Wait for the error handling Goroutine to finish
wgErrors.Wait()
// Log the errors
if len(errorList) > 0 {
log.Printf("Encountered errors while processing file %s:\n", filePath)
for _, err := range errorList {
log.Println(err) log.Println(err)
} }
} }
if err := scanner.Err(); err != nil { if err := scanner.Err(); err != nil {
return err
}
return nil
}
// saveBatch saves a batch of log entries to the database using a transaction
func saveBatch(entries []*models.LogEntry) error {
tx, err := db.DB.Begin()
if err != nil {
return err
}
stmt, err := tx.Prepare(`
INSERT INTO logs(
priority,
timestamp,
cache_server,
service_id,
client_ip,
request_method,
request_time,
request_url,
protocol,
response_status,
response_body_bytes,
host,
user_agent,
datacenter,
geo_city,
geo_continent_code,
geo_region,
start_time,
elapsed_time_usec,
is_hit,
cache_result
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
`)
if err != nil {
tx.Rollback()
return err
}
defer stmt.Close()
for _, entry := range entries {
_, err := stmt.Exec(
entry.Priority,
entry.Timestamp,
entry.CacheServer,
entry.ServiceID,
entry.ClientIP,
entry.RequestMethod,
entry.RequestTime,
entry.RequestURL,
entry.Protocol,
entry.ResponseStatus,
entry.ResponseBodyBytes,
entry.Host,
entry.UserAgent,
entry.Datacenter,
entry.GeoCity,
entry.GeoContinentCode,
entry.GeoRegion,
entry.StartTime,
entry.ElapsedTimeUsec,
entry.IsHit,
entry.CacheResult,
)
if err != nil {
tx.Rollback()
return err
}
}
if err := tx.Commit(); err != nil {
tx.Rollback()
return err return err
} }
return nil return nil

View file

@ -37,6 +37,7 @@ func (entry *LogEntry) Save() error {
service_id, service_id,
client_ip, client_ip,
request_method, request_method,
request_time,
request_url, request_url,
protocol, protocol,
response_status, response_status,
@ -51,7 +52,7 @@ func (entry *LogEntry) Save() error {
elapsed_time_usec, elapsed_time_usec,
is_hit, is_hit,
cache_result cache_result
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)` ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
statement, err := db.DB.Prepare(insertSQL) statement, err := db.DB.Prepare(insertSQL)
if err != nil { if err != nil {
@ -64,6 +65,7 @@ func (entry *LogEntry) Save() error {
entry.ServiceID, entry.ServiceID,
entry.ClientIP, entry.ClientIP,
entry.RequestMethod, entry.RequestMethod,
entry.RequestTime,
entry.RequestURL, entry.RequestURL,
entry.Protocol, entry.Protocol,
entry.ResponseStatus, entry.ResponseStatus,