From 27a7d5297e4d861809bf169048f72d196e587909 Mon Sep 17 00:00:00 2001 From: David Schlachter Date: Wed, 7 Jan 2026 18:02:48 -0500 Subject: Add basic parsing --- go.mod | 5 ++ go.sum | 2 + main.go | 166 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 173 insertions(+) create mode 100644 go.mod create mode 100644 go.sum create mode 100644 main.go diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..310a6aa --- /dev/null +++ b/go.mod @@ -0,0 +1,5 @@ +module davidschlachter.com/french-wiktionary-flashcards + +go 1.24.1 + +require github.com/mattn/go-sqlite3 v1.14.33 diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..3de9741 --- /dev/null +++ b/go.sum @@ -0,0 +1,2 @@ +github.com/mattn/go-sqlite3 v1.14.33 h1:A5blZ5ulQo2AtayQ9/limgHEkFreKj1Dv226a1K73s0= +github.com/mattn/go-sqlite3 v1.14.33/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= diff --git a/main.go b/main.go new file mode 100644 index 0000000..37516cb --- /dev/null +++ b/main.go @@ -0,0 +1,166 @@ +// This program looks up words from Wiktionary, and creates Anki flashcards +// from them. 
+package main + +import ( + "bufio" + "database/sql" + "encoding/json" + "fmt" + "html/template" + "log" + "os" + "strings" + + _ "github.com/mattn/go-sqlite3" +) + +const rawDictionary = "/home/david/work/french-wiktionary-flashcards/raw-wiktextract-data.jsonl" +const dictionary = "/home/david/work/french-wiktionary-flashcards/raw-wiktextract-data.sqlite3" + +func main() { + db, err := sql.Open("sqlite3", dictionary) + if err != nil { + log.Fatalf("opening DB (you probably need to touch '%s'): %s", dictionary, err) + } + defer db.Close() + + _, err = db.Exec("create table IF NOT EXISTS words (word text not null primary key, definition text);") + if err != nil { + log.Fatalf("creating table: %s", err) + } + + row := db.QueryRow(`SELECT count(*) as count from words`) + var count int + err = row.Scan(&count) + if err != nil { + log.Fatalf("counting rows: %s", err) + } + if count == 0 { + if err = readDictionary(); err != nil { + log.Fatalf("failed to prepare dictionary: %s", err) + } + } + +} + +type rawDictionaryEntry struct { + Word string `json:"word"` + LangCode string `json:"lang_code"` + POS string `json:"pos_title"` + Etymology []string `json:"etymology_texts"` + Senses []sense `json:"senses"` + Sounds []sound `json:"sounds"` + Tags []string `json:"tags"` +} + +type sense struct { + Glosses []string `json:"glosses"` + Examples []example `json:"examples"` +} + +type example struct { + Text string `json:"text"` +} + +type sound struct { + IPA string `json:"ipa"` +} + +type templateReadyDictionaryEntry struct { + Word string + POS string + Etymology string + Senses []SenseForDictionaryEntry + Sound string + Gender string +} + +type SenseForDictionaryEntry struct { + Sense string + Example string +} + +func readDictionary() error { + log.Printf("preparing list of dictionary words...") + + // Set up the template + tmpl, err := template.New("entry").Parse(`

{{ .Word }} {{ .Sound }} {{ .POS }} {{ .Gender }}

+
    + {{ range .Senses}} +
  1. {{ .Sense }}
    • {{ .Example }}
  2. + {{ end }} +
`) + if err != nil { + panic(err) + } + + file, err := os.Open(rawDictionary) + if err != nil { + return fmt.Errorf("opening: %w", err) + } + defer file.Close() + + var line int + scanner := bufio.NewScanner(file) + + maxCapacity := 1_000_000 + buf := make([]byte, maxCapacity) + scanner.Buffer(buf, maxCapacity) + + for scanner.Scan() { + line++ + + if line%10000 == 0 && line > 1 { + log.Printf("processed %d lines", line) + } + + var result rawDictionaryEntry + json.Unmarshal([]byte(scanner.Text()), &result) + if result.LangCode != "fr" { + continue + } + + // Create the definition text. + entry := templateReadyDictionaryEntry{ + Word: result.Word, + POS: strings.ToLower(result.POS), + } + if len(result.Etymology) > 0 { + entry.Etymology = result.Etymology[0] + } + if len(result.Sounds) > 0 { + entry.Sound = result.Sounds[0].IPA + } + for _, r := range result.Tags { + var genders []string + if r == "masculine" || r == "feminine" { + genders = append(genders, r) + } + entry.Gender = strings.Join(genders, " / ") + } + for _, s := range result.Senses { + var example string + if len(s.Examples) > 0 { + example = s.Examples[0].Text + } + sense := strings.Join(s.Glosses, "; ") + entry.Senses = append(entry.Senses, SenseForDictionaryEntry{Sense: sense, Example: example}) + } + + out := strings.Builder{} + err := tmpl.Execute(&out, entry) + if err != nil { + return fmt.Errorf("failed to render: %w", err) + } + fmt.Printf("%s", out.String()) + + } + if err := scanner.Err(); err != nil { + return fmt.Errorf("scanning: %w", err) + } + + log.Printf("prepared %d dictionary entries", line) + + return nil +} -- cgit v1.2.3