diff options
| author | David Schlachter <t480-debian-git@schlachter.ca> | 2026-01-07 18:02:48 -0500 |
|---|---|---|
| committer | David Schlachter <t480-debian-git@schlachter.ca> | 2026-01-07 18:02:48 -0500 |
| commit | 27a7d5297e4d861809bf169048f72d196e587909 (patch) | |
| tree | ce80af284f78149dbc50ecd7c85a00fd55d5f7b2 /main.go | |
Add basic parsing
Diffstat (limited to 'main.go')
| -rw-r--r-- | main.go | 166 |
1 files changed, 166 insertions, 0 deletions
@@ -0,0 +1,166 @@ +// This program looks up words fromm Wiktionary, and creates Anki flashcards +// from them. +package main + +import ( + "bufio" + "database/sql" + "encoding/json" + "fmt" + "html/template" + "log" + "os" + "strings" + + _ "github.com/mattn/go-sqlite3" +) + +const rawDictionary = "/home/david/work/french-wiktionary-flashcards/raw-wiktextract-data.jsonl" +const dictionary = "/home/david/work/french-wiktionary-flashcards/raw-wiktextract-data.sqlite3" + +func main() { + db, err := sql.Open("sqlite3", dictionary) + if err != nil { + log.Fatalf("opening DB (you probably need to touch '%s'): %s", dictionary, err) + } + defer db.Close() + + _, err = db.Exec("create table IF NOT EXISTS words (word text not null primary key, definition text);") + if err != nil { + log.Fatalf("creating table: %s", err) + } + + row := db.QueryRow(`SELECT count(*) as count from words`) + var count int + err = row.Scan(&count) + if err != nil { + log.Fatalf("counting rows: %s", err) + } + if count == 0 { + if err = readDictionary(); err != nil { + log.Fatalf("failed to prepare dictionary: %s", err) + } + } + +} + +type rawDictionaryEntry struct { + Word string `json:"word"` + LangCode string `json:"lang_code"` + POS string `json:"pos_title"` + Etymology []string `json:"etymology_texts"` + Senses []sense `json:"senses"` + Sounds []sound `json:"sounds"` + Tags []string `json:"tags"` +} + +type sense struct { + Glosses []string `json:"glosses"` + Examples []example `json:"examples"` +} + +type example struct { + Text string `json:"text"` +} + +type sound struct { + IPA string `json:"ipa"` +} + +type templateReadyDictionaryEntry struct { + Word string + POS string + Etymology string + Senses []SenseForDictionaryEntry + Sound string + Gender string +} + +type SenseForDictionaryEntry struct { + Sense string + Example string +} + +func readDictionary() error { + log.Printf("preparing list of dictionary words...") + + // Set up the template + tmpl, err := template.New("entry").Parse(`<p>{{ .Word }} {{ .Sound }} <i>{{ .POS }} {{ .Gender }}</i></p> + <ol> + {{ range .Senses}} + <li>{{ .Sense }}<br><ul><li><i>{{ .Example }}</i></li></ul></li> + {{ end }} + </ol>`) + if err != nil { + panic(err) + } + + file, err := os.Open(rawDictionary) + if err != nil { + return fmt.Errorf("opening: %w", err) + } + defer file.Close() + + var line int + scanner := bufio.NewScanner(file) + + maxCapacity := 1_000_000 + buf := make([]byte, maxCapacity) + scanner.Buffer(buf, maxCapacity) + + for scanner.Scan() { + line++ + + if line%10000 == 0 && line > 1 { + log.Printf("processed %d lines", line) + } + + var result rawDictionaryEntry + json.Unmarshal([]byte(scanner.Text()), &result) + if result.LangCode != "fr" { + continue + } + + // Create the definition text. + entry := templateReadyDictionaryEntry{ + Word: result.Word, + POS: strings.ToLower(result.POS), + } + if len(result.Etymology) > 0 { + entry.Etymology = result.Etymology[0] + } + if len(result.Sounds) > 0 { + entry.Sound = result.Sounds[0].IPA + } + for _, r := range result.Tags { + var genders []string + if r == "masculine" || r == "feminine" { + genders = append(genders, r) + } + entry.Gender = strings.Join(genders, " / ") + } + for _, s := range result.Senses { + var example string + if len(s.Examples) > 0 { + example = s.Examples[0].Text + } + sense := strings.Join(s.Glosses, "; ") + entry.Senses = append(entry.Senses, SenseForDictionaryEntry{Sense: sense, Example: example}) + } + + out := strings.Builder{} + err := tmpl.Execute(&out, entry) + if err != nil { + return fmt.Errorf("failed to render: %w", err) + } + fmt.Printf("%s", out.String()) + + } + if err := scanner.Err(); err != nil { + return fmt.Errorf("scanning: %w", err) + } + + log.Printf("prepared %d dictionary entries", line) + + return nil +} |
