diff options
| -rw-r--r-- | main.go | 133 | ||||
| -rw-r--r-- | setup.go | 147 |
2 files changed, 150 insertions, 130 deletions
@@ -3,14 +3,8 @@ package main import ( - "bufio" "database/sql" - "encoding/json" - "fmt" - "html/template" "log" - "os" - "strings" _ "github.com/mattn/go-sqlite3" ) @@ -21,11 +15,11 @@ const dictionary = "/home/david/work/french-wiktionary-flashcards/raw-wiktextrac func main() { db, err := sql.Open("sqlite3", dictionary) if err != nil { - log.Fatalf("opening DB (you probably need to touch '%s'): %s", dictionary, err) + log.Fatalf("opening DB '%s': %s", dictionary, err) } defer db.Close() - _, err = db.Exec("create table IF NOT EXISTS words (word text not null primary key, definition text);") + _, err = db.Exec("create table IF NOT EXISTS words (word text not null, definition text);") if err != nil { log.Fatalf("creating table: %s", err) } @@ -37,130 +31,9 @@ func main() { log.Fatalf("counting rows: %s", err) } if count == 0 { - if err = readDictionary(); err != nil { + if err = readDictionary(db); err != nil { log.Fatalf("failed to prepare dictionary: %s", err) } } } - -type rawDictionaryEntry struct { - Word string `json:"word"` - LangCode string `json:"lang_code"` - POS string `json:"pos_title"` - Etymology []string `json:"etymology_texts"` - Senses []sense `json:"senses"` - Sounds []sound `json:"sounds"` - Tags []string `json:"tags"` -} - -type sense struct { - Glosses []string `json:"glosses"` - Examples []example `json:"examples"` -} - -type example struct { - Text string `json:"text"` -} - -type sound struct { - IPA string `json:"ipa"` -} - -type templateReadyDictionaryEntry struct { - Word string - POS string - Etymology string - Senses []SenseForDictionaryEntry - Sound string - Gender string -} - -type SenseForDictionaryEntry struct { - Sense string - Example string -} - -func readDictionary() error { - log.Printf("preparing list of dictionary words...") - - // Set up the template - tmpl, err := template.New("entry").Parse(`<p>{{ .Word }} {{ .Sound }} <i>{{ .POS }} {{ .Gender }}</i></p> - <ol> - {{ range .Senses}} - <li>{{ .Sense }}<br><ul><li><i>{{ .Example }}</i></li></ul></li> - {{ end }} - </ol>`) - if err != nil { - panic(err) - } - - file, err := os.Open(rawDictionary) - if err != nil { - return fmt.Errorf("opening: %w", err) - } - defer file.Close() - - var line int - scanner := bufio.NewScanner(file) - - maxCapacity := 1_000_000 - buf := make([]byte, maxCapacity) - scanner.Buffer(buf, maxCapacity) - - for scanner.Scan() { - line++ - - if line%10000 == 0 && line > 1 { - log.Printf("processed %d lines", line) - } - - var result rawDictionaryEntry - json.Unmarshal([]byte(scanner.Text()), &result) - if result.LangCode != "fr" { - continue - } - - // Create the definition text. - entry := templateReadyDictionaryEntry{ - Word: result.Word, - POS: strings.ToLower(result.POS), - } - if len(result.Etymology) > 0 { - entry.Etymology = result.Etymology[0] - } - if len(result.Sounds) > 0 { - entry.Sound = result.Sounds[0].IPA - } - for _, r := range result.Tags { - var genders []string - if r == "masculine" || r == "feminine" { - genders = append(genders, r) - } - entry.Gender = strings.Join(genders, " / ") - } - for _, s := range result.Senses { - var example string - if len(s.Examples) > 0 { - example = s.Examples[0].Text - } - sense := strings.Join(s.Glosses, "; ") - entry.Senses = append(entry.Senses, SenseForDictionaryEntry{Sense: sense, Example: example}) - } - - out := strings.Builder{} - err := tmpl.Execute(&out, entry) - if err != nil { - return fmt.Errorf("failed to render: %w", err) - } - fmt.Printf("%s", out.String()) - - } - if err := scanner.Err(); err != nil { - return fmt.Errorf("scanning: %w", err) - } - - log.Printf("prepared %d dictionary entries", line) - - return nil -} diff --git a/setup.go b/setup.go new file mode 100644 index 0000000..c4364de --- /dev/null +++ b/setup.go @@ -0,0 +1,147 @@ +package main + +import ( + "bufio" + "database/sql" + "encoding/json" + "fmt" + "html/template" + "log" + "os" + "strings" +) + +type rawDictionaryEntry struct { + Word string `json:"word"` + LangCode string `json:"lang_code"` + POS string `json:"pos_title"` + Etymology []string `json:"etymology_texts"` + Senses []sense `json:"senses"` + Sounds []sound `json:"sounds"` + Tags []string `json:"tags"` +} + +type sense struct { + Glosses []string `json:"glosses"` + Examples []example `json:"examples"` +} + +type example struct { + Text string `json:"text"` +} + +type sound struct { + IPA string `json:"ipa"` +} + +type templateReadyDictionaryEntry struct { + Word string + POS string + Etymology string + Senses []SenseForDictionaryEntry + Sound string + Gender string +} + +type SenseForDictionaryEntry struct { + Sense string + Example string +} + +func readDictionary(db *sql.DB) error { + log.Printf("preparing list of dictionary words...") + + // Set up the template + tmpl, err := template.New("entry").Parse(`<p>{{ .Word }} {{ .Sound }} <i>{{ .POS }} {{ .Gender }}</i></p> + <ol> + {{ range .Senses}} + <li>{{ .Sense }}<br> + {{ if .Example }} + <ul><li><i>{{ .Example }}</i></li></ul></li> + {{ end }} + {{ end }} + </ol>`) + if err != nil { + panic(err) + } + + // Set up a prepared statement + stmt, err := db.Prepare("insert into words(word, definition) values(?, ?)") + if err != nil { + log.Fatal(err) + } + defer stmt.Close() + + file, err := os.Open(rawDictionary) + if err != nil { + return fmt.Errorf("opening: %w", err) + } + defer file.Close() + + var wordsAdded int + scanner := bufio.NewScanner(file) + + maxCapacity := 1_000_000 + buf := make([]byte, maxCapacity) + scanner.Buffer(buf, maxCapacity) + + for scanner.Scan() { + var result rawDictionaryEntry + json.Unmarshal([]byte(scanner.Text()), &result) + if result.LangCode != "fr" { + continue + } + + // Create the definition text. + entry := templateReadyDictionaryEntry{ + Word: result.Word, + POS: strings.ToLower(result.POS), + } + if len(result.Etymology) > 0 { + entry.Etymology = result.Etymology[0] + } + if len(result.Sounds) > 0 { + entry.Sound = result.Sounds[0].IPA + } + for _, r := range result.Tags { + var genders []string + if r == "masculine" || r == "feminine" { + genders = append(genders, r) + } + entry.Gender = strings.Join(genders, " / ") + } + for _, s := range result.Senses { + var example string + if len(s.Examples) > 0 { + example = s.Examples[0].Text + } + sense := strings.Join(s.Glosses, "; ") + entry.Senses = append(entry.Senses, SenseForDictionaryEntry{Sense: sense, Example: example}) + } + + formattedDefinition := strings.Builder{} + err := tmpl.Execute(&formattedDefinition, entry) + if err != nil { + return fmt.Errorf("failed to render: %w", err) + } + + // Insert the entry + _, err = stmt.Exec(entry.Word, formattedDefinition.String()) + if err != nil { + return fmt.Errorf("inserting '%s': %w", entry.Word, err) + } + + wordsAdded++ + if wordsAdded%1000 == 0 && wordsAdded > 1 { + log.Printf("processed %d lines (most recent word was '%s')", wordsAdded, entry.Word) + } + + } + if err := scanner.Err(); err != nil { + return fmt.Errorf("scanning: %w", err) + } + + log.Printf("prepared %d dictionary entries", wordsAdded) + + return nil +} |
