diff options
| author | swedebugia <swedebugia@riseup.net> | 2018-12-10 22:59:09 +0100 |
|---|---|---|
| committer | swedebugia <swedebugia@riseup.net> | 2018-12-11 00:00:09 +0100 |
| commit | 39e63b9b047fd8863ddaaeb627d9e93f2b3d34d2 (patch) | |
| tree | 30d964370a7880aa1c1e403bdbfe22ecd1576e11 /wikidata | |
| parent | pre-release 0.1: (diff) | |
| download | guile-wikidata-39e63b9b047fd8863ddaaeb627d9e93f2b3d34d2.tar.gz | |
wikidata: Add rudimentary SPARQL support, split the library in two parts,
update README, improve examples.
Two known bugs remain:
1) The low-level API procedures do not honor the requested language.
2) The SPARQL LIMIT clause is not honored by the server.
Diffstat (limited to 'wikidata')
| -rw-r--r-- | wikidata/apis.scm | 216 | ||||
| -rw-r--r-- | wikidata/apis.scm~ | 290 | ||||
| -rw-r--r-- | wikidata/sparql.scm | 111 |
3 files changed, 617 insertions, 0 deletions
diff --git a/wikidata/apis.scm b/wikidata/apis.scm new file mode 100644 index 0000000..ac77cf6 --- /dev/null +++ b/wikidata/apis.scm @@ -0,0 +1,216 @@ +;;; Copyright © 2018 swedebugia <swedebugia@riseup.net> +;;; +;;; This file is part of guile-wikidata. +;;; +;;; guile-wikidata is free software; you can redistribute it and/or modify it +;;; under the terms of the GNU General Public License as published by +;;; the Free Software Foundation; either version 3 of the License, or (at +;;; your option) any later version. +;;; +;;; guile-wikidata is distributed in the hope that it will be useful, but +;;; WITHOUT ANY WARRANTY; without even the implied warranty of +;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;;; GNU General Public License for more details. +;;; +;;; You should have received a copy of the GNU General Public License +;;; along with guile-wikidata. If not, see <http://www.gnu.org/licenses/>. + +;; See +;; https://www.mediawiki.org/wiki/API:Presenting_Wikidata_knowledge +;; for a good description of workflow when integrating the Wikidata API into +;; an application. + +;; See the example in the bottom for how to use this library. + +(define-module (wikidata apis) + #:use-module (ice-9 format) + #:use-module (ice-9 optargs) + #:use-module (ice-9 rdelim) + #:use-module (ice-9 receive) + #:use-module (json) + #:use-module (guix http-client) + #:use-module (guix import utils) + #:use-module (guix import json) + #:use-module (srfi srfi-1) + #:use-module (srfi srfi-34) + #:use-module (web uri) + #:export (show)) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Low-level proc. 
;; URI-decorators and fetching
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Endpoint shared by every action-API URI builder below.
(define %wikidata-api-url "https://www.wikidata.org/w/api.php")

(define (name->string x)
  "Return X as a string. X may be a symbol (the documented calling
convention for FORMAT, LANGUAGE, TYPE) or already a string."
  (if (symbol? x)
      (symbol->string x)
      x))

(define (api-uri params)
  "Return a URI object for the Wikidata action API whose query string is
built from PARAMS, an alist of (NAME . VALUE) strings. Every VALUE is
percent-encoded so that spaces and other reserved characters in search
terms do not produce an invalid URI."
  (string->uri
   (string-append %wikidata-api-url "?"
                  (string-join
                   (map (lambda (param)
                          (string-append (car param) "="
                                         (uri-encode (cdr param))))
                        params)
                   "&"))))

;;TODO implement caching
(define (wdquery-alist uri)
  "Fetch the JSON document at URI and return whatever json-fetch-alist
returns for it (expected to be an alist)."
  (json-fetch-alist uri))

;; Inspired by PYPI wikidata_suggest
(define* (search-uri name
                     #:key
                     (format 'json)
                     (language 'en)
                     (type 'item)
                     (continue 0)
                     (limit 10))
  "Build URI for the Wikidata wbsearchentities API.

NAME is the search term (a string). FORMAT, LANGUAGE and TYPE are symbols
or strings. CONTINUE is the numeric result offset and LIMIT the maximum
number of hits to request; LIMIT defaults to 10.

LANGUAGE is sent both as language (the language searched in) and as
uselang (the language the hits are returned in), so that results honor
the requested language."
  (api-uri `(("search"   . ,name)
             ("action"   . "wbsearchentities")
             ("format"   . ,(name->string format))
             ("language" . ,(name->string language))
             ("uselang"  . ,(name->string language))
             ("type"     . ,(name->string type))
             ("continue" . ,(number->string continue))
             ("limit"    . ,(number->string limit)))))

;; Inspired by
;; https://opendata.stackexchange.com/questions/5248/how-to-get-the-name-of-a-wikidata-item
;; TODO add handling of more than one qid.
(define* (getentities-uri qid
                          #:optional property
                          #:key (language 'en)
                          (format 'json))
  "Build URI for the Wikidata wbgetentities API. QID is the entity id
string. PROPERTY, when given, is a symbol or string naming one of the
props listed here:
https://www.wikidata.org/wiki/Special:ApiHelp/wbgetentities
LANGUAGE is sent as the languages parameter, which is the name
wbgetentities uses for its language filter."
  (api-uri `(("ids"       . ,qid)
             ("action"    . "wbgetentities")
             ("format"    . ,(name->string format))
             ("languages" . ,(name->string language))
             ;; Only add props when the caller asked for one.
             ,@(if property
                   `(("props" . ,(name->string property)))
                   '()))))

;; Only one at a time.
(define* (getclaims-uri qid
                        #:key (format 'json))
  "Build URI for the Wikidata wbgetclaims API for the single entity QID."
  (api-uri `(("entity" . ,qid)
             ("action" . "wbgetclaims")
             ("format" . ,(name->string format)))))

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Medium-level proc.
;; Extract data from queries
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(define* (get-label qid #:key (language 'en))
  "Fetch the label of QID in LANGUAGE and return the pair
(label . VALUE), with label as a string key. VALUE is a placeholder
string when no label could be found.

NOTE(review): assumes the wbgetentities response nests the label as
entities -> QID -> labels -> LANGUAGE -> value, and that QID is spelled
exactly as the server echoes it (e.g. upper-case Q) -- confirm."
  (let* ((response (wdquery-alist
                    (getentities-uri qid "labels" #:language language)))
         ;; Walk down the nested alist one key at a time; any missing
         ;; level turns the result into #f instead of raising.
         (value (fold (lambda (key alist)
                        (and (list? alist)
                             (assoc-ref alist key)))
                      response
                      (list "entities" qid "labels"
                            (name->string language) "value"))))
    (cons "label"
          (if (string? value)
              value
              "(No label in the database)"))))

(define (get-properties qid)
  "Fetch the claims of QID and return the list of its P-property keys.
Returns the empty list when the response has no usable claims entry."
  (let ((claims (assoc-ref (or (wdquery-alist (getclaims-uri qid)) '())
                           "claims")))
    ;; Each claims entry is (PROPERTY-ID . STATEMENTS); keep only the ids.
    (if (list? claims)
        (map car claims)
        '())))

;;test
;;(display (map get-label (get-properties "Q180736")))

;; TODO factorize nested alist check
(define* (extract-element alist element
                          #:optional (x 30))
  "Accept unnested ALIST and return the value of ELEMENT, which must be
one of the strings label, description or id. String values longer than X
characters (default 30) are cut to X characters followed by an ellipsis.
A missing ELEMENT yields a placeholder string. Raises an error when
ELEMENT is not accepted, when ALIST is not a list, or when the value is
the empty list."
  (cond
   ((not (member element '("label" "description" "id")))
    (error "extract-element: accepts only the strings: label, description or id"))
   ((not (list? alist))
    ;; The offending value travels with the error as an irritant; a
    ;; display call placed after error would never run.
    (error "extract-element: Not a proper list:" alist))
   (else
    (let ((result (assoc-ref alist element)))
      (cond
       ((string? result)
        (if (> (string-length result) x)
            (string-append (substring result 0 x) "...")
            result))
       ((null? result)
        (error "extract-element: No" element "found:" alist))
       (else
        (string-append "(No " element " in the database)")))))))

(define (extract-all alist)
  "Return an alist with the label, description and id of the unnested
ALIST. Raises an error when ALIST is empty or not a list."
  (cond
   ((not (list? alist))
    (error "extract-all: Not a proper list:" alist))
   ((null? alist)
    (error "extract-all: Nothing found."))
   (else
    `(("label" . ,(extract-element alist "label"))
      ("description" . ,(extract-element alist "description"))
      ("id" . ,(extract-element alist "id"))))))

(define (pretty-print result)
  "Takes an unnested alist RESULT and prints one line: the id, a tab,
then label and description joined by a colon and cut to 50 characters."
  (let* ((l (extract-element result "label"))
         (d (extract-element result "description"))
         (ld (string-append l ": " d))
         (x 50))
    (format #t "~a:\t~a~%"
            (extract-element result "id")
            ;; Join and truncate long labels and descriptions
            (if (> (string-length ld) x)
                (string-append (substring ld 0 x) "...")
                ld))))

(define* (extract-search name
                         #:key
                         (language 'en)
                         (limit 10))
  "Search Wikidata for NAME and return a list in which each element is
an alist of label, description and id. Returns the empty list when the
response carries no search entry."
  (let ((response (wdquery-alist
                   (search-uri name
                               #:language language
                               #:limit limit))))
    (map extract-all
         (or (and (list? response)
                  (assoc-ref response "search"))
             '()))))

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; High-level
;; Get search results fast
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(define* (search query
                 #:key
                 (language 'en)
                 (limit 10))            ;default to 10
  "Search for QUERY and print up to LIMIT results in a pretty truncated
way. Fewer lines are printed when the server returns fewer hits."
  (let* ((result (extract-search query
                                 #:language language
                                 #:limit limit))
         ;; take raises an error when asked for more elements than the
         ;; list holds.
         (shown (take result (min limit (length result)))))
    (format #t "First ~a:\tLabel & Description~%" limit)
    ;; for-each guarantees the lines are printed in result order.
    (for-each pretty-print shown)))

;; For example:
;; NOTE(review): this call runs, and hits the network, every time the
;; module is loaded; consider moving it to a separate example script.
(search "paris" #:limit 15 #:language 'es)

diff --git a/wikidata/apis.scm~ b/wikidata/apis.scm~ new file mode 100644 index 0000000..3bea6f5 --- /dev/null +++ b/wikidata/apis.scm~ @@ -0,0 +1,290 @@
;;; Copyright © 2018 swedebugia <swedebugia@riseup.net>
;;;
;;; This file is part of guile-wikidata.
;;;
;;; guile-wikidata is free software; you can redistribute it and/or modify it
;;; under the terms of the GNU General Public License as published by
;;; the Free Software Foundation; either version 3 of the License, or (at
;;; your option) any later version.
;;;
;;; guile-wikidata is distributed in the hope that it will be useful, but
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;;; GNU General Public License for more details.
;;;
;;; You should have received a copy of the GNU General Public License
;;; along with guile-wikidata. If not, see <http://www.gnu.org/licenses/>.

;;; See
;;; https://www.mediawiki.org/wiki/API:Presenting_Wikidata_knowledge
;;; for a good description of workflow when integrating the Wikidata API into
;;; an application.

;;; To generate SPARQL queries I recommend
;;; https://query.wikidata.org/.

(define-module (wikidata)
  #:use-module (ice-9 format)
  #:use-module (ice-9 optargs)
  #:use-module (ice-9 rdelim)
  #:use-module (ice-9 receive)
  #:use-module (json)
  #:use-module (guix http-client)
  #:use-module (guix import utils)
;;  #:use-module (sparql driver) ; does not support blazegraph
;;  #:use-module (sparql lang)   ; did not work ??
  #:use-module (sparql util)
  #:use-module (guix import json)
  #:use-module (srfi srfi-1)
  #:use-module (srfi srfi-34)
  #:use-module (sxml simple)
  #:use-module (web uri)
  #:export (show))

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Low-level proc.
;; URI-decorators and fetching
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Endpoint shared by every action-API URI builder below.
(define %wikidata-api-url "https://www.wikidata.org/w/api.php")

(define (name->string x)
  "Return X as a string. X may be a symbol or already a string."
  (if (symbol? x)
      (symbol->string x)
      x))

(define (api-uri params)
  "Return a URI object for the Wikidata action API whose query string is
built from PARAMS, an alist of (NAME . VALUE) strings. Every VALUE is
percent-encoded."
  (string->uri
   (string-append %wikidata-api-url "?"
                  (string-join
                   (map (lambda (param)
                          (string-append (car param) "="
                                         (uri-encode (cdr param))))
                        params)
                   "&"))))

;; TODO flesh out http-code from Guix into a new guile-http library to
;; avoid pulling in all of guix in this library.
(define* (xml-fetch url
                    ;; Note: many websites returns 403 if we omit a
                    ;; 'User-Agent' header.
                    #:key (headers `((user-agent . "GNU Guile")
                                     (Accept . "application/json"))))
  "Open URL with http-fetch and return the resulting port unmodified, or
#f if URL returns 403 or 404. HEADERS is a list of HTTP headers to pass in
the query. The caller is responsible for closing the port."
  (guard (c ((and (http-get-error? c)
                  (let ((code (http-get-error-code c)))
                    (or (= 403 code)
                        (= 404 code))))
             #f))
    (http-fetch url #:headers headers)))

;;TODO implement caching
(define (wdquery-alist uri)
  "Fetch the JSON document at URI and return whatever json-fetch-alist
returns for it (expected to be an alist)."
  (json-fetch-alist uri))

(define (wdquery-xml uri)
  "Fetch the data at URI and return the port (or #f, see xml-fetch)."
  (xml-fetch uri))

;; Inspired by PYPI wikidata_suggest
(define* (search-uri name
                     #:key
                     (format 'json)
                     (language 'en)
                     (type 'item)
                     (continue 0)
                     (limit 10))
  "Build URI for the Wikidata wbsearchentities API. NAME is the search
term. LANGUAGE is sent both as language (the language searched in) and
as uselang (the language the hits are returned in)."
  (api-uri `(("search"   . ,name)
             ("action"   . "wbsearchentities")
             ("format"   . ,(name->string format))
             ("language" . ,(name->string language))
             ("uselang"  . ,(name->string language))
             ("type"     . ,(name->string type))
             ("continue" . ,(number->string continue))
             ("limit"    . ,(number->string limit)))))

;; Inspired by
;; https://opendata.stackexchange.com/questions/5248/how-to-get-the-name-of-a-wikidata-item
;; TODO add handling of more than one qid.
(define* (getentities-uri qid
                          #:optional property
                          #:key (language 'en)
                          (format 'json))
  "Build URI for the Wikidata wbgetentities API. PROPERTY, when given, is
a symbol or string naming one of the props listed here:
https://www.wikidata.org/wiki/Special:ApiHelp/wbgetentities
LANGUAGE is sent as the languages parameter."
  (api-uri `(("ids"       . ,qid)
             ("action"    . "wbgetentities")
             ("format"    . ,(name->string format))
             ("languages" . ,(name->string language))
             ;; Only add props when the caller asked for one.
             ,@(if property
                   `(("props" . ,(name->string property)))
                   '()))))

;; Only one at a time.
(define* (getclaims-uri qid
                        #:key (format 'json))
  "Build URI for the Wikidata wbgetclaims API for the single entity QID."
  (api-uri `(("entity" . ,qid)
             ("action" . "wbgetclaims")
             ("format" . ,(name->string format)))))

;; Inspired by http://r.duckduckgo.com/l/?kh=-1&uddg=http%3A%2F%2Fstackoverflow.com%2Fquestions%2F29886388%2Fddg%2335118127
;;;
;;; Wikidata-specific SPARQL-QUERY using a GET request (it also accept POST)
;;; ---------------------------------------------------------------------------
(define* (wdsparql-uri query
                       #:key
                       (uri #f)
                       (type "json"))
  "Build URI for the Wikidata HTTP GET SPARQL API. QUERY is the SPARQL
text; it is percent-encoded and sent as the query parameter. URI
overrides the default endpoint. TYPE is accepted but currently unused."
  (let ((endpoint (or uri "http://query.wikidata.org/sparql")))
    (string->uri
     (string-append endpoint "?query=" (uri-encode query)))))

;; PREFIX wd: <http://www.wikidata.org/entity/>
;; PREFIX wdt: <http://www.wikidata.org/prop/direct/>

;; SELECT DISTINCT ?item
;; WHERE {
;; ?item wdt:P31/wdt:P279* wd:Q19723451
;; }

;; broken
;; (display-query-results-of
;;  (wdquery-xml
;;   (wdsparql-uri
;;    "PREFIX wd: <http://www.wikidata.org/entity/>
;; PREFIX wdt: <http://www.wikidata.org/prop/direct/> SELECT DISTINCT
;; ?item WHERE \\{?item wdt:P31/wdt:P279* wd:Q19723451\\}"
;;    )))

;; Broken attempt to use SEXP from (sparql lang).
;; (let (
;;       (wd (prefix "http://www.wikidata.org/entity/"))
;;       (wdt (prefix "http://www.wikidata.org/prop/direct/")))
;;   (select #:destinct
;;    ;; columns
;;    ;;'(subject predicate object)
;;    `((,'item)))
;;    ;; pattern
;;    ;; `((subject predicate object)
;;    ;; (subject ,(rdf "type") ,(internal "Sample")
;;    `(((,'where ,'item) (,wdt ":P31/" ,wdt ":P279*") (,wd ":Q19723451")))
;;    )))

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Medium-level proc.
;; Extract data from queries
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(define* (get-label qid #:key (language 'en))
  "Fetch the label of QID in LANGUAGE and return the pair
(label . VALUE), with label as a string key. VALUE is a placeholder
string when no label could be found.

NOTE(review): assumes the wbgetentities response nests the label as
entities -> QID -> labels -> LANGUAGE -> value -- confirm."
  (let* ((response (wdquery-alist
                    (getentities-uri qid "labels" #:language language)))
         ;; Walk down the nested alist; a missing level yields #f.
         (value (fold (lambda (key alist)
                        (and (list? alist)
                             (assoc-ref alist key)))
                      response
                      (list "entities" qid "labels"
                            (name->string language) "value"))))
    (cons "label"
          (if (string? value)
              value
              "(No label in the database)"))))

(define (get-properties qid)
  "Fetch the claims of QID and return the list of its P-property keys.
Returns the empty list when the response has no usable claims entry."
  (let ((claims (assoc-ref (or (wdquery-alist (getclaims-uri qid)) '())
                           "claims")))
    (if (list? claims)
        (map car claims)
        '())))

;;test
;;(display (map get-label (get-properties "Q180736")))

;; TODO factorize nested alist check
(define* (extract-element alist element
                          #:optional (x 30))
  "Accept unnested ALIST and return the value of ELEMENT, which must be
one of the strings label, description or id. String values longer than X
characters (default 30) are cut to X characters followed by an ellipsis.
A missing ELEMENT yields a placeholder string."
  (cond
   ((not (member element '("label" "description" "id")))
    (error "extract-element: accepts only the strings: label, description or id"))
   ((not (list? alist))
    ;; Pass the offending value as an irritant; code after error never runs.
    (error "extract-element: Not a proper list:" alist))
   (else
    (let ((result (assoc-ref alist element)))
      (cond
       ((string? result)
        (if (> (string-length result) x)
            (string-append (substring result 0 x) "...")
            result))
       ((null? result)
        (error "extract-element: No" element "found:" alist))
       (else
        (string-append "(No " element " in the database)")))))))

(define (extract-all alist)
  "Return an alist with the label, description and id of the unnested
ALIST. Raises an error when ALIST is empty or not a list."
  (cond
   ((not (list? alist))
    (error "extract-all: Not a proper list:" alist))
   ((null? alist)
    (error "extract-all: Nothing found."))
   (else
    `(("label" . ,(extract-element alist "label"))
      ("description" . ,(extract-element alist "description"))
      ("id" . ,(extract-element alist "id"))))))

(define (pretty-print result)
  "Takes an unnested alist RESULT and prints one line: the id, a tab,
then label and description joined by a colon and cut to 50 characters."
  (let* ((l (extract-element result "label"))
         (d (extract-element result "description"))
         (ld (string-append l ": " d))
         (x 50))
    (format #t "~a:\t~a~%"
            (extract-element result "id")
            ;; Join and truncate long labels and descriptions
            (if (> (string-length ld) x)
                (string-append (substring ld 0 x) "...")
                ld))))

(define* (extract-search name
                         #:key
                         (language 'en)
                         (limit 10))
  "Search Wikidata for NAME and return a list in which each element is
an alist of label, description and id. LANGUAGE defaults to en so that
the URI builder never receives #f."
  (let ((response (wdquery-alist
                   (search-uri name
                               #:language language
                               #:limit limit))))
    (map extract-all
         (or (and (list? response)
                  (assoc-ref response "search"))
             '()))))

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; High-level
;; Get the results fast
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(define* (search query x
                 #:key (language 'en))
  "Search for QUERY in LANGUAGE and print up to the first X results in a
pretty truncated way."
  (let* ((result (extract-search query
                                 #:language language
                                 #:limit x))
         ;; take raises an error when the list is shorter than requested.
         (shown (take result (min x (length result)))))
    (format #t "First ~a:\tLabel & Description~%" x)
    ;; for-each guarantees the lines are printed in result order.
    (for-each pretty-print shown)))

;; For example:
;;(search "guix" 10 #:language 'fr)

diff --git a/wikidata/sparql.scm b/wikidata/sparql.scm new file mode 100644 index 0000000..5b71152 --- /dev/null +++ b/wikidata/sparql.scm @@ -0,0 +1,111 @@
;;; Copyright © 2018 swedebugia <swedebugia@riseup.net>
;;;
;;; This file is part of guile-wikidata.
;;;
;;; guile-wikidata is free software; you can redistribute it and/or modify it
;;; under the terms of the GNU General Public License as published by
;;; the Free Software Foundation; either version 3 of the License, or (at
;;; your option) any later version.
;;;
;;; guile-wikidata is distributed in the hope that it will be useful, but
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;;; GNU General Public License for more details.
;;;
;;; You should have received a copy of the GNU General Public License
;;; along with guile-wikidata. If not, see <http://www.gnu.org/licenses/>.

;; See the example-sparql.scm for how to use this library.

(define-module (wikidata sparql)
  #:use-module (ice-9 format)
  #:use-module (ice-9 rdelim)
  #:use-module (ice-9 receive)
  #:use-module (guix http-client)
;;  #:use-module (guix import utils) ; useful stuff there
;;  #:use-module (sparql driver) ; does not support blazegraph
;;  #:use-module (sparql lang)   ; did not work ??
  #:use-module (sparql util)
  #:use-module (srfi srfi-1)
  #:use-module (srfi srfi-34)
  #:use-module (web uri)
  #:export (show-sparql))

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Low-level proc.
;; URI-decorators and fetching
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; TODO flesh out http-code from Guix into a new guile-http library to
;; avoid pulling in all of guix in this library.
(define* (xml-fetch url
                    ;; Note: many websites returns 403 if we omit a
                    ;; 'User-Agent' header.
                    #:key (headers `((user-agent . "GNU Guile")
                                     (Accept . "application/json"))))
  "Open URL with http-fetch and return the resulting port unmodified, or
#f if URL returns 403 or 404. HEADERS is a list of HTTP headers to pass in
the query. The port is NOT closed here; the consumer must close it."
  (guard (c ((and (http-get-error? c)
                  ;; Named CODE so the error procedure is not shadowed.
                  (let ((code (http-get-error-code c)))
                    (or (= 403 code)
                        (= 404 code))))
             #f))
    (http-fetch url #:headers headers)))

(define (wdquery-xml uri)
  "Fetch the data at URI and return the port (or #f, see xml-fetch)."
  (xml-fetch uri))


;; Inspired by http://r.duckduckgo.com/l/?kh=-1&uddg=http%3A%2F%2Fstackoverflow.com%2Fquestions%2F29886388%2Fddg%2335118127
;;;
;;; Wikidata-specific SPARQL-QUERY using a GET request (it also accept POST)
;;; ---------------------------------------------------------------------------
(define* (wdsparql-uri query
                       #:key
                       (uri #f)
                       (type "json"))
  "Build URI for the Wikidata HTTP GET SPARQL API.

QUERY is the SPARQL text; it is percent-encoded and sent as the value of
the query parameter. Without the parameter name the endpoint never
receives the query, so clauses such as LIMIT appear to be ignored.
URI overrides the default endpoint. TYPE is accepted for compatibility
but is currently unused."
  (let ((endpoint (or uri "http://query.wikidata.org/sparql")))
    (string->uri
     (string-append endpoint "?query=" (uri-encode query)))))


;;;
;;; Medium-level proc.
;;; ---------------------------------------------------------------------------

;; TODO add rationale for why this is copied from (sparql util)
(define (display-query-results port)
  "Print every line read from PORT, with its comma-separated fields
separated by tabs, then close PORT once. Does nothing when PORT is #f,
which is what xml-fetch returns on a 403 or 404 response."
  (when port
    ;; Iterate rather than recurse: the stack stays flat for long
    ;; results and close-port runs exactly once.
    (let loop ((line (read-line port)))
      (unless (eof-object? line)
        ;; The default output format is assumed to be comma-separated
        ;; values (CSV).
        (format #t "~{~a~/~}~%" (string-split line #\,))
        (loop (read-line port))))
    (close-port port)))

;;;
;;; High-level proc.
;;; ---------------------------------------------------------------------------

;; See example-sparql.scm for how to enter the query
(define (show-sparql query)
  "Run the query on the Wikidata Blazegraph server. Show the result on current-output-port."
  (display-query-results
   (xml-fetch
    (wdsparql-uri
     query))))
