summaryrefslogtreecommitdiff
path: root/wikidata
diff options
context:
space:
mode:
authorswedebugia <swedebugia@riseup.net>2018-12-10 22:59:09 +0100
committerswedebugia <swedebugia@riseup.net>2018-12-11 00:00:09 +0100
commit39e63b9b047fd8863ddaaeb627d9e93f2b3d34d2 (patch)
tree30d964370a7880aa1c1e403bdbfe22ecd1576e11 /wikidata
parentpre-release 0.1: (diff)
downloadguile-wikidata-39e63b9b047fd8863ddaaeb627d9e93f2b3d34d2.tar.gz
wikidata: Add rudimentary SPARQL support, split library in two parts,readme
update README, improve examples. 2 bugs exist: 1) the low-level api proc. does not honor language. 2) SPARQL LIMIT is not honored by the server.
Diffstat (limited to 'wikidata')
-rw-r--r--wikidata/apis.scm216
-rw-r--r--wikidata/apis.scm~290
-rw-r--r--wikidata/sparql.scm111
3 files changed, 617 insertions, 0 deletions
diff --git a/wikidata/apis.scm b/wikidata/apis.scm
new file mode 100644
index 0000000..ac77cf6
--- /dev/null
+++ b/wikidata/apis.scm
@@ -0,0 +1,216 @@
+;;; Copyright © 2018 swedebugia <swedebugia@riseup.net>
+;;;
+;;; This file is part of guile-wikidata.
+;;;
+;;; guile-wikidata is free software; you can redistribute it and/or modify it
+;;; under the terms of the GNU General Public License as published by
+;;; the Free Software Foundation; either version 3 of the License, or (at
+;;; your option) any later version.
+;;;
+;;; guile-wikidata is distributed in the hope that it will be useful, but
+;;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;;; GNU General Public License for more details.
+;;;
+;;; You should have received a copy of the GNU General Public License
+;;; along with guile-wikidata. If not, see <http://www.gnu.org/licenses/>.
+
+;; See
+;; https://www.mediawiki.org/wiki/API:Presenting_Wikidata_knowledge
+;; for a good description of workflow when integrating the Wikidata API into
+;; an application.
+
+;; See the example at the bottom of this file for how to use this library.
+
+(define-module (wikidata apis)
+ #:use-module (ice-9 format)
+ #:use-module (ice-9 optargs)
+ #:use-module (ice-9 rdelim)
+ #:use-module (ice-9 receive)
+ #:use-module (json)
+ #:use-module (guix http-client)
+ #:use-module (guix import utils)
+ #:use-module (guix import json)
+ #:use-module (srfi srfi-1)
+ #:use-module (srfi srfi-34)
+ #:use-module (web uri)
+ #:export (show))
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Low-level proc.
+;; URI-decorators and fetching
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;TODO implement caching
+(define (wdquery-alist uri)
+ "Fetch the resource at URI with `json-fetch-alist' and return its result
+unchanged.  The callers in this module treat that result as an association
+list with string keys."
+ ;; Thin wrapper: the single place where caching could be added later.
+ (json-fetch-alist uri))
+
+;; Inspired by PYPI wikidata_suggest
+(define* (search-uri name
+                     #:key
+                     (format 'json)
+                     (language 'en)
+                     (type 'item)
+                     (continue 0)
+                     (limit 10))
+  "Build the URI of a Wikidata `wbsearchentities' API request.
+
+NAME is the search text (a string); it is percent-encoded so that spaces and
+non-ASCII characters still give a valid URI.  FORMAT, LANGUAGE and TYPE are
+symbols.  CONTINUE and LIMIT are numbers (LIMIT defaults to 10).
+
+Return the value of `string->uri' applied to the assembled request string."
+  (let ((endpoint "https://www.wikidata.org/w/api.php")
+        ;; Parameter order is kept stable so that the resulting URIs are
+        ;; predictable.
+        (parameters
+         `(("search"   . ,(uri-encode name))
+           ("action"   . "wbsearchentities")
+           ("format"   . ,(symbol->string format))
+           ("language" . ,(symbol->string language))
+           ("type"     . ,(symbol->string type))
+           ("continue" . ,(number->string continue))
+           ("limit"    . ,(number->string limit)))))
+    (string->uri
+     (string-append endpoint "?"
+                    (string-join
+                     (map (lambda (parameter)
+                            (string-append (car parameter) "=" (cdr parameter)))
+                          parameters)
+                     "&")))))
+
+;; Inspired by
+;; https://opendata.stackexchange.com/questions/5248/how-to-get-the-name-of-a-wikidata-item
+;; TODO add handling of more than one qid.
+(define* (getentities-uri qid
+                          #:optional property
+                          #:key (language 'en)
+                          (format 'json))
+  "Build the URI of a Wikidata `wbgetentities' API request for entity QID.
+
+PROPERTY, when given, is a string or a symbol naming one of the props listed
+at https://www.wikidata.org/wiki/Special:ApiHelp/wbgetentities; it is sent
+as the `props' parameter.  LANGUAGE and FORMAT are symbols.
+
+Return the value of `string->uri' applied to the assembled request string."
+  (let* ((endpoint "https://www.wikidata.org/w/api.php")
+         ;; Accept both spellings of PROPERTY; anything else (including the
+         ;; empty string) means \"no props filter\".
+         (props (cond ((symbol? property) (symbol->string property))
+                      ((and (string? property)
+                            (not (string-null? property)))
+                       property)
+                      (else #f)))
+         ;; wbgetentities filters on `languages' (plural); a `language'
+         ;; parameter is not recognized by that action.
+         (base (string-append endpoint
+                              "?ids=" qid
+                              "&action=wbgetentities"
+                              "&format=" (symbol->string format)
+                              "&languages=" (symbol->string language))))
+    (string->uri
+     (if props
+         (string-append base "&props=" props)
+         base))))
+
+;; Only one at a time.
+(define* (getclaims-uri qid
+                        #:key (format 'json))
+  "Return the URI of a Wikidata `wbgetclaims' API request for the single
+entity QID (a string).  FORMAT is a symbol naming the response format."
+  (string->uri
+   (string-append "https://www.wikidata.org/w/api.php"
+                  "?entity=" qid
+                  "&action=wbgetclaims"
+                  "&format=" (symbol->string format))))
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Medium-level proc.
+;; Extract data from queries
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define* (get-label qid #:key (language 'en))
+  "Fetch entity QID from Wikidata and return its label in LANGUAGE (a
+symbol, `en' by default) as a string, or #f when the response holds no such
+label.
+
+The label is looked up at entities -> QID -> labels -> LANGUAGE -> value in
+the response; this assumes `wdquery-alist' yields nested association lists
+with string keys."
+  ;; The props name is passed as a symbol because `getentities-uri' honours
+  ;; symbols.
+  (let descend ((node (wdquery-alist
+                       (getentities-uri qid 'labels #:language language)))
+                (path (list "entities" qid "labels"
+                            (symbol->string language) "value")))
+    (cond ((null? path) node)
+          ;; Only descend into non-empty lists; anything else means the
+          ;; expected structure is absent.
+          ((pair? node) (descend (assoc-ref node (car path)) (cdr path)))
+          (else #f))))
+
+(define (get-properties qid)
+  "Fetch the claims of entity QID and return the list of keys found under
+\"claims\" in the response (the P-property identifiers).  Return the empty
+list when the response has no \"claims\" entry."
+  (let* ((response (wdquery-alist (getclaims-uri qid)))
+         ;; Look the key up by name: the position of \"claims\" in the
+         ;; response alist is not something to rely on.
+         (claims (and (pair? response)
+                      (assoc-ref response "claims"))))
+    (if (pair? claims)
+        (map car claims)
+        '())))
+
+;;test
+;;(display (map getlabel (get-properties "Q180736")))
+
+;; TODO factorize nested alist check
+(define* (extract-element alist element
+                          #:optional (x 30))
+  "Return the value stored under ELEMENT in the flat association list ALIST.
+
+ELEMENT must be one of the strings \"label\", \"description\" or \"id\".
+A string value longer than X characters (30 by default) is cut to X
+characters and suffixed with \"...\".  When the value is not a string, for
+instance because ELEMENT is absent, a placeholder string is returned.
+
+Raise an error when ELEMENT is not an accepted key, when ALIST is not a
+proper list, or when the stored value is the empty list."
+  (cond
+   ((not (member element '("label" "description" "id")))
+    (error "extract-element: accepts only the strings: label, description or id"))
+   ((not (list? alist))
+    ;; The offending datum travels with the error instead of being printed
+    ;; by code that could never run after `error'.
+    (error "extract-element: Not a proper list:" alist))
+   (else
+    (let ((value (assoc-ref alist element)))
+      (cond
+       ((string? value)
+        (if (> (string-length value) x)
+            (string-append (substring value 0 x) "...")
+            value))
+       ((null? value)
+        (error "extract-element: No" element "found:" alist))
+       (else
+        (string-append "(No " element " in the database)")))))))
+
+(define (extract-all alist)
+  "Return a fresh alist holding the \"label\", \"description\" and \"id\"
+entries of the flat association list ALIST, each obtained with
+`extract-element'.  Raise an error when ALIST is empty or is not a proper
+list."
+  (cond
+   ((not (list? alist))
+    ;; Pass the datum to `error'; nothing placed after `error' would run.
+    (error "extract-all: Not a proper list:" alist))
+   ((null? alist)
+    (error "extract-all: Nothing found."))
+   (else
+    `(("label" . ,(extract-element alist "label"))
+      ("description" . ,(extract-element alist "description"))
+      ("id" . ,(extract-element alist "id"))))))
+
+(define (pretty-print result)
+  "Print the flat alist RESULT on one line: its id, a colon and a tab, then
+\"LABEL: DESCRIPTION\" cut to 50 characters (plus \"...\") when longer."
+  (let* ((max-width 50)
+         (label (extract-element result "label"))
+         (description (extract-element result "description"))
+         (summary (string-append label ": " description))
+         (shown (if (> (string-length summary) max-width)
+                    (string-append (substring summary 0 max-width) "...")
+                    summary)))
+    (format #t "~a:\t~a~%" (extract-element result "id") shown)))
+
+(define* (extract-search name
+                         #:key
+                         (language 'en)
+                         (limit 10))
+  "Search Wikidata for NAME and return a list with one alist of label,
+description and id per hit.  LANGUAGE is a symbol and LIMIT a number; both
+are forwarded to `search-uri'.  Return the empty list when the response
+contains no \"search\" entry."
+  (let* ((response (wdquery-alist
+                    (search-uri name
+                                #:language language
+                                #:limit limit)))
+         ;; Guard against a non-alist response before looking anything up.
+         (hits (and (pair? response)
+                    (assoc-ref response "search"))))
+    (if (pair? hits)
+        (map extract-all hits)
+        '())))
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; High-level
+;; Get search results fast
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define* (search query
+                 #:key
+                 (language 'en)
+                 (limit 10)) ;default to 10
+  "Search Wikidata for QUERY in LANGUAGE and print at most LIMIT results,
+one per line, in a truncated form.  When fewer than LIMIT results come
+back, all of them are printed."
+  (let* ((results (extract-search query
+                                  #:language language
+                                  #:limit limit))
+         ;; SRFI-1 `take' signals an error when asked for more elements than
+         ;; the list holds, so never ask for more than there are.
+         (shown (min limit (length results))))
+    (format #t "First ~a:\tLabel & Description~%" limit)
+    ;; Printing is a side effect only: `for-each' instead of `map'.
+    (for-each pretty-print (take results shown))))
+
+;; For example:
+;; NOTE(review): this is a live top-level call, not a comment, so it is
+;; evaluated every time this file is loaded: it queries Wikidata through
+;; `search' and prints the results.  Consider commenting it out if loading
+;; the module should have no side effects.
+(search "paris" #:limit 15 #:language 'es)
+
diff --git a/wikidata/apis.scm~ b/wikidata/apis.scm~
new file mode 100644
index 0000000..3bea6f5
--- /dev/null
+++ b/wikidata/apis.scm~
@@ -0,0 +1,290 @@
+;;; Copyright © 2018 swedebugia <swedebugia@riseup.net>
+;;;
+;;; This file is part of guile-wikidata.
+;;;
+;;; guile-wikidata is free software; you can redistribute it and/or modify it
+;;; under the terms of the GNU General Public License as published by
+;;; the Free Software Foundation; either version 3 of the License, or (at
+;;; your option) any later version.
+;;;
+;;; guile-wikidata is distributed in the hope that it will be useful, but
+;;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;;; GNU General Public License for more details.
+;;;
+;;; You should have received a copy of the GNU General Public License
+;;; along with guile-wikidata. If not, see <http://www.gnu.org/licenses/>.
+
+;;; See
+;;; https://www.mediawiki.org/wiki/API:Presenting_Wikidata_knowledge
+;;; for a good description of workflow when integrating the Wikidata API into
+;;; an application.
+
+;;; To generate SPARQL queries I recommend
+;;; https://query.wikidata.org/.
+
+(define-module (wikidata)
+ #:use-module (ice-9 format)
+ #:use-module (ice-9 optargs)
+ #:use-module (ice-9 rdelim)
+ #:use-module (ice-9 receive)
+ #:use-module (json)
+ #:use-module (guix http-client)
+ #:use-module (guix import utils)
+;; #:use-module (sparql driver) ; does not support blazegraph
+;; #:use-module (sparql lang) ; did not work ??
+ #:use-module (sparql util)
+ #:use-module (guix import json)
+ #:use-module (srfi srfi-1)
+ #:use-module (srfi srfi-34)
+ #:use-module (sxml simple)
+ #:use-module (web uri)
+ #:export (show))
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Low-level proc.
+;; URI-decorators and fetching
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; TODO: factor the HTTP code out of Guix into a separate guile-http library
+;; so that this library does not pull in all of Guix.
+(define* (xml-fetch url
+                    ;; Many web sites answer 403 when no 'User-Agent'
+                    ;; header is sent, hence the default below.
+                    #:key (headers `((user-agent . "GNU Guile")
+                                     (Accept . "application/json"))))
+  "Call `http-fetch' on URL with the HTTP headers HEADERS and return what it
+yields, unmodified and unparsed (presumably an input port; it is not closed
+here).  Return #f when the request fails with HTTP status 403 or 404."
+  (guard (c ((and (http-get-error? c)
+                  (let ((status (http-get-error-code c)))
+                    (or (= 403 status)
+                        (= 404 status))))
+             #f))
+    ;; The value is bound to one variable on purpose, so that exactly one
+    ;; value is handed back to the caller.
+    (let ((fetched (http-fetch url #:headers headers)))
+      fetched)))
+
+;;TODO implement caching
+(define (wdquery-alist uri)
+ "Fetch the resource at URI with `json-fetch-alist' and return its result
+unchanged.  The callers in this module treat that result as an association
+list with string keys."
+ ;; Thin wrapper: the single place where caching could be added later.
+ (json-fetch-alist uri))
+
+(define (wdquery-xml uri)
+ "Fetch URI with `xml-fetch', using its default headers, and return the
+result unchanged: the value obtained from `http-fetch', or #f on HTTP 403
+or 404."
+ (xml-fetch uri))
+
+;; Inspired by PYPI wikidata_suggest
+(define* (search-uri name
+                     #:key
+                     (format 'json)
+                     (language 'en)
+                     (type 'item)
+                     (continue 0)
+                     (limit 10))
+  "Build the URI of a Wikidata `wbsearchentities' API request.
+
+NAME is the search text (a string); it is percent-encoded so that spaces and
+non-ASCII characters still give a valid URI.  FORMAT, LANGUAGE and TYPE are
+symbols.  CONTINUE and LIMIT are numbers.
+
+Return the value of `string->uri' applied to the assembled request string."
+  (let ((endpoint "https://www.wikidata.org/w/api.php")
+        ;; Parameter order is kept stable so that the resulting URIs are
+        ;; predictable.
+        (parameters
+         `(("search"   . ,(uri-encode name))
+           ("action"   . "wbsearchentities")
+           ("format"   . ,(symbol->string format))
+           ("language" . ,(symbol->string language))
+           ("type"     . ,(symbol->string type))
+           ("continue" . ,(number->string continue))
+           ("limit"    . ,(number->string limit)))))
+    (string->uri
+     (string-append endpoint "?"
+                    (string-join
+                     (map (lambda (parameter)
+                            (string-append (car parameter) "=" (cdr parameter)))
+                          parameters)
+                     "&")))))
+
+;; Inspired by
+;; https://opendata.stackexchange.com/questions/5248/how-to-get-the-name-of-a-wikidata-item
+;; TODO add handling of more than one qid.
+(define* (getentities-uri qid
+                          #:optional property
+                          #:key (language 'en)
+                          (format 'json))
+  "Build the URI of a Wikidata `wbgetentities' API request for entity QID.
+
+PROPERTY, when given, is a string or a symbol naming one of the props listed
+at https://www.wikidata.org/wiki/Special:ApiHelp/wbgetentities; it is sent
+as the `props' parameter.  LANGUAGE and FORMAT are symbols.
+
+Return the value of `string->uri' applied to the assembled request string."
+  (let* ((endpoint "https://www.wikidata.org/w/api.php")
+         ;; Accept both spellings of PROPERTY; anything else (including the
+         ;; empty string) means \"no props filter\".
+         (props (cond ((symbol? property) (symbol->string property))
+                      ((and (string? property)
+                            (not (string-null? property)))
+                       property)
+                      (else #f)))
+         ;; wbgetentities filters on `languages' (plural); a `language'
+         ;; parameter is not recognized by that action.
+         (base (string-append endpoint
+                              "?ids=" qid
+                              "&action=wbgetentities"
+                              "&format=" (symbol->string format)
+                              "&languages=" (symbol->string language))))
+    (string->uri
+     (if props
+         (string-append base "&props=" props)
+         base))))
+
+;; Only one at a time.
+(define* (getclaims-uri qid
+                        #:key (format 'json))
+  "Return the URI of a Wikidata `wbgetclaims' API request for the single
+entity QID (a string).  FORMAT is a symbol naming the response format."
+  (string->uri
+   (string-append "https://www.wikidata.org/w/api.php"
+                  "?entity=" qid
+                  "&action=wbgetclaims"
+                  "&format=" (symbol->string format))))
+
+;; Inspired by http://r.duckduckgo.com/l/?kh=-1&uddg=http%3A%2F%2Fstackoverflow.com%2Fquestions%2F29886388%2Fddg%2335118127
+;;;
+;;; Wikidata-specific SPARQL-QUERY using a GET request (it also accept POST)
+;;; ---------------------------------------------------------------------------
+(define* (wdsparql-uri query
+                       #:key
+                       (uri #f)
+                       (type "json"))
+  "Build the URI of an HTTP GET request to a SPARQL endpoint.
+
+QUERY is the raw SPARQL text; it is percent-encoded and sent as the `query'
+parameter.  URI, when true, is the endpoint address as a string and
+replaces the default Wikidata endpoint.  TYPE is the result format name
+(a string), sent as the `format' parameter.
+
+Return the value of `string->uri' applied to the assembled request string."
+  (let ((endpoint (or uri "http://query.wikidata.org/sparql")))
+    (string->uri
+     ;; The SPARQL text must be the value of a `query' parameter; a bare
+     ;; encoded string after \"?\" carries no parameter name at all.
+     (string-append endpoint
+                    "?query=" (uri-encode query)
+                    "&format=" (uri-encode type)))))
+
+;; PREFIX wd: <http://www.wikidata.org/entity/>
+;; PREFIX wdt: <http://www.wikidata.org/prop/direct/>
+
+;; SELECT DISTINCT ?item
+;; WHERE {
+;; ?item wdt:P31/wdt:P279* wd:Q19723451
+;; }
+
+;; broken
+;; (display-query-results-of
+;; (wdquery-xml
+;; (wdsparql-uri
+;; "PREFIX wd: <http://www.wikidata.org/entity/>
+;; PREFIX wdt: <http://www.wikidata.org/prop/direct/> SELECT DISTINCT
+;; ?item WHERE \\{?item wdt:P31/wdt:P279* wd:Q19723451\\}"
+;; )))
+
+;; Broken attempt to use SEXP from (sparql lang).
+;; (let (
+;; (wd (prefix "http://www.wikidata.org/entity/"))
+;; (wdt (prefix "http://www.wikidata.org/prop/direct/")))
+;; (select #:destinct
+;; ;; columns
+;; ;;'(subject predicate object)
+;; `((,'item)))
+;; ;; pattern
+;; ;; `((subject predicate object)
+;; ;; (subject ,(rdf "type") ,(internal "Sample")
+;; `(((,'where ,'item) (,wdt ":P31/" ,wdt ":P279*") (,wd ":Q19723451")))
+;; )))
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Medium-level proc.
+;; Extract data from queries
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define* (get-label qid #:key (language 'en))
+  "Fetch entity QID from Wikidata and return its label in LANGUAGE (a
+symbol, `en' by default) as a string, or #f when the response holds no such
+label.
+
+The label is looked up at entities -> QID -> labels -> LANGUAGE -> value in
+the response; this assumes `wdquery-alist' yields nested association lists
+with string keys."
+  ;; The props name is passed as a symbol because `getentities-uri' honours
+  ;; symbols.
+  (let descend ((node (wdquery-alist
+                       (getentities-uri qid 'labels #:language language)))
+                (path (list "entities" qid "labels"
+                            (symbol->string language) "value")))
+    (cond ((null? path) node)
+          ;; Only descend into non-empty lists; anything else means the
+          ;; expected structure is absent.
+          ((pair? node) (descend (assoc-ref node (car path)) (cdr path)))
+          (else #f))))
+
+(define (get-properties qid)
+  "Fetch the claims of entity QID and return the list of keys found under
+\"claims\" in the response (the P-property identifiers).  Return the empty
+list when the response has no \"claims\" entry."
+  (let* ((response (wdquery-alist (getclaims-uri qid)))
+         ;; Look the key up by name: the position of \"claims\" in the
+         ;; response alist is not something to rely on.
+         (claims (and (pair? response)
+                      (assoc-ref response "claims"))))
+    (if (pair? claims)
+        (map car claims)
+        '())))
+
+;;test
+;;(display (map getlabel (get-properties "Q180736")))
+
+;; TODO factorize nested alist check
+(define* (extract-element alist element
+                          #:optional (x 30))
+  "Return the value stored under ELEMENT in the flat association list ALIST.
+
+ELEMENT must be one of the strings \"label\", \"description\" or \"id\".
+A string value longer than X characters (30 by default) is cut to X
+characters and suffixed with \"...\".  When the value is not a string, for
+instance because ELEMENT is absent, a placeholder string is returned.
+
+Raise an error when ELEMENT is not an accepted key, when ALIST is not a
+proper list, or when the stored value is the empty list."
+  (cond
+   ((not (member element '("label" "description" "id")))
+    (error "extract-element: accepts only the strings: label, description or id"))
+   ((not (list? alist))
+    ;; The offending datum travels with the error instead of being printed
+    ;; by code that could never run after `error'.
+    (error "extract-element: Not a proper list:" alist))
+   (else
+    (let ((value (assoc-ref alist element)))
+      (cond
+       ((string? value)
+        (if (> (string-length value) x)
+            (string-append (substring value 0 x) "...")
+            value))
+       ((null? value)
+        (error "extract-element: No" element "found:" alist))
+       (else
+        (string-append "(No " element " in the database)")))))))
+
+(define (extract-all alist)
+  "Return a fresh alist holding the \"label\", \"description\" and \"id\"
+entries of the flat association list ALIST, each obtained with
+`extract-element'.  Raise an error when ALIST is empty or is not a proper
+list."
+  (cond
+   ((not (list? alist))
+    ;; Pass the datum to `error'; nothing placed after `error' would run.
+    (error "extract-all: Not a proper list:" alist))
+   ((null? alist)
+    (error "extract-all: Nothing found."))
+   (else
+    `(("label" . ,(extract-element alist "label"))
+      ("description" . ,(extract-element alist "description"))
+      ("id" . ,(extract-element alist "id"))))))
+
+(define (pretty-print result)
+  "Print the flat alist RESULT on one line: its id, a colon and a tab, then
+\"LABEL: DESCRIPTION\" cut to 50 characters (plus \"...\") when longer."
+  (let* ((max-width 50)
+         (label (extract-element result "label"))
+         (description (extract-element result "description"))
+         (summary (string-append label ": " description))
+         (shown (if (> (string-length summary) max-width)
+                    (string-append (substring summary 0 max-width) "...")
+                    summary)))
+    (format #t "~a:\t~a~%" (extract-element result "id") shown)))
+
+(define* (extract-search name
+                         #:key (language 'en))
+  "Search Wikidata for NAME and return a list with one alist of label,
+description and id per hit.  LANGUAGE is a symbol (`en' by default) and is
+forwarded to `search-uri'.  Return the empty list when the response
+contains no \"search\" entry."
+  (let* ((response (wdquery-alist
+                    (search-uri name #:language language)))
+         ;; Guard against a non-alist response before looking anything up.
+         (hits (and (pair? response)
+                    (assoc-ref response "search"))))
+    (if (pair? hits)
+        (map extract-all hits)
+        '())))
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; High-level
+;; Get the results fast
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define* (search query x
+                 #:key language)
+  "Search Wikidata for QUERY and print at most the first X results, one per
+line, in a truncated form.  LANGUAGE, when given, is a symbol naming the
+search language; `en' is used otherwise.  When fewer than X results come
+back, all of them are printed."
+  (let* ((results (extract-search query
+                                  ;; Forward the caller's choice; always
+                                  ;; hand a symbol down, never #f.
+                                  #:language (or language 'en)))
+         ;; SRFI-1 `take' signals an error when asked for more elements than
+         ;; the list holds, so never ask for more than there are.
+         (shown (min x (length results))))
+    (format #t "First ~a:\tLabel & Description~%" x)
+    ;; Printing is a side effect only: `for-each' instead of `map'.
+    (for-each pretty-print (take results shown))))
+
+;; For example:
+;;(search "guix" 10 #:language 'fr)
+
diff --git a/wikidata/sparql.scm b/wikidata/sparql.scm
new file mode 100644
index 0000000..5b71152
--- /dev/null
+++ b/wikidata/sparql.scm
@@ -0,0 +1,111 @@
+;;; Copyright © 2018 swedebugia <swedebugia@riseup.net>
+;;;
+;;; This file is part of guile-wikidata.
+;;;
+;;; guile-wikidata is free software; you can redistribute it and/or modify it
+;;; under the terms of the GNU General Public License as published by
+;;; the Free Software Foundation; either version 3 of the License, or (at
+;;; your option) any later version.
+;;;
+;;; guile-wikidata is distributed in the hope that it will be useful, but
+;;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;;; GNU General Public License for more details.
+;;;
+;;; You should have received a copy of the GNU General Public License
+;;; along with guile-wikidata. If not, see <http://www.gnu.org/licenses/>.
+
+;; See the example-sparql.scm for how to use this library.
+
+(define-module (wikidata sparql)
+ #:use-module (ice-9 format)
+ #:use-module (ice-9 rdelim)
+ #:use-module (ice-9 receive)
+ #:use-module (guix http-client)
+;; #:use-module (guix import utils) ; useful stuff there
+;; #:use-module (sparql driver) ; does not support blazegraph
+;; #:use-module (sparql lang) ; did not work ??
+ #:use-module (sparql util)
+ #:use-module (srfi srfi-1)
+ #:use-module (srfi srfi-34)
+ #:use-module (web uri)
+ #:export (show-sparql))
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Low-level proc.
+;; URI-decorators and fetching
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; TODO: factor the HTTP code out of Guix into a separate guile-http library
+;; so that this library does not pull in all of Guix.
+(define* (xml-fetch url
+                    ;; Many web sites answer 403 when no 'User-Agent'
+                    ;; header is sent, hence the default below.
+                    #:key (headers `((user-agent . "GNU Guile")
+                                     (Accept . "application/json"))))
+  "Call `http-fetch' on URL with the HTTP headers HEADERS and return what it
+yields, unmodified and unparsed; `display-query-results' reads it as an
+input port.  The port is not closed here.  Return #f when the request fails
+with HTTP status 403 or 404."
+  (guard (c ((and (http-get-error? c)
+                  (let ((status (http-get-error-code c)))
+                    (or (= 403 status)
+                        (= 404 status))))
+             #f))
+    ;; The value is bound to one variable on purpose, so that exactly one
+    ;; value is handed back to the caller.
+    (let ((fetched (http-fetch url #:headers headers)))
+      fetched)))
+
+(define (wdquery-xml uri)
+ "Fetch URI with `xml-fetch', using its default headers, and return the
+result unchanged: the value obtained from `http-fetch', or #f on HTTP 403
+or 404."
+ (xml-fetch uri))
+
+
+;; Inspired by http://r.duckduckgo.com/l/?kh=-1&uddg=http%3A%2F%2Fstackoverflow.com%2Fquestions%2F29886388%2Fddg%2335118127
+;;;
+;;; Wikidata-specific SPARQL-QUERY using a GET request (it also accept POST)
+;;; ---------------------------------------------------------------------------
+(define* (wdsparql-uri query
+                       #:key
+                       (uri #f)
+                       (type "json"))
+  "Build the URI of an HTTP GET request to a SPARQL endpoint.
+
+QUERY is the raw SPARQL text; it is percent-encoded and sent as the `query'
+parameter.  URI, when true, is the endpoint address as a string and
+replaces the default Wikidata endpoint.  TYPE is the result format name
+(a string), sent as the `format' parameter.
+
+Return the value of `string->uri' applied to the assembled request string."
+  (let ((endpoint (or uri "http://query.wikidata.org/sparql")))
+    (string->uri
+     ;; The SPARQL text must be the value of a `query' parameter; a bare
+     ;; encoded string after \"?\" carries no parameter name at all.
+     (string-append endpoint
+                    "?query=" (uri-encode query)
+                    "&format=" (uri-encode type)))))
+
+
+;;;
+;;; Medium-level proc.
+;;; ---------------------------------------------------------------------------
+
+;; TODO add rationale for why this is copied from (sparql util)
+(define (display-query-results port)
+  "Read PORT line by line until end of file, print the comma-separated
+fields of each line followed by a tab each, then close PORT.
+
+When PORT is not an input port (for instance #f), print nothing and
+return #f."
+  (and (input-port? port)
+       (begin
+         ;; Iterate with a tail call so that the port is closed exactly
+         ;; once, after the last line, however many lines there are.
+         (let loop ((line (read-line port)))
+           (unless (eof-object? line)
+             ;; Each line is split on commas, i.e. treated as one CSV row.
+             (format #t "~{~a~/~}~%" (string-split line #\,))
+             (loop (read-line port))))
+         (close-port port))))
+
+;;;
+;;; High-level proc.
+;;; ---------------------------------------------------------------------------
+
+;; See example-sparql.scm for how to enter the query
+(define (show-sparql query)
+  "Send the SPARQL text QUERY to the endpoint chosen by `wdsparql-uri' and
+print the response on the current output port via `display-query-results'."
+  (let* ((request (wdsparql-uri query))
+         (response (xml-fetch request)))
+    (display-query-results response)))