Ingestão dump_wikidata no PostgreSQL - datasets-br/city-codes GitHub Wiki

Exemplos:

-- lista arquivos JSON das cidades do Estado de São Paulo:
select pg_ls_dir('/opt/gits/city-codes-BR/data/dump_wikidata/SP'); 
 -- recupera uma claim específica:
SELECT (x->'claims'->'P402'->0->>'value')::bigint AS osm_id
FROM (
  select  pg_read_file('/opt/gits/city-codes-BR/data/dump_wikidata/SP/Itabera.json',true)::jsonb
) t(x);

Função de varredura

CREATE or replace FUNCTION ingest.wikidatafolder_input_files(
  p_fpath text DEFAULT '/opt/gits/city-codes-BR/data/dump_wikidata'
) RETURNS TABLE (
  cityname text, fname text, fmeta jsonb, jcontent jsonb
) AS $f$
  WITH t0 AS ( SELECT rtrim(p_fpath,'/') AS fpath )
  , t1 AS (
    SELECT f as statename,
           t0.fpath ||'/'|| f as f
    FROM pg_ls_dir((SELECT fpath FROM t0)) t(f), t0
  ), t3 AS ( 
    SELECT t1.statename, t1.f as fpath, t2.f2 as fname
    FROM t1, LATERAL pg_ls_dir(t1.f) t2(f2)
    WHERE f2 ~ '\.json$'
) 
  SELECT statename, fname,
         to_jsonb( pg_stat_file(f) ) || jsonb_build_object('fpath',fpath) fmeta,
         pg_read_file(f,true)::jsonb as jcontent
  FROM  ( SELECT *, fpath ||'/'|| fname as f FROM t3 ) t4
$f$ language SQL immutable;

CREATE TABLE cache_wikidata_city AS 
  SELECT cityname, fname, jcontent->>'id' AS wikidata_id, 
      (jcontent->'claims'->'P402'->0->>'value')::bigint AS osm_id, 
      (jcontent->'claims'->'P1585'->0->>'value')::bigint AS local_id
  FROM ingest.wikidatafolder_input_files()
;
-- confirmar no OSM com relation https://www.openstreetmap.org/relation/{osm_id}
-- e conferindo tags 'IBGE:GEOCODIGO' (mesmo que local_id), 'admin_level' (8 para municípios),
--   'boundary' (administrative), 'type'  (boundary) e wikidata	(mesmo código com "Q").

-- CORRECAO DE BRASILIA (nao usar o código da UF (53) mas do município
update cache_wikidata_city set local_id=5300108 where wikidata_id='Q2844' and osm_id='2758138';

Regatando do modelo antigo

Na antiga base dl03t_main:

COPY (
  SELECT
    COALESCE(w.osm_id::bigint,-76*10000000 -ibge_id::bigint) AS osm_id, 
    76         AS jurisd_base_id, 
    ibge_id    AS jurisd_local_id,
    c.name, 
    c.state    AS parent_abbrev,
    c.abbrev3  AS abbrev,
    c.wikidata_id, c.lexlabel, c.isolabel_ext, c.ddd, c.info
  FROM ingest.city c 
       LEFT JOIN cache_wikidata_city w ON 'Q'||c.wikidata_id=w.wikidata_id AND w.local_id::int=ibge_id
  ORDER BY 2,3
) TO '/tmp/jurisdiction.csv' CSV HEADER
;