Generating geolocation data using java and mysql

A few of my projects involved creating a strictly auto-completing textfield based on location data. The autocomplete is reasonably straight forward so I’m leaving that as a casual exercise for the reader. This article presents pointers on where to get geographical information, extract pertinent information, then store that in a mysql table for quick lookup. Having this data helped me answer a multitude of questions, to mention a few:

  • What’s the nearest city (of a certain size) to a given geocode?
  • Given a geocode, find the timezone. (accuracy depends on the amount of data you wish to store)
  • Given a substring, find all cities and/or states whose names contain the substring.
  • Find the number of cities in a 25 mile radius around a given geocode.

Let’s jump right in.  For my projects I’ll keep track of a few pieces of data using a mysql table:

CREATE TABLE `geonames` (
  `geonames_id` bigint(20) unsigned NOT NULL auto_increment,
  `city_name` varchar(200) NOT NULL default '',
  `state_code` varchar(20) NOT NULL default '',
  `country_code` varchar(4) NOT NULL default '',
  `latitude` float(8,5) NOT NULL,
  `longitude` float(8,5) NOT NULL,
  `timezone` varchar(200) NOT NULL default '',
  `population` int(10) NOT NULL default '0',
  PRIMARY KEY  (`geonames_id`),
  KEY `index1` USING BTREE (`latitude`),
  KEY `index2` USING BTREE (`longitude`),
  KEY `index3` USING BTREE (`latitude`,`longitude`)
)

As you can see, the table keeps track of the city, state, country, latitude, longitude, timezone, and population count of a given location. The table declares BTREE indexes for the latitude-longitude information.  This is fine enough for my purposes.

Having this table would be of not much use without the data.  Where is it, you ask?
HERE: http://download.geonames.org/export/dump/

For our purposes, I’m using US.zip, and CA.zip. The contents of the zip files contain much more data than we’re interested in. This is where CODING FUN begins.

The following class parses the unpacked US.txt, CA.txt files and transforms them to tab delimited files that then can be manually loaded into mysql.

This code only writes information if it encounters a place with population, and the population is above a certain number. Specifically, it will only pick up places with more than 5000 people.

package com.tripgather.util;

import java.io.*;
import java.util.HashMap;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * This class contains utility methods to parse geonames.org files and transform
 * them into something that is selfishly useful.
 *
 * Please familiarize yourself with GeoNames Feature Codes at:
 * http://www.geonames.org/export/codes.html
 *
 * Data can be downloaded from: http://download.geonames.org/export/dump/
 *
 * @author mdanter
 */
public class GeonamesTransformer {

    /**
     * The separator to use for splitting the input and outputting the
     * transformed file.
     */
    private static final String SEPARATOR_CHAR = "t";
    /**
     * Designates newline.
     */
    private static final String NEWLINE_CHAR = "n";
    /**
     * Initial size for BufferedWriter, large #s yield better performance.
     */
    private static final int BUFFER_SIZE = 99999;
    /**
     * Determines what size a populated are has to be in order to be output.
     */
    private static final int POPULATION_THRESHOLD = 5000;
    /**
     * The index of the column in the geonames file.
     */
    private static final int POPULATION_SIZE = 14;
    /**
     * The index of the column in the geonames file.
     */
    private static final int LOCATION_DESIGNATION = 6;
    /**
     * The index of the column in the geonames file.
     */
    private static final int LOCATION_ID = 0;
    /**
     * The index of the column in the geonames file.
     */
    private static final int LOCATION_CITY = 2;
    /**
     * The index of the column in the geonames file.
     */
    private static final int LOCATION_STATE = 10;
    /**
     * The index of the column in the geonames file.
     */
    private static final int LOCATION_LAT = 4;
    /**
     * The index of the column in the geonames file.
     */
    private static final int LOCATION_LNG = 5;
    /**
     * The index of the column in the geonames file.
     */
    private static final int LOCATION_TZINFO = 17;
    /**
     * Country code we want to appear in the output for USA.
     */
    private static final String COUNTRY_CODE_US = "US";
    /**
     * Country code we want to appear in the output for Canada.
     */
    private static final String COUNTRY_CODE_CA = "CA";
    /**
     * Used to correlate Canadian province codes with 2 letter acronym.
     */
    private static final HashMap< String, String > CANADIAN_PROVINCES = new HashMap< String, String >(13);

    static {
        CANADIAN_PROVINCES.put("01", "AB");
        CANADIAN_PROVINCES.put("02", "BC");
        CANADIAN_PROVINCES.put("03", "MB");
        CANADIAN_PROVINCES.put("04", "NB");
        CANADIAN_PROVINCES.put("05", "NL");
        CANADIAN_PROVINCES.put("07", "NS");
        CANADIAN_PROVINCES.put("13", "NT");
        CANADIAN_PROVINCES.put("14", "NU");
        CANADIAN_PROVINCES.put("08", "ON");
        CANADIAN_PROVINCES.put("09", "PE");
        CANADIAN_PROVINCES.put("10", "QC");
        CANADIAN_PROVINCES.put("11", "SK");
        CANADIAN_PROVINCES.put("12", "YT");
    }

    /**
     * @param args the command line arguments
     */
    public static void main(final String[] args) {

        final long start = System.currentTimeMillis();

        buildData("/Users/mdanter/Downloads/US/US.txt", "/tmp/dataUS.txt", POPULATION_THRESHOLD, COUNTRY_CODE_US);
        buildData("/Users/mdanter/Downloads/CA/CA.txt", "/tmp/dataCA.txt", POPULATION_THRESHOLD, COUNTRY_CODE_CA);

        final long end = System.currentTimeMillis();

        Logger.getLogger(GeonamesTransformer.class.getName()).log(Level.INFO, "ETL TOOK: " + (end - start) + "ms");

    }

    /**
     * Designed to read an unzipped geonames file and transform it, while
     * writing only places with population, and only places that have population
     * above a certain size.
     *
     * @param filePath the path to the input text file obtained from geonames by
     * extracting the zip file
     * @param outputPath the path to output the transformed text to
     * @param populationSize determines whether a place is added to output based
     * on its population's size
     * @param countryCode used to designate the country, also used for
     * designating the special case for CANADA
     */
    public static void buildData(final String filePath, 
            final String outputPath, 
            final int populationSize, final String countryCode) {

        FileWriter fileWriter = null;
        BufferedWriter bufferedWriter = null;
        InputStream fStream = null;
        BufferedReader bufferedReader = null;

        try {
            // Open the file
            fStream = new FileInputStream(filePath);

            // Get the object of DataInputStream
            bufferedReader = new BufferedReader(new InputStreamReader(fStream));

            fileWriter = new FileWriter(outputPath);
            bufferedWriter = new BufferedWriter(fileWriter, BUFFER_SIZE);
            
            if (COUNTRY_CODE_CA.equals(countryCode)) {

                transformFileForCA(bufferedReader, populationSize, bufferedWriter);

            } else {

                transformFile(bufferedReader, populationSize, countryCode, bufferedWriter);

            }

        } catch (Exception e) {//Catch exception if any
            Logger.getLogger(GeonamesTransformer.class.getName()).log(Level.SEVERE, e.getMessage());
        } finally {
            try {
                //Close the streams
                fStream.close();
                bufferedWriter.close();
                fileWriter.close();
                bufferedReader.close();
            } catch (Exception ex) {
                Logger.getLogger(GeonamesTransformer.class.getName()).log(Level.SEVERE, ex.getMessage());
            }
        }
    }

    /**
     * Designed to read an unzipped geonames file and transform it, while
     * writing only places with population, and only places that have population
     * above a certain size.
     *
     * @param bufferedReader obtained from geonames file
     * @param populationSize determines whether a place is added to output based
     * on its population's size
     * @param countryCode used to designate the country, also used for
     * designating the special case for CANADA
     * @param bufferedWriter outputs to file designated for transformed content
     * @throws IOException
     * @throws NumberFormatException
     */
    private static void transformFile(final BufferedReader bufferedReader,
            final int populationSize,
            final String countryCode,
            final BufferedWriter bufferedWriter)
            throws IOException, NumberFormatException {

        String strLine;
        StringBuffer transformedLine;
        //Read File Line By Line
        while ((strLine = bufferedReader.readLine()) != null) {

            // Print the content to file
            final String[] cols = strLine.split(SEPARATOR_CHAR);
            if ("P".equalsIgnoreCase(cols[LOCATION_DESIGNATION])) {//we're only interested in places

                final int population = Integer.parseInt(cols[POPULATION_SIZE]);

                if (population > populationSize) {

                    transformedLine = transformLine(cols, countryCode);

                    bufferedWriter.write(transformedLine.toString());
                }
            }
        }
    }

    /**
     * Designed to read an unzipped geonames file and transform it, while
     * writing only places with population, and only places that have population
     * above a certain size, special case for Canada
     *
     * @param bufferedReader obtained from the geonames file
     * @param populationSize determines whether a place is added to output based
     * on its population's size
     * @param bufferedWriter outputs to file designated for transformed content
     * @throws NumberFormatException
     * @throws IOException
     */
    private static void transformFileForCA(final BufferedReader bufferedReader,
            final int populationSize,
            final BufferedWriter bufferedWriter)
            throws NumberFormatException, IOException {

        String strLine;
        StringBuffer transformedLine;
        //Read File Line By Line
        while ((strLine = bufferedReader.readLine()) != null) {

            // Print the content to file
            final String[] cols = strLine.split(SEPARATOR_CHAR);
            if ("P".equalsIgnoreCase(cols[LOCATION_DESIGNATION])) {//we're only interested in places

                final int population = Integer.parseInt(cols[POPULATION_SIZE]);

                if (population > populationSize) {

                    transformedLine = transformLineForCA(cols);

                    bufferedWriter.write(transformedLine.toString());
                }
            }
        }
    }

    /**
     * Builds a line of output for Canada, this is a special case since we need
     * to correlate the numeric provinces with the 2 letter abbreviations for
     * familiarity.
     *
     * @param cols one line of the input geonames file split by t
     * @return output line of the transformed String wrapped in Stringbuffer
     */
    private static StringBuffer transformLineForCA(final String[] cols) {
        final StringBuffer transformedLine = new StringBuffer();

        transformedLine.append(cols[LOCATION_ID]).
                append(SEPARATOR_CHAR).
                append(cols[LOCATION_CITY]).
                append(SEPARATOR_CHAR).
                append(CANADIAN_PROVINCES.get(cols[LOCATION_STATE])).
                append(SEPARATOR_CHAR).
                append(COUNTRY_CODE_CA).
                append(SEPARATOR_CHAR).
                append(cols[LOCATION_LAT]).
                append(SEPARATOR_CHAR).
                append(cols[LOCATION_LNG]).
                append(SEPARATOR_CHAR).
                append(cols[LOCATION_TZINFO]).
                append(SEPARATOR_CHAR).
                append(cols[POPULATION_SIZE]).
                append(NEWLINE_CHAR);

        return transformedLine;
    }

    /**
     * Generically builds a line of output.
     *
     * @param cols one line of the input geonames file split by t
     * @param countryCode the country code that is desired in the output
     * @return output line of the transformed String wrapped in Stringbuffer
     */
    private static StringBuffer transformLine(final String[] cols,
            final String countryCode) {

        final StringBuffer transformedLine = new StringBuffer();

        transformedLine.append(cols[LOCATION_ID]).
                append(SEPARATOR_CHAR).
                append(cols[LOCATION_CITY]).
                append(SEPARATOR_CHAR).
                append(cols[LOCATION_STATE]).
                append(SEPARATOR_CHAR).
                append(countryCode).
                append(SEPARATOR_CHAR).
                append(cols[LOCATION_LAT]).
                append(SEPARATOR_CHAR).
                append(cols[LOCATION_LNG]).
                append(SEPARATOR_CHAR).
                append(cols[LOCATION_TZINFO]).
                append(SEPARATOR_CHAR).
                append(cols[POPULATION_SIZE]).
                append(NEWLINE_CHAR);

        return transformedLine;
    }
}

After running the class, there should be two files in /tmp, dataUS.txt, dataCa.txt. These are the files to load into the geonames table using the following SQL statement:

LOAD DATA INFILE '/tmp/dataCA.txt' INTO TABLE geonames;
LOAD DATA INFILE '/tmp/dataUS.txt' INTO TABLE geonames;

At this point, we are ready to grab some intel based on the newly available data.

Examples:

GET TIMEZONE INFO BY LAT/LNG:
Where the bound parameters are called ?lat and ?lng, I used this in a native query using JPA.

SELECT 
    timezone 
FROM ( 
    (SELECT 
         *,ABS(latitude-?lat) latd, ABS(longitude - ?lng) lngd  
     FROM 
        geonames  
     WHERE 
        latitude< =?lat AND 
        longitude<=?lng  
     ORDER BY latd ASC, lngd ASC  
     LIMIT 0,1) 
    UNION 
    (SELECT 
         *,ABS(latitude-?lat) latd, ABS(longitude - ?lng) lngd  
     FROM 
         geonames  
     WHERE 
         latitude>=?lat AND longitude>=?lng  
     ORDER BY latd ASC, lngd ASC  
     LIMIT 0,1)
    ) Tb1 
ORDER BY Tb1.lngd ASC 
LIMIT 0,1;

Would you like to find the nearest area with population above 5000 people? Add fields to the outer most SELECT statement.

GET CITIES BY SUBSTRING:

SELECT 
    CONCAT(city_name,', ',state_code) as location 
FROM 
    geonames 
WHERE 
    city_name like '%needle%' 
GROUP BY 
    city_name, state_code 
ORDER BY 
    population DESC;

If you followed this article, you should have a table with useful data, and have basic ideas about how to use it.

openstreetmap json geolocation using java

A viable free alternative to google’s geolocation api is openstreetmap’s nominatim functionality. The service recommends restricting the frequency of requests to 1 per second, so if nothing else, this is a viable failover alternative to google’s geolocation api.

The code below may be used to query openstreetmap’s nominatim database for latitude/longitude information. It uses the jackson library for json parsing.

If you are using maven, add:

<dependency>
    <groupId>org.codehaus.jackson</groupId>
    <artifactId>jackson-jaxrs</artifactId>
    <version>1.9.4</version>
</dependency>

to your pom.xml file’s dependencies to satisfy the import statements in the class below.

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLEncoder;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.codehaus.jackson.map.ObjectMapper;

public class OpenStreetMapGeoCodeJacksonParser {

    private static final String LATITUDE = "lat";
    private static final String LONGITUDE = "lon";

    public LatLng parse(final InputStream jsonStream) {
        LatLng coordinate = null;
        final ObjectMapper mapper = new ObjectMapper();
        try {
            final List<Object> dealData = mapper.readValue(jsonStream, List.class);
            if (dealData != null && dealData.size() == 1) {
                final Map< String, Object > locationMap = (Map< String, Object >) dealData.get(0);
                if (locationMap != null && locationMap.containsKey(LATITUDE) && locationMap.containsKey(LONGITUDE)) {
                    final double lat = Double.parseDouble(locationMap.get(LATITUDE).toString());
                    final double lng = Double.parseDouble(locationMap.get(LONGITUDE).toString());
                    coordinate = new LatLng(lat, lng);
                 }
             } else {
                 Logger.getLogger(OpenStreetMapGeoCodeJacksonParser.class.getName()).log(Level.SEVERE, "NO RESULTS", "NO RESULTS");
             }
         } catch (Exception ex) {
             Logger.getLogger(OpenStreetMapGeoCodeJacksonParser.class.getName()).log(Level.SEVERE, ex.getMessage(), ex);
         }
    return coordinate;
    }

    public LatLng parse(String rawAddress) {
        InputStream is = null;
        LatLng coords = null;

        if (rawAddress != null && rawAddress.length() > 0 ) {
            try {
                String address = URLEncoder.encode(rawAddress, "utf-8");
                String geocodeURL = "http://nominatim.openstreetmap.org/search?format=json&limit=1&polygon=0&addressdetails=0&email=contact@EMAIL.ME&countrycodes=us&q=";
                //query google geocode api
                String formattedUrl = geocodeURL + address;
                URL geocodeUrl = new URL(formattedUrl);
                is = geocodeUrl.openStream();
                coords = parse(is);
            } catch (IOException ex) {
                Logger.getLogger(OpenStreetMapGeoCodeJacksonParser.class.getName()).log(Level.SEVERE, null, ex);
            } finally {
                try {
                    is.close();
                } catch (IOException ex) {
                    Logger.getLogger(OpenStreetMapGeoCodeJacksonParser.class.getName()).log(Level.SEVERE, null, ex);
                }
            }
        }
        return coords;
    }

    public static void main(final String[] args) {
        final String rawAddress = "Los Angeles, CA";
        System.out.println(new OpenStreetMapGeoCodeJacksonParser().parse(rawAddress));
    }
}

Please make sure to substitute a valid email address into the geocodeURL’s email GET parameter.

Finally, the LatLng.java class holds the latitude-longitude information.

import java.io.Serializable;

/**
 *
 * @author mdanter
 */
public class LatLng implements Serializable{

    private static final long serialVersionUID = 16549987563L;

    private double lat;
    private double lng;

    public LatLng(final double lat, final double lng) {
        this.lat = lat;
        this.lng = lng;
    }

    public double getLat() {
        return lat;
    }

    public void setLat(final double lat) {
        this.lat = lat;
    }

    public double getLng() {
        return lng;
    }

    public void setLng(final double lng) {
        this.lng = lng;
    }

    @Override
    public boolean equals(Object obj) {
        if (obj == null) {
            return false;
        }
        if (getClass() != obj.getClass()) {
            return false;
        }
        final LatLng other = (LatLng) obj;
        if (Double.doubleToLongBits(this.lat) != Double.doubleToLongBits(other.lat)) {
            return false;
        }
        if (Double.doubleToLongBits(this.lng) != Double.doubleToLongBits(other.lng)) {
            return false;
        }
        return true;
    }

    @Override
    public int hashCode() {
        int hash = 3;
        hash = 53 * hash + (int) (Double.doubleToLongBits(this.lat) ^ (Double.doubleToLongBits(this.lat) >>> 32));
        hash = 53 * hash + (int) (Double.doubleToLongBits(this.lng) ^ (Double.doubleToLongBits(this.lng) >>> 32));
        return hash;
    }

}