Geometry-Aware Hashing of GeoJSON objects
26 May 2024While writing a comparator for GeoJSON Feature Collections I encountered an interesting problem:
Whenever you want to compare two (or more) huge lists with each other, you quickly end up using hashes.
You can associate your objects to an hash, put them in a hash map, and lookup values (in this case, duplicates) in O(1)
time, resulting in far less computationally expensive operations.
In GeoJSON each FeatureCollection
(you see this a map, with added Points, Lines, and Areas) contains Features
, which contain Geometry
, which in the case of LineString
and Polygon
are a set of coordinates.
Hashing produces a (expected) unique value for one object.
But the underlying information that a Geometry
encodes is not a fixed-set of coordinates, but rather an area (Polygon
) or a line (Line String
).
A single area (or line) can be expressed in multiple sets of Coordinates, since the direction or order of the underlying vectors are not considered, but the area which they span in the end.
Think of this polygon [A, B, C, D, E, F, A]
It spans the same exact area as this Polygon [D, E, F, A, B, C, D]
In the case of polygons, you can shift your cyclical coordinates however you want.
In LineStrings
you see similiar behaviour . You can read them palindromically.
[A, B, C, D, E]
[E, D, C, B, A]
If your hashing function needs to provide a hash, unique to the shape of your Geometry, not to the particular set of coordinates, you need to be able to consistenly choose a starting point.
The actual part that gets hashed should stay the same, whether or not you enter [A, B, C, D, E, A]
or [C, D , E, A, B, C]
or any other mutations.
To do this, we have to consistenly choose a starting point. After thinking far too long about how I can sort coordinates reliability, I chose the easy way out:
package dev.altayakkus.geoDiff.utils | |
class Hashing { | |
companion object { | |
fun reorderCoordinates(coordinates: List<List<Double>>, lineString: Boolean = false): List<List<Double>> { | |
if (coordinates.isEmpty()) return coordinates | |
var coordinateSet = coordinates | |
// If we have a line string, we need to choose the canonical coordinate from the first and last coordinates | |
if (lineString) { | |
coordinateSet = listOf(coordinates.first(), coordinates.last()) | |
} | |
// Find the canonical coordinate | |
var minCoordinate = coordinateSet.first() | |
for (coordinate in coordinateSet) { | |
if (coordinate[0] < minCoordinate[0]) { | |
minCoordinate = coordinate | |
} else if (coordinate[0] == minCoordinate[0] && coordinate[1] < minCoordinate[1]) { | |
minCoordinate = coordinate | |
} | |
} | |
// Reorder the coordinates so the canonical coordinate is first | |
return if (lineString) { | |
// Reverse the list if the last coordinate is the canonical one | |
if (minCoordinate == coordinates.last()) { | |
coordinates.reversed() | |
} else { | |
coordinates | |
} | |
} else { | |
// Rotate the list for polygons | |
val index = coordinates.indexOf(minCoordinate) | |
val reordered = coordinates.subList(index, coordinates.size - 1) + coordinates.subList(0, index) | |
// Ensure the list is closed by appending the canonical coordinate at the end | |
if (reordered.last() != reordered.first()) { | |
reordered + listOf(reordered.first()) | |
} else { | |
reordered | |
} | |
} | |
} | |
} | |
} |
This function returns the same coordinate for all mutations.
Now we can override our hashCode()
function
package dev.altayakkus.geoDiff | |
import dev.altayakkus.geoDiff.enums.GeometryType | |
import dev.altayakkus.geoDiff.utils.Hashing | |
import org.locationtech.jts.geom.Coordinate | |
import org.locationtech.jts.geom.GeometryFactory | |
import org.locationtech.jts.geom.LinearRing | |
sealed class Geometry() { | |
abstract val type: GeometryType | |
data class Polygon(val coordinates: List<List<List<Double>>>) : Geometry() { | |
init { | |
for (ring in coordinates) { | |
if (ring.size < 4) { | |
throw IllegalArgumentException("A polygon must have at least 4 coordinates.") | |
} | |
if (ring.first() != ring.last()) { | |
throw IllegalArgumentException("The first and last coordinates must be the same.") | |
} | |
val uniqueCoords = ring.dropLast(1).toSet() | |
if (uniqueCoords.size != ring.size - 1) { | |
throw IllegalArgumentException("There should be no duplicate coordinates except the first and last.") | |
} | |
} | |
} | |
override fun equals(other: Any?): Boolean { | |
if (this === other) return true | |
if (other !is Geometry.Polygon) return false | |
// TODO: Support exterior ring items (RFC 7946 3.1.6). Now we throw them away. | |
return Hashing.reorderCoordinates(other.coordinates.flatten()) == Hashing.reorderCoordinates(coordinates.flatten()) | |
} | |
override fun hashCode(): Int { | |
var result = type.hashCode() | |
// Consistent, area-aware reordering | |
// TODO: Support exterior ring items (RFC 7946 3.1.6). Now we throw them away. | |
val reorderedCoords = Hashing.reorderCoordinates(coordinates.flatten()) | |
for (coordinate in reorderedCoords) { | |
val coordinateHash = coordinate.hashCode() | |
result = 31 * result + coordinateHash | |
} | |
return result | |
} | |
fun toJts(): org.locationtech.jts.geom.Geometry { | |
val geometryFactory = GeometryFactory() | |
val jtsCoordinates = coordinates.flatten().map { coord -> | |
Coordinate(coord[0], coord[1]) | |
}.toTypedArray() | |
val shell: LinearRing = geometryFactory.createLinearRing(jtsCoordinates) | |
return geometryFactory.createPolygon(shell, null) | |
} | |
override val type: GeometryType = GeometryType.Polygon | |
} | |
data class LineString(val coordinates: List<List<Double>>) : Geometry() { | |
init { | |
if (coordinates.size < 2) { | |
throw IllegalArgumentException("A line string must have at least 2 coordinates.") | |
} | |
val uniqueCoords = coordinates.toSet() | |
if (uniqueCoords.size != coordinates.size) { | |
throw IllegalArgumentException("There should be no duplicate coordinates.") | |
} | |
} | |
override fun equals(other: Any?): Boolean { | |
if (this === other) return true | |
if (other !is Geometry.LineString) return false | |
return Hashing.reorderCoordinates(other.coordinates, lineString = true) == Hashing.reorderCoordinates(coordinates, lineString = true) | |
} | |
override fun hashCode(): Int { | |
var result = type.hashCode() | |
// Consistent, area-aware reordering | |
val reorderedCoords = Hashing.reorderCoordinates(coordinates, lineString = true) | |
for (coordinate in reorderedCoords) { | |
val coordinateHash = coordinate.hashCode() | |
result = 31 * result + coordinateHash | |
} | |
return result | |
} | |
fun toJts(): org.locationtech.jts.geom.Geometry { | |
val geometryFactory = GeometryFactory() | |
val jtsCoordinates = coordinates.map { coord -> | |
Coordinate(coord[0], coord[1]) | |
}.toTypedArray() | |
return geometryFactory.createLineString(jtsCoordinates) | |
} | |
override val type: GeometryType = GeometryType.LineString | |
} | |
} |
Note: The equals
function override actually checks if the coordinates are the same, because our hashing can lead to collisions
package dev.altayakkus.geoDiff | |
fun main() { | |
val a = listOf(34.21, 2.78) | |
val b = listOf(3458.32, 2131.23) | |
val c = listOf(0.0, 23.11) | |
val d = listOf(0.3432, 3.0) | |
val polygonCoordinates1 = listOf( | |
listOf(a, b, c, d, a) | |
) | |
val polygonCoordinates2 = listOf( | |
listOf(c, d, a, b, c) | |
) | |
val polygonGeometry1 = Geometry.Polygon(polygonCoordinates1) | |
val polygonGeometry2 = Geometry.Polygon(polygonCoordinates2) | |
println(polygonGeometry1 == polygonGeometry2) | |
// outputs: true | |
val lineStringCoordinates1 = listOf( | |
a, b, c, d | |
) | |
val lineStringCoordinates2 = listOf( | |
d, c, b, a | |
) | |
val lineStringGeometry1 = Geometry.LineString(lineStringCoordinates1) | |
val lineStringGeometry2 = Geometry.LineString(lineStringCoordinates2) | |
println(lineStringGeometry1 == lineStringGeometry2) | |
// outputs: true | |
} |