All the best paths
When you go from A to B on a map, there is usually a best path to goI mean, that's what satnav is for, right?. But since there are many roads, there must be points where there are two paths that are optimal.
I have this silly idea that I want to find these watershed points and draw a funny little map showing the hills between them. In this post, we'll start looking at OSM data and play around a bit to find the watersheds.
This isn't a finished thing. I played around until I didn't want to any more. What follows is part of a Jupyter notebook in which I tried getting somewhereGetting somewhere! Do you get it?.
The first thing we need to do is get some OSM data. I used the excellent BBike OSM extractor and found me some Düsseldorf.
These are the libraries we're going to use:
from collections import Counter, defaultdict
import tracemalloc
import svgwrite
import pyproj
import numpy as np
First, we're going to read the map data, in its entirety. Note that this may take a little while. Depending on the data file, this might be between 2 and ∞ seconds.
Just out of interest, we're also going to tracemalloc it, just to see how much memory we're using.
filename = 'planet_6.74867,51.21731_6.79149,51.23367.osm.geojson'
bounds = (6.74867, 51.21731), (6.79149, 51.23367)
home = (bounds[0][0] + bounds[1][0]) / 2, (bounds[0][1] + bounds[1][1]) / 2
%%time
import json
tracemalloc.start()
with open(filename) as mapfile:
mapdata = json.load(mapfile)
print(tracemalloc.get_traced_memory())
tracemalloc.stop()
(69823463, 92864418)
CPU times: user 1.7 s, sys: 98.2 ms, total: 1.8 s
Wall time: 1.79 s
700MB of allocations from an 11MB geojson file. Nice!
I flailed around a little bit, trying to find out how to work with the data. I'm only showing a little bit here, just so you can see how professional I am.
features = mapdata["features"]
len(features)
29824
Counter(f["geometry"]["type"] for f in features)
Counter({'Point': 10186, 'LineString': 11594, 'MultiPolygon': 8044})
points = [f for f in features if f["geometry"]["type"] == "Point"]
len(points)
10186
line_strings = [f for f in features if f["geometry"]["type"] == "LineString"]
len(line_strings)
11594
multi_polygons = [f for f in features if f["geometry"]["type"] == "MultiPolygon"]
len(multi_polygons)
8044
Alright, we need to go a bit deeper into the properties:
props = set()
for f in line_strings:
props.update(set(f["properties"].keys()))
types = Counter()
highways = Counter()
junctions = Counter()
for f in line_strings:
types.update([f["properties"].get("type")])
highways.update([f["properties"].get("highway")])
junctions.update([f["properties"].get("junction")])
types, highways, junctions
# turns out, only highways are interesting
highways
Counter({'trunk': 32,
'secondary': 287,
None: 8421,
'footway': 1039,
'residential': 309,
'tertiary': 124,
'track': 19,
'path': 370,
'unclassified': 4,
'living_street': 10,
'cycleway': 14,
'steps': 233,
'service': 511,
'pedestrian': 125,
'primary': 11,
'secondary_link': 4,
'platform': 19,
'trunk_link': 48,
'elevator': 6,
'tertiary_link': 3,
'corridor': 5})
We found out that only highways are interesting. You know, we could have looked at the documentation, too!.
These are the roads we are going to work with from now on:
highways = [f for f in line_strings if f["properties"].get("highway") in (
"motorway", "motorway_link", "primary", "secondary", "trunk", "tertiary", "unclassified",
"residential", "living_street", "pedestrian",
"primary_link", "secondary_link", "tertiary_link",
"bridleway", "platform",
None,
)]
len(highways)
9349
Since we want to produce beautiful maps (or any maps, actually), we're going to need to be able to draw them. We do this by creating SVG files. These can be zoomed and printed and edited, as well as shown inline.
Here's the utility function we're going to use. Note again how professional this is.
def draw_map(roads, points, segments=None, home_point=None):
if segments is None:
segments = []
WIDTH, HEIGHT = 350, 140
dwg = svgwrite.Drawing('svgwrite-example.svg', size=(f"{WIDTH}mm", f"{HEIGHT}mm"))
transg = dwg.add(dwg.g())
g = transg.add(dwg.g())
if points:
bounds_points = (min(p[0][0] for p in points), min(p[0][1] for p in points)), (max(p[0][0] for p in points), max(p[0][1] for p in points))
else:
bounds_points = (100000000, 100000000), (-100000000, -100000000)
if roads:
bounds_lines = (
(
min(min(p[0] for p in g["geometry"]["coordinates"]) for g in roads),
min(min(p[1] for p in g["geometry"]["coordinates"]) for g in roads),
), (
max(max(p[0] for p in g["geometry"]["coordinates"]) for g in roads),
max(max(p[1] for p in g["geometry"]["coordinates"]) for g in roads),
)
)
else:
bounds_lines = (100000000, 100000000), (-100000000, -100000000)
bounds = (
(
min(bounds_points[0][0], bounds_lines[0][0]),
min(bounds_points[0][1], bounds_lines[0][1])
), (
max(bounds_points[1][0], bounds_lines[1][0]),
max(bounds_points[1][1], bounds_lines[1][1]),
)
)
max_y = bounds[1][1]
scale = min(bounds[1][0] - bounds[0][0], bounds[1][1] - bounds[0][1])
PT = scale / WIDTH
# g.translate(-(bounds[0][0] + scale/2), -bounds[0][1] - scale/4)
# scale /= 2
g.translate(-bounds[0][0], 0)
transg.scale((1.4 * WIDTH) / scale, (1.4 * WIDTH) / scale)
# transg.scale(0.01, 0.01)
innerg = g.add(dwg.g())
innerg.scale(1, -1)
# g = innerg
g.fill("none")
g.stroke(color=svgwrite.rgb(200, 200, 200), width=scale / WIDTH, linecap="round", linejoin="round")
g.add(dwg.rect([bounds[0][0], 0], (bounds[1][0] - bounds[0][0], bounds[1][1] - bounds[0][1]), fill='white'))
strokes = {
"motorway": {'color': svgwrite.rgb(100, 10, 16, '%'), 'width': 2 * PT},
"raceway": {'color': svgwrite.rgb(100, 10, 16, '%'), 'width': 1.5 * PT},
"primary": {'color': svgwrite.rgb(10, 10, 80, '%'), 'width': 1 * PT},
"secondary": {'color': svgwrite.rgb(30, 30, 30, '%'), 'width': 1 * PT},
"tertiary": {'color': svgwrite.rgb(80, 80, 80, '%'), 'width': 1 * PT},
}
for highway in roads:
# print(highway["geometry"]["coordinates"])
# print(strokes.get(highway["properties"]["highway"], {}))
coords = highway["geometry"]["coordinates"]
coords = [(c[0], max_y - c[1]) for c in coords]
line = g.add(dwg.polyline(coords))
if stroke := strokes.get(highway["properties"].get("highway")):
line.stroke(**stroke)
for p1, p2, color in segments:
p1 = p1[0], max_y - p1[1]
p2 = p2[0], max_y - p2[1]
s = g.add(dwg.line(p1, p2, stroke=color))
CIRC_R = 0.5 * PT
for (point, color) in points:
point = point[0], max_y - point[1]
color = color if isinstance(color, str) else svgwrite.rgb(*color, '%')
c = g.add(dwg.circle(point, r=CIRC_R, fill=color, stroke='none'))
if home_point:
print(home_point)
point = home_point[0], max_y - home_point[1]
c = g.add(dwg.circle(point, r=5*CIRC_R, fill='#00ffff', stroke='none'))
return dwg
Let's draw a map of our data for the first time:
%%time
points = [
# ((9.08915, 49.04266), (0, 100, 1))
]
highways = [f for f in line_strings if f["properties"].get("highway") in (
"motorway", "motorway_link", "primary", "secondary", "trunk", "tertiary", "unclassified",
"residential", "living_street", "pedestrian",
"primary_link", "secondary_link", "tertiary_link",
"bridleway", "platform",
)]
# bounds = (8.9178,48.9743), (9.2605,49.1111)
w, h = bounds[1][0] - bounds[0][0], bounds[1][1] - bounds[0][1]
for i in range(1, 100):
for j in range(1, 20):
p = (bounds[0][0] + (i / 105) * w , bounds[0][1] + (j / 20) * h), (i, j, 0)
# points.append(p)
draw_map(highways, points)
CPU times: user 65 ms, sys: 191 µs, total: 65.2 ms
Wall time: 64.1 ms
1: move to meter-based coordinates
At some point, we're going to want to measure distances. To do so, we need to convert from WGS84/EPSG:4326 to a more well-suited coordinate system. Apparently EPSG 3035 is the way to go here, since it's accurate for all of Europe.
Or, hmm, we could also try these ones: EPSG:3068, EPSG:5679, or EPSG:7837 or any of 100 others. I don't know what the differences are and I guess they are minimal. I hope they are, anyway.
pts = points[:2]
for pt in pts:
print(pt)
print(pyproj.transform("epsg:4326", "epsg:3035", *pt[0]))
([6.7606583, 51.2234957], (0, 0, 0))
(-513271.50645004585, 9054181.35310964)
([6.7653554, 51.2195793], (0, 0, 0))
(-513019.06060262443, 9053634.032830823)
p = (9.08915, 49.04266)
from pyproj import Proj, transform
p1 = Proj('epsg:3035', preserve_units=True)
p1x = p1(9, 48)
p2x = p1(9, 49)
distance_m = np.sqrt((p1x[0] - p2x[0])**2 + (p1x[1] - p2x[1])**2)
distance_m
111156.71848944735
from pyproj import Transformer
transformer = Transformer.from_crs("epsg:4326", "epsg:3035")
transformer.transform(9, 49)
(-409100.56561648333, 8755738.426043686)
I have no idea what these numbers mean. But ok!
def dist(p1, p2):
return np.sqrt((p1[0] - p2[0])**2 + (p1[1] - p2[1]) ** 2)
for hw in highways[:10]:
coords = [transformer.transform(*c) for c in hw["geometry"]["coordinates"]]
l = 0
for i in range(1, len(coords)):
l += dist(coords[i-1], coords[i])
print(l)
602.7340984907527
160.7871203419805
91.58520679722832
34.37434665463454
11.082176061452335
115.85408874142807
150.24235157447842
65.0171774765028
456.24031388355354
110.21961116674738
Those distances look... reasonable?
Here we go, transforming our coordinates
Careful: only run this once, or you'll kill all your coordinates! Why? Because I don't want to spend another 750MB of RAM on geo data again!
# careful: only run this cell once!
for hw in highways:
hw["geometry"]["coordinates"] = [transformer.transform(*c) for c in hw["geometry"]["coordinates"]]
home = transformer.transform(*home)
%%time
points = set()
for hw in highways:
for coord in hw["geometry"]["coordinates"]:
points.add((coord, (0, 0, 0)))
# points.add((home, (0, 100, 1)))
print(len(points))
draw_map(highways, [])
3458
CPU times: user 59.3 ms, sys: 3.83 ms, total: 63.1 ms
Wall time: 61.4 ms
You might notice that the map isn't pointing the same way as before. Yes, well, that's unfortunate, and apparently because these projections use different norths. We won't let that discourage us...
3: the Dijkstra algorithm
Before we can start actually computing distances, we need to find a suitable data structure for our graph.
I guess, the simplest one is this: a dictionary, where the keys are nodes and the entries are lists of (other, distance)
. We'll have to take care that our graph is bi-directional, since we want to make sure that we can go everywhere.
def dist(p1, p2):
return np.sqrt((p1[0] - p2[0])**2 + (p1[1] - p2[1]) ** 2)
graph = defaultdict(list)
for hw in highways:
coords = hw["geometry"]["coordinates"]
for i in range(1, len(coords)):
fr, to = coords[i-1], coords[i]
d = dist(fr, to)
graph[fr].append((to, d))
graph[to].append((fr, d))
len(graph)
3458
Ok, so dijkstra, in short: You start at a node, and you mark it as DONE. Then you visit all the nodes you can reach and mark them as active, with their current distance. Then you start visiting nodes: From the list of active nodes you select the one with the smallest distance. Mark that as DONE. Add all nodes you can reach (that are not DONE) as active with current_distance + distance. Repeat until all nodes are visited.
# meh, we'll have to find the closest point to our home so we can start somewhere!
for home_point in points:
break
min_dist = dist(home, home_point[0])
for p in graph:
d = dist(home, p)
if d < min_dist:
home_point, min_dist = p, d
home_point, min_dist, home
((-512198.0851256219, 9054105.365061712),
16.589845930492412,
(-512208.83020984754, 9054118.004925357))
print(graph[home_point])
[((-512192.63521925174, 9054100.065596774), 7.601697709388693), ((-512236.8403923488, 9054071.247561801), 51.63307563186496), ((-512185.5072865789, 9054116.24124847), 16.628092897033614)]
done = dict()
active = dict()
def find_minimum_active(active):
"""Find the minimum active node, remove it from the active set and return it."""
if not active:
return None
pt, min_dist = None, 1_000_000_000
for point, dist in active.items():
if dist < min_dist:
pt, min_dist = point, dist
del active[pt]
return pt, min_dist
done[home_point] = 0
for p, d in graph[home_point]:
active[p] = d
while curr_p := find_minimum_active(active):
# print("---")
curr, d = curr_p
# print(curr)
done[curr] = d
# print("-----")
for next_p, next_d in graph[curr]:
next_d = next_d + d
# print(next_p, next_d)
if next_p in done or (next_p in active and active[next_p] < next_d):
continue
active[next_p] = next_d
len(done)
# [(p, (100, 0, 0)) for p in done]
3202
Ooh, not all points on the map are reachable from our starting point. There's actually quite a lot of roads we can't reach, mainly the southern Rheinbrücke, since the highway on-ramps are not part of my map section.
%%time
svg = draw_map(highways, [(p, (100, 0, 0)) for p in done])
with open("map.svg", "w") as f:
svg.write(f)
svg
CPU times: user 1.11 s, sys: 10.6 ms, total: 1.12 s
Wall time: 1.13 s
All right! We've visited all the reachable points. Now to color them!
4: determine colors based on max_distance/colormap
import colorcet
len(colorcet.fire)
256
We're going to use ColorCET, which is a better color map than traditional flame colors.
max_dist = max(done.values())
slot_dist = 255 / max_dist
def slot(dist):
return int(dist * slot_dist)
swatch = colorcet.colorwheel
points = [(p, swatch[255 - slot(dist)]) for p, dist in done.items()]
svg = draw_map([], points)
with open("map.svg", "w") as f:
svg.write(f)
svg
That's beautiful. But we have not found our watersheds yet. For that, we'll need to actually draw all the paths. Any segment that is not part of a path is then a watershed. And so we need to adapt our dijkstra a little bit.
5: dijkstra with full paths
done = dict()
segments = []
active = dict()
sheds = []
def find_minimum_active(active):
"""Find the minimum active node, remove it from the active set and return it."""
if not active:
return None
pt, min_dist, min_p = None, 1_000_000_000, None
for point, (dist, prev_p) in active.items():
if dist < min_dist:
pt, min_dist, min_p = point, dist, prev_p
del active[pt]
return pt, min_dist, min_p
def middle(p1, p2):
return (p1[0] + p2[0]) / 2, (p1[1] + p2[1]) / 2
done[home_point] = 0
for p, d in graph[home_point]:
active[p] = (d, home_point)
while curr_p := find_minimum_active(active):
# print("---")
curr, d, prev_p = curr_p
# print(curr)
done[curr] = d
segments.append((prev_p, curr, d))
# print("-----")
for next_p, next_d in graph[curr]:
next_d = next_d + d
# print(next_p, next_d)
if next_p in done:
continue
if next_p in active and active[next_p][0] < next_d:
sheds.append(middle(curr, next_p))
continue
active[next_p] = (next_d, curr)
len(done), len(sheds), len(segments)
(3202, 154, 3201)
%%time
swatch = colorcet.rainbow
# points = [(p, swatch[255 - slot(dist)]) for p, dist in done.items()]
points = [(p, '#ff0000') for p in sheds] + [(home_point, '#00ffff')]
segs = [(p1, p2, swatch[255 - slot(d)]) for p1, p2, d in segments]
svg = draw_map([], points, segs, home_point)
with open("map.svg", "w") as f:
svg.write(f)
svg
(-512198.0851256219, 9054105.365061712)
CPU times: user 549 ms, sys: 390 µs, total: 549 ms
Wall time: 544 ms
And there we have it. All paths you can take in Düsseldorf, with watersheds marked in red. There are many things missing, but I stopped when I reached a nice picture.
I must have made a mistake somewhere because there should not be any closed circles. And also North should be up. And the map should be connected. And my paths should be little arrows to show flow direction. And directional roads should be directional. And footpaths should be different from highways. And the watersheds should be connected and probably hill-shaded, too. And I should show distance markers. And all of this should be much faster. And I'd like this in large-scale printable format. With rivers, and buildings, probably. You know, like in a real map.
So, lots of stuff to do next time.