Implement downloadable timezone database
[geeqie.git] / scripts / zonedetect / builder.cpp
1 /*
2  * Copyright (c) 2018, Bertold Van den Bergh (vandenbergh@bertold.org)
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *     * Redistributions of source code must retain the above copyright
8  *       notice, this list of conditions and the following disclaimer.
9  *     * Redistributions in binary form must reproduce the above copyright
10  *       notice, this list of conditions and the following disclaimer in the
11  *       documentation and/or other materials provided with the distribution.
12  *     * Neither the name of the author nor the
13  *       names of its contributors may be used to endorse or promote products
14  *       derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR DISTRIBUTOR BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27
28 #include <shapefil.h>
29 #include <iostream>
30 #include <limits>
31 #include <fstream>
32 #include <vector>
33 #include <algorithm>
34 #include <unordered_map>
35 #include <functional>
36 #include <math.h>
37 #include <tuple>
38
39 const double Inf = std::numeric_limits<float>::infinity();
40
41 std::unordered_map<std::string, std::string> alpha2ToName;
42 std::unordered_map<std::string, std::string> tzidToAlpha2;
43
44 void errorFatal(std::string what)
45 {
46     std::cerr<<what<<"\n";
47     exit(1);
48 }
49
50 int encodeVariableLength(std::vector<uint8_t>& output, int64_t valueIn, bool handleNeg = true)
51 {
52     uint64_t value = valueIn * 2;
53     if(valueIn < 0) {
54         value = -valueIn * 2 + 1;
55     }
56
57     if(!handleNeg) {
58         value = valueIn;
59     }
60
61     int bytesUsed = 0;
62     do {
63         uint8_t byteOut = value & 0x7F;
64         if(value >= 128) {
65             byteOut |= 0x80;
66         }
67         output.push_back(byteOut);
68         bytesUsed ++;
69         value >>= 7;
70     } while(value);
71
72     return bytesUsed;
73 }
74
75 int64_t doubleToFixedPoint(double input, double scale, unsigned int precision = 32)
76 {
77     double inputScaled = input / scale;
78     return inputScaled * pow(2, precision-1);
79
80 }
81
82 struct Point {
83     Point(double lat = 0, double lon = 0)
84     {
85         lat_ = lat;
86         lon_ = lon;
87     }
88
89     Point operator-(const Point& p)
90     {
91         Point result(lat_ - p.lat_, lon_ - p.lon_);
92         return result;
93     }
94
95     std::tuple<int64_t, int64_t> toFixedPoint(unsigned int precision = 32)
96     {
97         int64_t latFixedPoint = doubleToFixedPoint(lat_, 90, precision);
98         int64_t lonFixedPoint = doubleToFixedPoint(lon_, 180, precision);
99
100         return std::make_tuple(latFixedPoint, lonFixedPoint);
101     }
102
103     int encodePointBinary(std::vector<uint8_t>& output, unsigned int precision = 32)
104     {
105         int64_t latFixedPoint, lonFixedPoint;
106         std::tie(latFixedPoint, lonFixedPoint) = toFixedPoint(precision);
107
108         int bytesUsed = encodeVariableLength(output, latFixedPoint);
109         bytesUsed += encodeVariableLength(output, lonFixedPoint);
110
111         return bytesUsed;
112     }
113
114     double lat_;
115     double lon_;
116 };
117
118 struct PolygonData {
119     Point boundingMin;
120     Point boundingMax;
121     std::vector<Point> points_;
122     unsigned long fileIndex_ = 0;
123     unsigned long metadataId_;
124
125     void processPoint(const Point& p)
126     {
127         if(p.lat_ < boundingMin.lat_) {
128             boundingMin.lat_ = p.lat_;
129         }
130         if(p.lon_ < boundingMin.lon_) {
131             boundingMin.lon_ = p.lon_;
132         }
133         if(p.lat_ > boundingMax.lat_) {
134             boundingMax.lat_ = p.lat_;
135         }
136         if(p.lon_ > boundingMax.lon_) {
137             boundingMax.lon_ = p.lon_;
138         }
139
140         points_.push_back(p);
141     }
142
143     PolygonData(unsigned long id):
144         boundingMin(Inf, Inf),
145         boundingMax(-Inf, -Inf),
146         metadataId_(id)
147     {
148     }
149
150     long encodeBinaryData(std::vector<uint8_t>& output, unsigned int precision = 20)
151     {
152         long bytesEncoded = 0;
153         bool first = true;
154         int64_t latFixedPoint = 0, lonFixedPoint = 0;
155         int64_t latFixedPointPrev, lonFixedPointPrev;
156         uint64_t vertices = 0;
157
158         std::vector<uint8_t> tmp;
159
160         int64_t diffLatAcc = 0, diffLonAcc = 0, diffLatPrev = 0, diffLonPrev = 0;
161
162         for(Point& point: points_) {
163             /* The points should first be rounded, and then the integer value is differentiated */
164             latFixedPointPrev = latFixedPoint;
165             lonFixedPointPrev = lonFixedPoint;
166             std::tie(latFixedPoint, lonFixedPoint) = point.toFixedPoint(precision);
167
168             int64_t diffLat = latFixedPoint - latFixedPointPrev;
169             int64_t diffLon = lonFixedPoint - lonFixedPointPrev;
170
171             if(first) {
172                 /* First point is always encoded */
173                 vertices++;
174                 encodeVariableLength(tmp, latFixedPoint);
175                 encodeVariableLength(tmp, lonFixedPoint);
176                 first = false;
177             } else {
178                 /* Ignore points that are not different */
179                 if(!diffLon && !diffLat) {
180                     continue;
181                 }
182
183                 if(diffLat != diffLatPrev || diffLon != diffLonPrev) {
184                     /* Encode accumulator */
185                     vertices++;
186                     encodeVariableLength(tmp, diffLatAcc);
187                     encodeVariableLength(tmp, diffLonAcc);
188
189                     diffLatAcc = 0;
190                     diffLonAcc = 0;
191                 }
192
193                 diffLatAcc += diffLat;
194                 diffLonAcc += diffLon;
195             }
196
197             diffLatPrev = diffLat;
198             diffLonPrev = diffLon;
199         }
200
201         /* Encode final point */
202         vertices++;
203         encodeVariableLength(tmp, diffLatAcc);
204         encodeVariableLength(tmp, diffLonAcc);
205
206         encodeVariableLength(output, vertices, false);
207         std::copy(tmp.begin(), tmp.end(), std::back_inserter(output));
208
209         return bytesEncoded;
210     }
211 };
212
213 void encodeStringToBinary(std::vector<uint8_t>& output, std::string& input)
214 {
215     encodeVariableLength(output, input.size(), false);
216     for(unsigned int i=0; i<input.size(); i++) {
217         output.push_back(input[i] ^ 0x80);
218     }
219 }
220
221
222 std::unordered_map<std::string, uint64_t> usedStrings_;
223
224 struct MetaData {
225     void encodeBinaryData(std::vector<uint8_t>& output)
226     {
227         for(std::string& str: data_) {
228             if(str.length() >= 256) {
229                 std::cout << "Metadata string is too long\n";
230                 exit(1);
231             }
232
233             if(!usedStrings_.count(str)) {
234                 usedStrings_[str] = output.size();
235                 encodeStringToBinary(output, str);
236             } else {
237                 encodeVariableLength(output, usedStrings_[str] + 256, false);
238             }
239         }
240     }
241
242     std::vector<std::string> data_;
243
244     unsigned long fileIndex_;
245 };
246
247
248 std::vector<PolygonData*> polygons_;
249 std::vector<MetaData> metadata_;
250 std::vector<std::string> fieldNames_;
251
252
253 unsigned int decodeVariableLength(uint8_t* buffer, int64_t* result, bool handleNeg = true)
254 {
255     int64_t value = 0;
256     unsigned int i=0, shift = 0;
257
258     do {
259         value |= (buffer[i] & 0x7F) << shift;
260         shift += 7;
261     } while(buffer[i++] & 0x80);
262
263     if(!handleNeg) {
264         *result = value;
265     } else {
266         *result = (value & 1)?-(value/2):(value/2);
267     }
268     return i;
269 }
270
271 void readMetaDataTimezone(DBFHandle dataHandle)
272 {
273     /* Specify field names */
274     fieldNames_.push_back("TimezoneIdPrefix");
275     fieldNames_.push_back("TimezoneId");
276     fieldNames_.push_back("CountryAlpha2");
277     fieldNames_.push_back("CountryName");
278
279     /* Parse attribute names */
280     for(int i = 0; i < DBFGetRecordCount(dataHandle); i++) {
281         metadata_[i].data_.resize(4);
282         for(int j = 0; j < DBFGetFieldCount(dataHandle); j++) {
283             char fieldTitle[12];
284             int fieldWidth, fieldDecimals;
285             DBFFieldType eType = DBFGetFieldInfo(dataHandle, j, fieldTitle, &fieldWidth, &fieldDecimals);
286
287             fieldTitle[11] = 0;
288             std::string fieldTitleStr(fieldTitle);
289
290             if( eType == FTString ) {
291                 if(fieldTitleStr == "tzid") {
292                     std::string data = DBFReadStringAttribute(dataHandle, i, j);
293                     size_t pos = data.find('/');
294                     if (pos == std::string::npos) {
295                         metadata_[i].data_.at(0) = data;
296                     } else {
297                         metadata_[i].data_.at(0) = data.substr(0, pos) + "/";
298                         metadata_[i].data_.at(1) = data.substr(pos + 1, std::string::npos);
299                     }
300                     if(tzidToAlpha2.count(data)) {
301                         metadata_[i].data_.at(2) = tzidToAlpha2[data];
302                         if(alpha2ToName.count(metadata_[i].data_.at(2))) {
303                             metadata_[i].data_.at(3) = alpha2ToName[metadata_[i].data_.at(2)];
304                         } else {
305                             std::cout<<metadata_[i].data_.at(2)<< " not found in alpha2ToName! ("<<data<<")\n";
306                         }
307                     } else {
308                         std::cout<<data<<" not found in zoneToAlpha2!\n";
309                     }
310                 }
311             }
312         }
313     }
314 }
315
316 void readMetaDataNaturalEarthCountry(DBFHandle dataHandle)
317 {
318     /* Specify field names */
319     fieldNames_.push_back("Alpha2");
320     fieldNames_.push_back("Alpha3");
321     fieldNames_.push_back("Name");
322
323     /* Parse attribute names */
324     for(int i = 0; i < DBFGetRecordCount(dataHandle); i++) {
325         metadata_[i].data_.resize(3);
326         for(int j = 0; j < DBFGetFieldCount(dataHandle); j++) {
327             char fieldTitle[12];
328             int fieldWidth, fieldDecimals;
329             DBFFieldType eType = DBFGetFieldInfo(dataHandle, j, fieldTitle, &fieldWidth, &fieldDecimals);
330
331             fieldTitle[11] = 0;
332             std::string fieldTitleStr(fieldTitle);
333
334             if( eType == FTString ) {
335                 if(fieldTitleStr == "ISO_A2" || fieldTitleStr == "WB_A2") {
336                     std::string tmp = DBFReadStringAttribute(dataHandle, i, j);
337                     if(tmp != "-99") {
338                         metadata_[i].data_.at(0) = tmp;
339                     }
340                 } else if(fieldTitleStr == "ISO_A3" || fieldTitleStr == "WB_A3" || fieldTitleStr == "BRK_A3") {
341                     std::string tmp = DBFReadStringAttribute(dataHandle, i, j);
342                     if(tmp != "-99") {
343                         metadata_[i].data_.at(1) = tmp;
344                     }
345                 } else if(fieldTitleStr == "NAME_LONG") {
346                     metadata_[i].data_.at(2) = DBFReadStringAttribute(dataHandle, i, j);
347                 }
348             }
349
350         }
351     }
352 }
353
354 std::unordered_map<std::string, std::string> parseAlpha2ToName(DBFHandle dataHandle)
355 {
356     std::unordered_map<std::string, std::string> result;
357
358     for(int i = 0; i < DBFGetRecordCount(dataHandle); i++) {
359         std::string alpha2, name;
360         for(int j = 0; j < DBFGetFieldCount(dataHandle); j++) {
361             char fieldTitle[12];
362             int fieldWidth, fieldDecimals;
363             DBFFieldType eType = DBFGetFieldInfo(dataHandle, j, fieldTitle, &fieldWidth, &fieldDecimals);
364
365             fieldTitle[11] = 0;
366             std::string fieldTitleStr(fieldTitle);
367
368             if( eType == FTString ) {
369                 if(fieldTitleStr == "ISO_A2" || fieldTitleStr == "WB_A2") {
370                     std::string tmp = DBFReadStringAttribute(dataHandle, i, j);
371                     if(tmp != "-99" && alpha2 == "") {
372                         alpha2 = tmp;
373                     }
374                 } else if(fieldTitleStr == "NAME_LONG") {
375                     name = DBFReadStringAttribute(dataHandle, i, j);
376                 }
377             }
378         }
379         if(alpha2 != "") {
380             result[alpha2]=name;
381         }
382     }
383
384     result["GF"]="French Guiana";
385     result["GP"]="Guadeloupe";
386     result["BQ"]="Bonaire";
387     result["MQ"]="Martinique";
388     result["SJ"]="Svalbard and Jan Mayen Islands";
389     result["NO"]="Norway";
390     result["CX"]="Christmas Island";
391     result["CC"]="Cocos Islands";
392     result["YT"]="Mayotte";
393     result["RE"]="RĂ©union";
394     result["TK"]="Tokelau";
395
396     return result;
397 }
398
399 std::unordered_map<std::string, std::string> parseTimezoneToAlpha2(std::string path)
400 {
401     std::unordered_map<std::string, std::string> result;
402     //TODO: Clean solution...
403 #include "zoneToAlpha.h"
404
405     return result;
406 }
407
408 int main(int argc, char ** argv )
409 {
410     if(argc != 6) {
411         std::cout << "Wrong number of parameters\n";
412         return 1;
413     }
414
415     tzidToAlpha2 = parseTimezoneToAlpha2("TODO");
416
417     char tableType = argv[1][0];
418     std::string path = argv[2];
419     std::string outPath = argv[3];
420     unsigned int precision = strtol(argv[4], NULL, 10);
421     std::string notice = argv[5];
422
423     DBFHandle dataHandle = DBFOpen("naturalearth/ne_10m_admin_0_countries_lakes", "rb" );
424     alpha2ToName = parseAlpha2ToName(dataHandle);
425     DBFClose(dataHandle);
426
427     dataHandle = DBFOpen(path.c_str(), "rb" );
428     if( dataHandle == NULL ) {
429         errorFatal("Could not open attribute file\n");
430     }
431
432     metadata_.resize(DBFGetRecordCount(dataHandle));
433     std::cout << "Reading "<<metadata_.size()<<" metadata records.\n";
434
435     if(tableType == 'C') {
436         readMetaDataNaturalEarthCountry(dataHandle);
437     } else if(tableType == 'T') {
438         readMetaDataTimezone(dataHandle);
439     } else {
440         std::cout << "Unknown table type\n";
441         return 1;
442     }
443
444     DBFClose(dataHandle);
445
446     SHPHandle shapeHandle = SHPOpen(path.c_str(), "rb");
447     if( shapeHandle == NULL ) {
448         errorFatal("Could not open shapefile\n");
449     }
450
451     int numEntities, shapeType, totalPolygons = 0;
452     SHPGetInfo(shapeHandle, &numEntities, &shapeType, NULL, NULL);
453
454     std::cout<<"Opened "<<SHPTypeName( shapeType )<< " file with "<<numEntities<<" entries.\n";
455
456     for(int i = 0; i < numEntities; i++ ) {
457         SHPObject *shapeObject;
458
459         shapeObject = SHPReadObject( shapeHandle, i );
460         if(shapeObject) {
461             if(shapeObject->nSHPType != 3 && shapeObject->nSHPType != 5 &&
462                     shapeObject->nSHPType != 13 && shapeObject->nSHPType != 15) {
463                 std::cout<<"Unsupported shape object ("<< SHPTypeName(shapeObject->nSHPType) <<")\n";
464                 continue;
465             }
466
467             int partIndex = 0;
468
469             PolygonData* polygonData = nullptr;
470
471             for(int j = 0; j < shapeObject->nVertices; j++ ) {
472                 if(j == 0 || j == shapeObject->panPartStart[partIndex]) {
473                     totalPolygons++;
474
475                     if(polygonData) {
476                         /* Commit it */
477                         polygons_.push_back(polygonData);
478                     }
479                     polygonData =  new PolygonData(i);
480
481                     if(partIndex + 1 < shapeObject->nParts) {
482                         partIndex++;
483                     }
484                 }
485
486                 Point p(shapeObject->padfY[j], shapeObject->padfX[j]);
487                 polygonData->processPoint(p);
488
489             }
490
491             if(polygonData) {
492                 /* Commit it */
493                 polygons_.push_back(polygonData);
494             }
495
496             SHPDestroyObject(shapeObject);
497         }
498     }
499
500     SHPClose(shapeHandle);
501
502     std::cout<<"Parsed "<<totalPolygons<<" polygons.\n";
503
504     /* Sort according to bounding box */
505     std::sort(polygons_.begin(), polygons_.end(), [](PolygonData* a, PolygonData* b) {
506         return a->boundingMin.lat_ < b->boundingMin.lat_;
507     });
508
509     /* Encode data section and store pointers */
510     std::vector<uint8_t> outputData;
511     for(PolygonData* polygon: polygons_) {
512         polygon->fileIndex_ = outputData.size();
513         polygon->encodeBinaryData(outputData, precision);
514     }
515     std::cout << "Encoded data section into "<<outputData.size()<<" bytes.\n";
516
517     /* Encode metadata */
518     std::vector<uint8_t> outputMeta;
519     for(MetaData& metadata: metadata_) {
520         metadata.fileIndex_ = outputMeta.size();
521         metadata.encodeBinaryData(outputMeta);
522     }
523     std::cout << "Encoded metadata into "<<outputMeta.size()<<" bytes.\n";
524
525     /* Encode bounding boxes */
526     std::vector<uint8_t> outputBBox;
527     int64_t prevFileIndex = 0;
528     int64_t prevMetaIndex = 0;
529     for(PolygonData* polygon: polygons_) {
530         polygon->boundingMin.encodePointBinary(outputBBox, precision);
531         polygon->boundingMax.encodePointBinary(outputBBox, precision);
532
533         encodeVariableLength(outputBBox, metadata_.at(polygon->metadataId_).fileIndex_ - prevMetaIndex);
534         prevMetaIndex = metadata_[polygon->metadataId_].fileIndex_;
535
536         encodeVariableLength(outputBBox, polygon->fileIndex_ - prevFileIndex, false);
537         prevFileIndex = polygon->fileIndex_;
538     }
539     std::cout << "Encoded bounding box section into "<<outputBBox.size()<<" bytes.\n";
540
541     /* Encode header */
542     std::vector<uint8_t> outputHeader;
543     outputHeader.push_back('P');
544     outputHeader.push_back('L');
545     outputHeader.push_back('B');
546     outputHeader.push_back(tableType);
547     outputHeader.push_back(0);
548     outputHeader.push_back(precision);
549     outputHeader.push_back(fieldNames_.size());
550     for(unsigned int i=0; i<fieldNames_.size(); i++) {
551         encodeStringToBinary(outputHeader, fieldNames_[i]);
552     }
553     encodeStringToBinary(outputHeader, notice);
554     encodeVariableLength(outputHeader, outputBBox.size(), false);
555     encodeVariableLength(outputHeader, outputMeta.size(), false);
556     encodeVariableLength(outputHeader, outputData.size(), false);
557     std::cout << "Encoded header into "<<outputHeader.size()<<" bytes.\n";
558
559     FILE* outputFile = fopen(outPath.c_str(), "wb");
560     fwrite(outputHeader.data(), 1, outputHeader.size(), outputFile);
561     fwrite(outputBBox.data(), 1, outputBBox.size(), outputFile);
562     fwrite(outputMeta.data(), 1, outputMeta.size(), outputFile);
563     fwrite(outputData.data(), 1, outputData.size(), outputFile);
564     fclose(outputFile);
565
566 }