contrib/tzdata/zishrink.awk

   1 # Convert tzdata source into a smaller version of itself.
   2
   3 # Contributed by Paul Eggert.  This file is in the public domain.
   4
   5 # This is not a general-purpose converter; it is designed for current tzdata.
   6 # 'zic' should treat this script's output as if it were identical to
   7 # this script's input.
   8
   9 # Record a hash N for the new name NAME, checking for collisions.
  10
  11 function record_hash(n, name)
  12 {
  13   if (used_hashes[n]) {
  14     printf "# ! collision: %s %s\n", used_hashes[n], name
  15     exit 1
  16   }
  17   used_hashes[n] = name
  18 }
  19
  20 # Return a shortened rule name representing NAME,
  21 # and record this relationship to the hash table.
  22
  23 function gen_rule_name(name, \
  24                        n)
  25 {
  26   # Use a simple mnemonic: the first two letters.
  27   n = substr(name, 1, 2)
  28   record_hash(n, name)
  29   # printf "# %s = %s\n", n, name
  30   return n
  31 }
  32
  33 function prehash_rule_names( \
  34                             name)
  35 {
  36   # Rule names are not part of the tzdb API, so substitute shorter
  37   # ones.  Shortening them consistently from one release to the next
  38   # simplifies comparison of the output.  That being said, the
  39   # 1-letter names below are not standardized in any way, and can
  40   # change arbitrarily from one release to the next, as the main goal
  41   # here is compression not comparison.
  42
  43   # Abbreviating these rules names to one letter saved the most space
  44   # circa 2018e.
  45   rule["Arg"] = "A"
  46   rule["Brazil"] = "B"
  47   rule["Canada"] = "C"
  48   rule["Denmark"] = "D"
  49   rule["EU"] = "E"
  50   rule["France"] = "F"
  51   rule["GB-Eire"] = "G"
  52   rule["Halifax"] = "H"
  53   rule["Italy"] = "I"
  54   rule["Jordan"] = "J"
  55   rule["Egypt"] = "K" # "Kemet" in ancient Egyptian
  56   rule["Libya"] = "L"
  57   rule["Morocco"] = "M"
  58   rule["Neth"] = "N"
  59   rule["Poland"] = "O" # arbitrary
  60   rule["Palestine"] = "P"
  61   rule["Cuba"] = "Q" # Its start sounds like "Q".
  62   rule["Russia"] = "R"
  63   rule["Syria"] = "S"
  64   rule["Turkey"] = "T"
  65   rule["Uruguay"] = "U"
  66   rule["Vincennes"] = "V"
  67   rule["Winn"] = "W"
  68   rule["Mongol"] = "X" # arbitrary
  69   rule["NT_YK"] = "Y"
  70   rule["Zion"] = "Z"
  71   rule["Austria"] = "a"
  72   rule["Belgium"] = "b"
  73   rule["C-Eur"] = "c"
  74   rule["Algeria"] = "d" # country code DZ
  75   rule["E-Eur"] = "e"
  76   rule["Taiwan"] = "f" # Formosa
  77   rule["Greece"] = "g"
  78   rule["Hungary"] = "h"
  79   rule["Iran"] = "i"
  80   rule["StJohns"] = "j"
  81   rule["Chatham"] = "k" # arbitrary
  82   rule["Lebanon"] = "l"
  83   rule["Mexico"] = "m"
  84   rule["Tunisia"] = "n" # country code TN
  85   rule["Moncton"] = "o" # arbitrary
  86   rule["Port"] = "p"
  87   rule["Albania"] = "q" # arbitrary
  88   rule["Regina"] = "r"
  89   rule["Spain"] = "s"
  90   rule["Toronto"] = "t"
  91   rule["US"] = "u"
  92   rule["Louisville"] = "v" # ville
  93   rule["Iceland"] = "w" # arbitrary
  94   rule["Chile"] = "x" # arbitrary
  95   rule["Para"] = "y" # country code PY
  96   rule["Romania"] = "z" # arbitrary
  97   rule["Macau"] = "_" # arbitrary
  98
  99   # Use ISO 3166 alpha-2 country codes for remaining names that are countries.
 100   # This is more systematic, and avoids collisions (e.g., Malta and Moldova).
 101   rule["Armenia"] = "AM"
 102   rule["Aus"] = "AU"
 103   rule["Azer"] = "AZ"
 104   rule["Barb"] = "BB"
 105   rule["Dhaka"] = "BD"
 106   rule["Bulg"] = "BG"
 107   rule["Bahamas"] = "BS"
 108   rule["Belize"] = "BZ"
 109   rule["Swiss"] = "CH"
 110   rule["Cook"] = "CK"
 111   rule["PRC"] = "CN"
 112   rule["Cyprus"] = "CY"
 113   rule["Czech"] = "CZ"
 114   rule["Germany"] = "DE"
 115   rule["DR"] = "DO"
 116   rule["Ecuador"] = "EC"
 117   rule["Finland"] = "FI"
 118   rule["Fiji"] = "FJ"
 119   rule["Falk"] = "FK"
 120   rule["Ghana"] = "GH"
 121   rule["Guat"] = "GT"
 122   rule["Hond"] = "HN"
 123   rule["Haiti"] = "HT"
 124   rule["Eire"] = "IE"
 125   rule["Iraq"] = "IQ"
 126   rule["Japan"] = "JP"
 127   rule["Kyrgyz"] = "KG"
 128   rule["ROK"] = "KR"
 129   rule["Latvia"] = "LV"
 130   rule["Lux"] = "LX"
 131   rule["Moldova"] = "MD"
 132   rule["Malta"] = "MT"
 133   rule["Mauritius"] = "MU"
 134   rule["Namibia"] = "NA"
 135   rule["Nic"] = "NI"
 136   rule["Norway"] = "NO"
 137   rule["Peru"] = "PE"
 138   rule["Phil"] = "PH"
 139   rule["Pakistan"] = "PK"
 140   rule["Sudan"] = "SD"
 141   rule["Salv"] = "SV"
 142   rule["Tonga"] = "TO"
 143   rule["Vanuatu"] = "VU"
 144
 145   # Avoid collisions.
 146   rule["Detroit"] = "Dt" # De = Denver
 147
 148   for (name in rule) {
 149     record_hash(rule[name], name)
 150   }
 151 }
 152
 153 function make_line(n, field, \
 154                    f, r)
 155 {
 156   r = field[1]
 157   for (f = 2; f <= n; f++)
 158     r = r " " field[f]
 159   return r
 160 }
 161
 162 # Process the input line LINE and save it for later output.
 163
 164 function process_input_line(line, \
 165                             f, field, end, n, outline, r, \
 166                             linkline, ruleline, zoneline)
 167 {
 168   # Remove comments, normalize spaces, and append a space to each line.
 169   sub(/#.*/, "", line)
 170   line = line " "
 171   gsub(/[\t ]+/, " ", line)
 172
 173   # Abbreviate keywords and determine line type.
 174   linkline = sub(/^Link /, "L ", line)
 175   ruleline = sub(/^Rule /, "R ", line)
 176   zoneline = sub(/^Zone /, "Z ", line)
 177
 178   # Replace FooAsia rules with the same rules without "Asia", as they
 179   # are duplicates.
 180   if (match(line, /[^ ]Asia /)) {
 181     if (ruleline) return
 182     line = substr(line, 1, RSTART) substr(line, RSTART + 5)
 183   }
 184
 185   # Abbreviate times.
 186   while (match(line, /[: ]0+[0-9]/))
 187     line = substr(line, 1, RSTART) substr(line, RSTART + RLENGTH - 1)
 188   while (match(line, /:0[^:]/))
 189     line = substr(line, 1, RSTART - 1) substr(line, RSTART + 2)
 190
 191   # Abbreviate weekday names.
 192   while (match(line, / (last)?(Mon|Wed|Fri)[ <>]/)) {
 193     end = RSTART + RLENGTH
 194     line = substr(line, 1, end - 4) substr(line, end - 1)
 195   }
 196   while (match(line, / (last)?(Sun|Tue|Thu|Sat)[ <>]/)) {
 197     end = RSTART + RLENGTH
 198     line = substr(line, 1, end - 3) substr(line, end - 1)
 199   }
 200
 201   # Abbreviate "max", "min", "only" and month names.
 202   # Although "max" and "min" can both be abbreviated to just "m",
 203   # the longer forms "ma" and "mi" are needed with zic 2023d and earlier.
 204   gsub(/ max /, dataform == "vanguard" ? " m " : " ma ", line)
 205   gsub(/ min /, dataform == "vanguard" ? " m " : " mi ", line)
 206   gsub(/ only /, " o ", line)
 207   gsub(/ Jan /, " Ja ", line)
 208   gsub(/ Feb /, " F ", line)
 209   gsub(/ Apr /, " Ap ", line)
 210   gsub(/ Aug /, " Au ", line)
 211   gsub(/ Sep /, " S ", line)
 212   gsub(/ Oct /, " O ", line)
 213   gsub(/ Nov /, " N ", line)
 214   gsub(/ Dec /, " D ", line)
 215
 216   # Strip leading and trailing space.
 217   sub(/^ /, "", line)
 218   sub(/ $/, "", line)
 219
 220   # Remove unnecessary trailing zero fields.
 221   sub(/ 0+$/, "", line)
 222
 223   # Remove unnecessary trailing days-of-month "1".
 224   if (match(line, /[A-Za-z] 1$/))
 225     line = substr(line, 1, RSTART)
 226
 227   # Remove unnecessary trailing " Ja" (for January).
 228   sub(/ Ja$/, "", line)
 229
 230   n = split(line, field)
 231
 232   # Record which rule names are used, and generate their abbreviations.
 233   f = zoneline ? 4 : linkline || ruleline ? 0 : 2
 234   r = field[f]
 235   if (r ~ /^[^-+0-9]/) {
 236     rule_used[r] = 1
 237   }
 238
 239   if (zoneline)
 240     zonename = startdef = field[2]
 241   else if (linkline)
 242     zonename = startdef = field[3]
 243   else if (ruleline)
 244     zonename = ""
 245
 246   # Save the information for later output.
 247   outline = make_line(n, field)
 248   if (ruleline)
 249     rule_output_line[nrule_out++] = outline
 250   else if (linkline) {
 251     # In vanguard format with Gawk, links are output sorted by destination.
 252     if (dataform == "vanguard" && PROCINFO["version"])
 253       linkdef[zonename] = field[2]
 254     else
 255       link_output_line[nlink_out++] = outline
 256   }else
 257     zonedef[zonename] = (zoneline ? "" : zonedef[zonename] "\n") outline
 258 }
 259
 260 function omit_unused_rules( \
 261                            i, field)
 262 {
 263   for (i = 0; i < nrule_out; i++) {
 264     split(rule_output_line[i], field)
 265     if (!rule_used[field[2]])
 266       rule_output_line[i] = ""
 267   }
 268 }
 269
 270 function abbreviate_rule_names( \
 271                                abbr, f, field, i, n, newdef, newline, r, \
 272                                zoneline, zonelines, zonename)
 273 {
 274   for (i = 0; i < nrule_out; i++) {
 275     n = split(rule_output_line[i], field)
 276     if (n) {
 277       r = field[2]
 278       if (r ~ /^[^-+0-9]/) {
 279         abbr = rule[r]
 280         if (!abbr) {
 281           rule[r] = abbr = gen_rule_name(r)
 282         }
 283         field[2] = abbr
 284         rule_output_line[i] = make_line(n, field)
 285       }
 286     }
 287   }
 288   for (zonename in zonedef) {
 289     zonelines = split(zonedef[zonename], zoneline, /\n/)
 290     newdef = ""
 291     for (i = 1; i <= zonelines; i++) {
 292       newline = zoneline[i]
 293       n = split(newline, field)
 294       f = i == 1 ? 4 : 2
 295       r = rule[field[f]]
 296       if (r) {
 297         field[f] = r
 298         newline = make_line(n, field)
 299       }
 300       newdef = (newdef ? newdef "\n" : "") newline
 301     }
 302     zonedef[zonename] = newdef
 303   }
 304 }
 305
 306 function output_saved_lines( \
 307                             i, zonename)
 308 {
 309   for (i = 0; i < nrule_out; i++)
 310     if (rule_output_line[i])
 311       print rule_output_line[i]
 312
 313   # When using gawk, output zones sorted by name.
 314   # This makes the output a bit more compressible.
 315   PROCINFO["sorted_in"] = "@ind_str_asc"
 316   for (zonename in zonedef)
 317     print zonedef[zonename]
 318
 319   if (nlink_out)
 320     for (i = 0; i < nlink_out; i++)
 321       print link_output_line[i]
 322   else {
 323     # When using gawk, output links sorted by destination.
 324     # This also helps compressibility a bit.
 325     PROCINFO["sorted_in"] = "@val_type_asc"
 326     for (zonename in linkdef)
 327       printf "L %s %s\n", linkdef[zonename], zonename
 328   }
 329 }
 330
 331 BEGIN {
 332   # Files that the output normally depends on.
 333   default_dep["africa"] = 1
 334   default_dep["antarctica"] = 1
 335   default_dep["asia"] = 1
 336   default_dep["australasia"] = 1
 337   default_dep["backward"] = 1
 338   default_dep["etcetera"] = 1
 339   default_dep["europe"] = 1
 340   default_dep["factory"] = 1
 341   default_dep["northamerica"] = 1
 342   default_dep["southamerica"] = 1
 343   default_dep["ziguard.awk"] = 1
 344   default_dep["zishrink.awk"] = 1
 345
 346   # Output a version string from 'version' and related configuration variables
 347   # supported by tzdb's Makefile.  If you change the makefile or any other files
 348   # that affect the output of this script, you should append '-SOMETHING'
 349   # to the contents of 'version', where SOMETHING identifies what was changed.
 350
 351   ndeps = split(deps, dep)
 352   ddeps = ""
 353   for (i = 1; i <= ndeps; i++) {
 354     if (default_dep[dep[i]]) {
 355       default_dep[dep[i]]++
 356     } else {
 357       ddeps = ddeps " " dep[i]
 358     }
 359   }
 360   for (d in default_dep) {
 361     if (default_dep[d] == 1) {
 362       ddeps = ddeps " !" d
 363     }
 364   }
 365   print "# version", version
 366   if (dataform != "main") {
 367     print "# dataform", dataform
 368   }
 369   if (redo != "posix_right") {
 370     print "# redo " redo
 371   }
 372   if (ddeps) {
 373     print "# ddeps" ddeps
 374   }
 375   print "# This zic input file is in the public domain."
 376
 377   prehash_rule_names()
 378 }
 379
 380 /^[\t ]*[^#\t ]/ {
 381   process_input_line($0)
 382 }
 383
 384 END {
 385   omit_unused_rules()
 386   abbreviate_rule_names()
 387   output_saved_lines()
 388 }