contrib/tzdata/zishrink.awk

   1 # Convert tzdata source into a smaller version of itself.
   2
   3 # Contributed by Paul Eggert.  This file is in the public domain.
   4
   5 # This is not a general-purpose converter; it is designed for current tzdata.
   6 # 'zic' should treat this script's output as if it were identical to
   7 # this script's input.
   8
   9 # Claim the short name N for rule NAME; exit with status 1 on a clash.
  10 function record_hash(n, name)
  11 {
  12   if (!used_hashes[n]) {
  13     used_hashes[n] = name
  14     return
  15   }
  16   printf "# ! collision: %s %s\n", used_hashes[n], name
  17   exit 1
  18 }
  19
  20 # Derive a short replacement for the rule name NAME, register it so
  21 # that a clash with another rule is detected, and return it.
  22 # ABBREV is a local variable, not an argument for callers to pass.
  23 function gen_rule_name(name, abbrev)
  24 {
  25   # Use a simple mnemonic: the first two letters of NAME.
  26   abbrev = substr(name, 1, 2)
  27   record_hash(abbrev, name)
  28
  29   return abbrev
  30 }
  31 # Seed rule[] with hand-picked short names; NAME is a local variable.
  32 function prehash_rule_names(name)
  33 {
  34   # Rule names are not part of the tzdb API, so substitute shorter
  35   # ones.  Shortening them consistently from one release to the next
  36   # simplifies comparison of the output.  That being said, the
  37   # 1-letter names below are not standardized in any way, and can
  38   # change arbitrarily from one release to the next, as the main goal
  39   # here is compression, not comparison.
  40
  41   # Abbreviating these rule names to one letter saved the most space
  42   # circa 2018e.
  43   rule["Arg"] = "A"
  44   rule["Brazil"] = "B"
  45   rule["Canada"] = "C"
  46   rule["Denmark"] = "D"
  47   rule["EU"] = "E"
  48   rule["France"] = "F"
  49   rule["GB-Eire"] = "G"
  50   rule["Halifax"] = "H"
  51   rule["Italy"] = "I"
  52   rule["Jordan"] = "J"
  53   rule["Egypt"] = "K" # "Kemet" in ancient Egyptian
  54   rule["Libya"] = "L"
  55   rule["Morocco"] = "M"
  56   rule["Neth"] = "N"
  57   rule["Poland"] = "O" # arbitrary
  58   rule["Palestine"] = "P"
  59   rule["Cuba"] = "Q" # Its start sounds like "Q".
  60   rule["Russia"] = "R"
  61   rule["Syria"] = "S"
  62   rule["Turkey"] = "T"
  63   rule["Uruguay"] = "U"
  64   rule["Vincennes"] = "V"
  65   rule["Winn"] = "W"
  66   rule["Mongol"] = "X" # arbitrary
  67   rule["NT_YK"] = "Y"
  68   rule["Zion"] = "Z"
  69   rule["Austria"] = "a"
  70   rule["Belgium"] = "b"
  71   rule["C-Eur"] = "c"
  72   rule["Algeria"] = "d" # country code DZ
  73   rule["E-Eur"] = "e"
  74   rule["Taiwan"] = "f" # Formosa
  75   rule["Greece"] = "g"
  76   rule["Hungary"] = "h"
  77   rule["Iran"] = "i"
  78   rule["StJohns"] = "j"
  79   rule["Chatham"] = "k" # arbitrary
  80   rule["Lebanon"] = "l"
  81   rule["Mexico"] = "m"
  82   rule["Tunisia"] = "n" # country code TN
  83   rule["Moncton"] = "o" # arbitrary
  84   rule["Port"] = "p"
  85   rule["Albania"] = "q" # arbitrary
  86   rule["Regina"] = "r"
  87   rule["Spain"] = "s"
  88   rule["Toronto"] = "t"
  89   rule["US"] = "u"
  90   rule["Louisville"] = "v" # ville
  91   rule["Iceland"] = "w" # arbitrary
  92   rule["Chile"] = "x" # arbitrary
  93   rule["Para"] = "y" # country code PY
  94   rule["Romania"] = "z" # arbitrary
  95   rule["Macau"] = "_" # arbitrary
  96
  97   # Use ISO 3166 alpha-2 country codes for remaining names that are countries.
  98   # This is more systematic, and avoids collisions (e.g., Malta and Moldova).
  99   rule["Armenia"] = "AM"
 100   rule["Aus"] = "AU"
 101   rule["Azer"] = "AZ"
 102   rule["Barb"] = "BB"
 103   rule["Dhaka"] = "BD"
 104   rule["Bulg"] = "BG"
 105   rule["Bahamas"] = "BS"
 106   rule["Belize"] = "BZ"
 107   rule["Swiss"] = "CH"
 108   rule["Cook"] = "CK"
 109   rule["PRC"] = "CN"
 110   rule["Cyprus"] = "CY"
 111   rule["Czech"] = "CZ"
 112   rule["Germany"] = "DE"
 113   rule["DR"] = "DO"
 114   rule["Ecuador"] = "EC"
 115   rule["Finland"] = "FI"
 116   rule["Fiji"] = "FJ"
 117   rule["Falk"] = "FK"
 118   rule["Ghana"] = "GH"
 119   rule["Guat"] = "GT"
 120   rule["Hond"] = "HN"
 121   rule["Haiti"] = "HT"
 122   rule["Eire"] = "IE"
 123   rule["Iraq"] = "IQ"
 124   rule["Japan"] = "JP"
 125   rule["Kyrgyz"] = "KG"
 126   rule["ROK"] = "KR"
 127   rule["Latvia"] = "LV"
 128   rule["Lux"] = "LX" # NOTE(review): ISO 3166 has LU for Luxembourg; confirm LX is intended
 129   rule["Moldova"] = "MD"
 130   rule["Malta"] = "MT"
 131   rule["Mauritius"] = "MU"
 132   rule["Namibia"] = "NA"
 133   rule["Nic"] = "NI"
 134   rule["Norway"] = "NO"
 135   rule["Peru"] = "PE"
 136   rule["Phil"] = "PH"
 137   rule["Pakistan"] = "PK"
 138   rule["Sudan"] = "SD"
 139   rule["Salv"] = "SV"
 140   rule["Tonga"] = "TO"
 141   rule["Vanuatu"] = "VU"
 142
 143   # Avoid collisions between the two-letter names that are generated.
 144   rule["Detroit"] = "Dt" # De = Denver
 145   # Register every short name chosen above so that clashes are caught.
 146   for (name in rule) {
 147     record_hash(rule[name], name)
 148   }
 149 }
 150
 151 # Shrink one tzdata input line and save it in output_line[] for later output.
 152 # LINE is the input text; FIELD, END, I, N and STARTDEF are local variables.
 153 function process_input_line(line, field, end, i, n, startdef)
 154 {
 155   # Remove comments, normalize spaces, and append a space to each line.
 156   sub(/#.*/, "", line)
 157   line = line " "
 158   gsub(/[\t ]+/, " ", line)
 159
 160   # Abbreviate keywords.  Do not abbreviate "Link" to just "L",
 161   # as pre-2017c zic erroneously diagnoses "Li" as ambiguous.
 162   sub(/^Link /, "Li ", line)
 163   sub(/^Rule /, "R ", line)
 164   sub(/^Zone /, "Z ", line)
 165
 166   # SystemV rules are not needed.
 167   if (line ~ /^R SystemV /) return
 168
 169   # Replace FooAsia rules with the same rules without "Asia", as they
 170   # are duplicates.
 171   if (match(line, /[^ ]Asia /)) {
 172     if (line ~ /^R /) return
 173     line = substr(line, 1, RSTART) substr(line, RSTART + 5)
 174   }
 175   # Replace SpainAfrica rules with Morocco, as they are duplicates.
 176   if (match(line, / SpainAfrica /)) {
 177     if (line ~ /^R /) return
 178     line = substr(line, 1, RSTART) "Morocco" substr(line, RSTART + RLENGTH - 1)
 179   }
 180
 181   # Abbreviate times: strip leading zeros, then drop final ":0" components.
 182   while (match(line, /[: ]0+[0-9]/))
 183     line = substr(line, 1, RSTART) substr(line, RSTART + RLENGTH - 1)
 184   while (match(line, /:0[^:]/))
 185     line = substr(line, 1, RSTART - 1) substr(line, RSTART + 2)
 186
 187   # Abbreviate weekday names.  Do not abbreviate "Sun" and "Sat", as
 188   # pre-2017c zic erroneously diagnoses "Su" and "Sa" as ambiguous.
 189   while (match(line, / (last)?(Mon|Wed|Fri)[ <>]/)) {
 190     end = RSTART + RLENGTH
 191     line = substr(line, 1, end - 4) substr(line, end - 1)
 192   }
 193   while (match(line, / (last)?(Tue|Thu)[ <>]/)) {
 194     end = RSTART + RLENGTH
 195     line = substr(line, 1, end - 3) substr(line, end - 1)
 196   }
 197
 198   # Abbreviate "max", "only" and month names.
 199   # Do not abbreviate "min", as pre-2017c zic erroneously diagnoses "mi"
 200   # as ambiguous.
 201   gsub(/ max /, " ma ", line)
 202   gsub(/ only /, " o ", line)
 203   gsub(/ Jan /, " Ja ", line)
 204   gsub(/ Feb /, " F ", line)
 205   gsub(/ Apr /, " Ap ", line)
 206   gsub(/ Aug /, " Au ", line)
 207   gsub(/ Sep /, " S ", line)
 208   gsub(/ Oct /, " O ", line)
 209   gsub(/ Nov /, " N ", line)
 210   gsub(/ Dec /, " D ", line)
 211
 212   # Strip leading and trailing space.
 213   sub(/^ /, "", line)
 214   sub(/ $/, "", line)
 215
 216   # Remove unnecessary trailing zero fields.
 217   sub(/ 0+$/, "", line)
 218
 219   # Remove unnecessary trailing days-of-month "1".
 220   if (match(line, /[A-Za-z] 1$/))
 221     line = substr(line, 1, RSTART)
 222
 223   # Remove unnecessary trailing " Ja" (for January).
 224   sub(/ Ja$/, "", line)
 225
 226   n = split(line, field)
 227
 228   # Abbreviate rule names: field 4 of "Z" lines, field 2 of other non-"Li" lines.
 229   i = field[1] == "Z" ? 4 : field[1] == "Li" ? 0 : 2
 230   if (i && field[i] ~ /^[^-+0-9]/) {
 231     if (!rule[field[i]])
 232       rule[field[i]] = gen_rule_name(field[i])
 233     field[i] = rule[field[i]]
 234   }
 235
 236   # If this line starts a Zone or Link whose name was defined earlier,
 237   # blank out the earlier definition: its first saved line plus the
 238   # saved lines after it that start with "-", "+" or a digit.
 239   startdef = ""
 240   if (field[1] == "Z")
 241     startdef = field[2]
 242   else if (field[1] == "Li")
 243     startdef = field[3]
 244   if (startdef) {
 245     i = zonedef[startdef]
 246     if (i) {
 247       do
 248         output_line[i - 1] = ""
 249       while (output_line[i++] ~ /^[-+0-9]/);
 250     }
 251     # Record index + 1 of the definition's FIRST line (0 means unseen).
 252     zonedef[startdef] = nout + 1
 253   }
 254
 255   # Save the line for later output.
 256   line = field[1]
 257   for (i = 2; i <= n; i++)
 258     line = line " " field[i]
 259   output_line[nout++] = line
 260 }
 261 # Print, in order, each saved line that was not blanked.  I is a local.
 262 function output_saved_lines(i)
 263 {
 264   i = -1
 265   while (++i < nout)
 266     if (output_line[i]) print output_line[i]
 267 }
 268
 269 BEGIN {
 270   # Runs before any input is read: print the header comment lines and
 271   # reserve the hand-picked short rule names.  'version', 'deps',
 272   # 'dataform' and 'redo' are not assigned anywhere in this script;
 273   # presumably the caller supplies them (e.g. with -v) -- confirm.
 274
 275   # Files that the output normally depends on.  Each starts with a count
 276   # of 1, which the scan of 'deps' below raises whenever it is listed.
 277   default_list = "africa antarctica asia australasia backward etcetera"
 278   default_list = default_list " europe factory northamerica southamerica"
 279   default_list = default_list " systemv ziguard.awk zishrink.awk"
 280   ndefault = split(default_list, default_name)
 281   for (i = 1; i <= ndefault; i++)
 282     default_dep[default_name[i]] = 1
 283
 284   # Output a version string from 'version' and related configuration variables
 285   # supported by tzdb's Makefile.  If you change the makefile or any other files
 286   # that affect the output of this script, you should append '-SOMETHING'
 287   # to the contents of 'version', where SOMETHING identifies what was changed.
 288
 289   # Collect in ddeps each name in 'deps' that is not a default dependency.
 290   ddeps = ""
 291   ndeps = split(deps, dep)
 292   for (i = 1; i <= ndeps; i++) {
 293     if (!default_dep[dep[i]])
 294       ddeps = ddeps " " dep[i]
 295     else
 296       default_dep[dep[i]]++
 297   }
 298
 299   # Then append "!NAME" for each default dependency that 'deps' omits.
 300   # awk does not define the order in which 'for (d in ...)' visits keys.
 301   for (d in default_dep)
 302     if (default_dep[d] == 1)
 303       ddeps = ddeps " !" d
 304
 305   # Emit the header.  "main", "posix_right" and an empty ddeps print nothing.
 306   print "# version", version
 307   if (dataform != "main")
 308     print "# dataform", dataform
 309   if (redo != "posix_right")
 310     print "# redo " redo
 311   if (ddeps)
 312     print "# ddeps" ddeps
 313   print "# This zic input file is in the public domain."
 314
 315   # Seed the rule-name table before the first input line is processed.
 316   prehash_rule_names()
 317 }
 318 # Process every line whose first non-blank character is not '#'.
 319 /^[\t ]*[^#\t ]/ {
 320   process_input_line($0)
 321 }
 322 # Once input ends (or 'exit' is called), print the lines still saved.
 323 END {
 324   output_saved_lines()
 325 }