akimbo

#!/usr/bin/perl
# vim: set sw=3 ai sm:

# Script to display Akimbo listings which might be calls for submissions; listings are sorted by deadline
# Copyright ©2020–2021 by Ambrose Li

use strict;
use integer;

# see https://stackoverflow.com/questions/6162484/why-does-modern-perl-avoid-utf-8-by-default for these crazy hacks
# THERE ARE MORE CRAZY HACKS THERE FOR MORE COMPLEX PROJECTS!!
#use v5.14; # //u modifier
use utf8;
use open qw( :encoding(UTF-8) :std );
use charnames qw( :full :short );
use feature "unicode_strings";

use Getopt::Long;
use Data::Dumper;
use POSIX;
use JSON::PP;

# Command-line flags; all of them are filled in by the GetOptions call at the
# bottom of the script.
# $debug_p is a counter (incremented once per --debug/-d); several subs print
# more detail when it is greater than 1 or 2.
use vars qw( $debug_p );
# --dry-run/-n: get_feed and get_post print the download command instead of running it.
use vars qw( $dry_run_p );
# --almost-all/-A: display_listing also shows posts marked "ignore" in the rc
# file and posts whose deadline is more than 3 days past.
use vars qw( $almost_all_p );
# --all/-a: display_listing shows every article, including those without deadlines.
use vars qw( $all_p );
# --disable-multi-deadline-hack: merge_data does not split multi-deadline posts
# and format_deadline strips the tab-separated label instead.
use vars qw( $disable_multi_deadline_hack_p );

# Settings assigned in the configurable section below.
use vars qw( $rc_path );
use vars qw( $cache_root );
use vars qw( $listings_cache );
use vars qw( $posts_cache );
use vars qw( $feed_url );
use vars qw( $ua_identity );

# START OF CONFIGURABLE OPTIONS
# -----------------------------------------------------------------------------

# INI-style file read by display_listing (sections are post IDs; an "ignore"
# key hides the post from the default listing).
$rc_path = "$ENV{HOME}/.akimbo";
# Directory under which both caches live; fix_cache_paths creates it and
# prefixes the two cache names below with it.
$cache_root = "$ENV{HOME}/.cache/akimbo";

# Cache subdirectory names (relative to $cache_root until fix_cache_paths runs).
$listings_cache = 'listings';
$posts_cache = 'posts';

# Index page downloaded by get_feed.
#$feed_url = 'https://akimbo.ca/listings/feed/';				# random bag of stuff - feed disappeared on 2020/02/14
#$feed_url = 'https://akimbo.ca/listings/?fwp_listing_type=calls';	# calls only - in theory
$feed_url = 'https://akimbo.ca/listings/';

# User-Agent string passed to curl (-A) and wget (-U).
$ua_identity = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36';

# -----------------------------------------------------------------------------
# END OF CONFIGURABLE OPTIONS

# Sleep for a randomly chosen interval of at least five seconds. merge_data
# calls this before every post download.
sub insert_random_delay () {
   my $seconds = 5 + rand 5;
   sleep $seconds;
}

# Parse a simple INI-style file.
#
# Parameter: $path - name of the file to read.
# Returns a hash reference shaped like { section => { key => [values...] } }.
# Keys that appear before any [section] header are stored under the
# empty-string section; a key that occurs several times accumulates all of its
# values in file order. Lines starting with ";", "#" or "//" and blank lines
# are skipped; any other unrecognised line produces a warning.
# A file that cannot be opened is not an error: an empty hash is returned.
sub read_ini_file ($) {
   my($path) = @_;
   my $it = {};
   if (open(my $input, '<', $path)) {
      print STDERR "read_ini_file: opened ini file \"$path\"\n" if $debug_p;
      my $section = '';		# section-less keys live under the '' section
      while (defined(my $s = <$input>)) {
         chomp $s;
         print STDERR "read_ini_file: s=($s)\n" if $debug_p > 1;
         if ($s =~ /^\s*(?:[;#]|\/\/)/) {
            ;			# comment line
         } elsif ($s =~ /^\s*\[\s*([^\[\]]+?)\s*\]\s*$/) {
            # Non-greedy capture so that "[ name ]" yields "name", not "name "
            $section = $1;
         } elsif ($s =~ /^\s*((?:(?!\s*=).)+)\s*=\s*(.*?)\s*$/) {
            push @{$it->{$section}->{$1}}, $2;
         } elsif ($s =~ /^\s*$/) {
            ;			# blank line
         } else {
            warn "$path: $.: Unrecognized input \"$s\"\n";
         }
      }
      close $input;
      print STDERR "read_ini_file: closed ini file\n" if $debug_p;
   }
   return $it;
}

# Turn $listings_cache and $posts_cache into paths below $cache_root and make
# sure that $cache_root (including every missing parent directory) and both
# cache directories exist.
#
# Does nothing when $cache_root is undefined. A directory that cannot be
# created produces a warning rather than a fatal error.
# Must be called only once, because it rewrites the two cache variables.
sub fix_cache_paths () {
   if (defined $cache_root) {
      # Build the list of cumulative prefixes ("/a", "/a/b", ...), keeping a
      # relative $cache_root relative.
      my @dirs;
      my $prefix = $cache_root =~ m{^/}? '/': '';
      for my $component (grep { length } split(/\/+/, $cache_root)) {
         $prefix .= '/' unless $prefix eq '' || $prefix =~ m{/$};
         $prefix .= $component;
         push @dirs, $prefix;
      }
      $listings_cache = "$cache_root/$listings_cache";
      $posts_cache = "$cache_root/$posts_cache";
      push @dirs, $listings_cache;
      push @dirs, $posts_cache;
      for my $dir (@dirs) {
         if (!-d $dir) {
            print STDERR "fix_cache_paths: creating directory \"$dir\"...\n" if $debug_p;
            # "or", not "||": the latter binds to $dir and never warns
            mkdir($dir) or warn "Warning: $dir: mkdir: $!\n";
         }
      }
   }
}

# Clean up a title taken from HTML.
#
# Parameters:
#   $s         - raw title text (may contain tags, line breaks and the two
#                entities handled below)
#   $deadlines - optional array reference of deadline strings, either "DATE"
#                or "DATE<TAB>LABEL"
# Returns the cleaned title: tags removed, line breaks turned into spaces,
# "&#8211;" and "&amp;" decoded, and the " | Akimbo", " opens today" and
# " – Akimbo" suffixes removed.
# When $deadlines is given, each label (or the whole string if it has no tab)
# is looked for at the end of the title or in front of a "|"; only if every
# one of them is found are they all removed from the title.
sub sanitize_title ($;$) {
   my($s, $deadlines) = @_;
   local($_, $`, $&, $', $1, $2, $3);
   $s =~ s/<[^<>]*>//sg;
   $s =~ s/[\r\n]+/ /sg;
   $s =~ s/&#8211;/–/sg;
   $s =~ s/&amp;/\&/sg;
   $s =~ s/\s*\| Akimbo$//s;
   $s =~ s/ opens today\s*!?\s*$//si;

   # Attempt to check if we can hack things off the end - this is a stupid HACK
   if (defined $deadlines && ref $deadlines eq 'ARRAY') {
      my $n = 0;
      my $det = $s . '';
      for my $i (@$deadlines) {
         my $label = $i =~ /\t/? (split(/\t/, $i, 2))[1]: $i;
         # \Q...\E: the label is literal text, not a pattern, so characters
         # such as "(" or "+" must not be treated as regex syntax
         if ($det =~ s/\s+\Q$label\E\s*(?:\||$)//s) {
            $n += 1;
         }
      }
      $s = $det if $n == scalar @$deadlines;
   }
   $s =~ s/ – Akimbo$//s;
   return $s;
}

# Clean a deadline label found inline in a post: apply sanitize_title, then
# drop a trailing colon together with any whitespace around it.
sub sanitize_inline_deadline_label ($) {
   my($raw) = @_;
   my $label = sanitize_title($raw);
   $label =~ s/\s*:\s*$//si;
   return $label;
}

# Look a command up along $ENV{PATH}.
#
# Parameter: $s - bare command name.
# Returns the path of the first PATH entry in which "$dir/$s" is a regular,
# executable file, or undef if there is none.
sub findfirst_current ($) {
   my($s) = @_;
   my @hits = grep { -f $_ && -x $_ } map { "$_/$s" } split(/:/, $ENV{'PATH'});
   return $hits[0];
}

# Make sure the listings cache contains a recent copy of the index page.
#
# A cache file is named after its download time (YYYYmmddHHMM). If a
# non-empty file newer than one hour exists nothing is downloaded; an empty
# file in that window is deleted. Otherwise $feed_url is fetched with w3m,
# curl or wget (first one available, in that order) and, if the download
# contains anything other than whitespace, stored under the current timestamp.
# With $dry_run_p set the command is only printed.
#
# Dies if the cache directory cannot be read, if the fetch process cannot be
# started, or if the cache file cannot be written.
sub get_feed () {
   my $cutoff = strftime('%Y%m%d%H%M', localtime(time - 3600));
   my $now = strftime('%Y%m%d%H%M', localtime);
   my $ok = 0;
   opendir(my $dir, $listings_cache) or die "$listings_cache: opendir: $!\n";
   while (defined(my $dirent = readdir $dir)) {
      $ok = 1 if -f "$listings_cache/$dirent" && $dirent =~ /^\d{12}$/ && $dirent > $cutoff;
      if ($ok && -z "$listings_cache/$dirent") {
         warn "get_feed: warning: removing empty file $dirent\n";
         unlink "$listings_cache/$dirent";
         $ok = 0;
      }
      last if $ok;
   }
   closedir $dir;
   if ($ok) {
      print STDERR "get_feed: cache is still valid\n" if $debug_p;
   } else {
      print STDERR "get_feed: cache out of date, need to get new feed\n";
      # Real browsers
      my $w3m = findfirst_current('w3m');
      # Page grabbing tools
      my $curl = findfirst_current('curl');
      # wget is the fallback and is left for exec to find along PATH
      my @cmd = $w3m? (
         $w3m,
         '-I', 'UTF-8',
         '-O', 'UTF-8',
         '-dump_source',
         $feed_url,
      ): $curl? (
         $curl,
         '-A', $ua_identity,
         ($debug_p? '-v': ()),
         $feed_url,
      ): (
         'wget',
         '-U', $ua_identity,
         '-O', '-',
         ($debug_p? '-d': ()),
         $feed_url,
      );
      mkdir $listings_cache unless -d $listings_cache;
      if ($dry_run_p) {
         # Octal escapes stand for ") (" and must be in double quotes to be interpreted
         print STDERR "Pretending to execute command: [(", join("\051 \050", @cmd), ")]\n";
      } else {
         # Fork with the child's stdout connected to $input, then exec the
         # command in the child without going through a shell.
         my $pid = open(my $input, '-|');
         die "$cmd[0]: fork: $!\n" unless defined $pid;
         if (!$pid) {
            exec { $cmd[0] } @cmd;
            die "$cmd[0]: exec: $!\n";
         }
         my $data = '';
         while (defined(my $s = <$input>)) {
            $data .= $s;
         }
         close($input) or warn "get_feed: warning: $cmd[0] failed (wait status $?)\n";
         if ($data =~ /\S/s) {
            my $output = "$listings_cache/$now";
            open(my $out, '>', $output) or die "$output: $!\n";
            print $out $data;
            close($out) or die "$output: close: $!\n";
         }
      }
   }
}

# Extract the interesting fields from the HTML of one cached post.
#
# Parameter: $s0 - array reference holding the lines of the page (without
#                  line terminators), as returned by get_file_contents.
# Returns a hash reference with the keys that could be found:
#   link, p (numeric post ID from the shortlink), title, pubDate, type,
#   category (array), img, deadline (sorted array of "DATE" or
#   "DATE<TAB>LABEL" strings) and content:encoded (the body HTML).
#
# The page is scanned with a small state machine: state 0 reads the header
# until the content-block div, state 1 collects the body until the sidebar
# div, state 2 collects category links. Scanning stops at the "Related
# Content" heading.
sub parse_cached_post ($) {
   my($s0) = @_;
   my $it;
   my $state = 0;
   my $tmp = '';
   for my $s (@$s0) {
      print STDERR "parse_cached_post: state=$state, s=($s)\n" if $debug_p > 1;
      if ($state == 0){
         if ($s =~ /^\s*<link rel="canonical" href="(.*?)"\s*\/>\s*$/) {
            $it->{'link'} = $1;
         } elsif ($s =~ /<link rel=(['"])shortlink\1 href='.*?\?p=(\d+)'\s*\/>/) {
            $it->{'p'} = $2;
         } elsif ($s =~ /<title>((?:(?!<\/title>).)+)<\/title>/si) {	# title is usually h4, but can be h1. Save title just in case...
            $it->{'title'} = sanitize_title($1);
         } elsif ($s =~ /^\s*<div class=(['"])[^'']*\bcontent-block\b[^'']*\1>\s*$/) {
            $state = 1;
         } elsif ($s =~ /<script type=(['"])application\/ld\+json\1[^<>]*>.*?"datePublished":"(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}[-\+]\d{2}:\d{2})"/) {
            $it->{'pubDate'} = $2;
         } elsif ($s =~ /<article class="[^""]*\blisting_type-([^""\s]+)\b[^""]*">/) {
            $it->{'type'} = $1;
         }
      } elsif ($state == 1) {
         if ($s =~ /^\s*<div class='[^'']*\bcontent-col-sidebar\b[^'']*'>\s*$/) {
            $state = 2;
         } else {
            $tmp .= $s;
            $tmp .= "\n";
         }
      } elsif ($state == 2) {
         if ($s =~ /<a href='.*?fwp_sublisting_tags=(\w+)'[^<>]*>([^<>]+)<\/a>/) {
            push @{$it->{'category'}}, $2;
         }
      }
      # Test the current line ($s); a bare match would look at $_, which this
      # loop never sets.
   last if $s =~ /<h5[^<>]*>Related Content<\/h5>/;
   }
   $tmp =~ s/\s*<\/div>\s*<\/div>\s*$//s;
   if ($tmp =~ /^\s*<p><img\b[^<>]*src="([^""]+)"[^<>]*><\/p>\s*/s) {
      $it->{'img'} = $1;
      $tmp = $';
   }
   # Collect every element marked class="text-deadline". Everything in front
   # of the element is kept as a provisional label and narrowed down below.
   while ($tmp =~ /<(\w+)[^<>]*?class="text-deadline"[^<>]*>([^<>]+)<\/\1>/sg) {
      my $decoded_deadline = decode_date($2, $it->{'pubDate'});
      $it->{'deadline'}->{$decoded_deadline} = {'label' => $`, 'date' => $decoded_deadline} if defined $decoded_deadline;
   }
   # Look for possible subevent then marshall deadline hash back into string form so that we don't break things =P
   for my $i (keys %{$it->{'deadline'}}) {
      if ($it->{'deadline'}->{$i}->{'label'} =~ /<p\b[^<>]*>(?:<strong>)?Deadline\s+for\s+((?:(?!<\/p>).)+)(?:(?!<[^<>]*\bclass="text-deadline"[^<>]*>|<p\b[^<>]*>).)*$/si) {
         $it->{'deadline'}->{$i}->{'label'} = sanitize_inline_deadline_label($1);
      } elsif ($it->{'deadline'}->{$i}->{'label'} =~ /<h4[^<>]*>((?:(?!<\/h4>).)+)<\/h4>(?:(?!<[^<>]*\bclass="text-deadline"[^<>]*>|<h4>).)*$/si) {
         $it->{'deadline'}->{$i}->{'label'} = sanitize_title($1);
      } else {
         delete $it->{'deadline'}->{$i}->{'label'};
      }
   }
   # Look at the deadlines again. Delete all labels if there is just one deadline or if not all deadlines have labels
   if (keys %{$it->{'deadline'}} == 1 || join('', map { defined $it->{'deadline'}->{$_}->{'label'}? '': '1' } keys %{$it->{'deadline'}}) =~ /1/) {
      for my $i (keys %{$it->{'deadline'}}) {
         delete $it->{'deadline'}->{$i}->{'label'};
      }
   }
   # Marshall deadline hash back into string form so that we don't break things =P
   for my $i (keys %{$it->{'deadline'}}) {
      if ($it->{'deadline'}->{$i}->{'label'} =~ /^(.+)$/s) {
         $it->{'deadline'}->{$i} = sprintf("%s\t%s", $it->{'deadline'}->{$i}->{'date'}, sanitize_title($1));
      } else {
         $it->{'deadline'}->{$i} = $it->{'deadline'}->{$i}->{'date'};
      }
   }
   $it->{'deadline'} = [sort values %{$it->{'deadline'}}];
   $it->{'content:encoded'} = decode_akimbo_rss_encoded_text($tmp);
   return $it;
}

# Tell whether a file is gzip-compressed.
#
# Parameter: $path - file to examine.
# Returns 1 if the file starts with the gzip magic number (bytes 0x1f 0x8b),
# otherwise the empty string; a file that cannot be opened or is shorter than
# two bytes counts as not gzip.
#
# The magic number is read directly instead of parsing the output of the
# "file" utility, because that output includes the path name and so reported
# any path containing "gzip" as compressed.
sub is_gzip ($) {
   my($path) = @_;
   my $it = '';
   # ":raw" so that the file-wide UTF-8 input layer does not touch the bytes
   if (open(my $input, '<:raw', $path)) {
      my $magic = '';
      my $n = read($input, $magic, 2);
      close $input;
      $it = 1 if defined $n && $n == 2 && $magic eq "\x1f\x8b";
   }
   return $it;
}

# Read a file, transparently decompressing it if it is gzip-compressed.
#
# Parameter: $path - file to read.
# Returns an array reference with one element per line (line terminators
# removed), or undef if the file contains no lines at all.
# Dies if a plain file cannot be opened or if the decompressor cannot be
# started; a decompressor that exits with an error only produces a warning.
sub get_file_contents ($) {
   my($path) = @_;
   local($_, $.);
   my $input;
   my $it;
   my $piped = is_gzip($path);
   if ($piped) {
      my @cmd = ('/bin/zcat', $path);
      # Fork with the child's stdout connected to $input, then exec zcat there
      my $pid = open($input, '-|');
      die "$cmd[0]: fork: $!\n" unless defined $pid;
      if (!$pid) {
         exec { $cmd[0] } @cmd;
         die "$cmd[0]: exec: $!\n";
      }
      print STDERR "get_file_contents: gzip file \"$path\" opened\n" if $debug_p;
   } else {
      open($input, '<', $path) or die "$path: open: $!\n";
      print STDERR "get_file_contents: normal file \"$path\" opened\n" if $debug_p;
   }
   while (defined(my $s = <$input>)) {
      chomp $s;
      $it = [] unless defined $it;
      push @$it, $s;
      print STDERR "get_file_contents: s=($s)\n" if $debug_p > 1;
   }
   if (!close($input) && $piped) {
      warn "get_file_contents: warning: $path: zcat failed (wait status $?)\n";
   }
   return $it;
}

# Load every post stored in the posts cache.
#
# Returns a hash reference mapping a post's canonical link to the structure
# produced by parse_cached_post. If a companion "<name>.url" file lists
# further "link=" values (the link used by the index page when it differs
# from the canonical one), the same structure is also registered under each
# of those links unless that link is already taken.
# If the cache directory cannot be opened an attempt is made to create it and
# an empty hash is returned.
sub get_cached_posts () {
   my $posts = {};
   if (opendir(my $dir, $posts_cache)) {
      while (defined(my $dirent = readdir $dir)) {
         if (-f "$posts_cache/$dirent" && $dirent !~ /\.url$/) {
            my $config = read_ini_file("$posts_cache/$dirent.url");
            my $article = parse_cached_post(get_file_contents("$posts_cache/$dirent"));
            $posts->{$article->{'link'}} = $article;
            if ($config && defined $config->{''}->{'link'}) {
               # Sometimes the link (and title) in the index doesn't match the same in the actual post. This avoids duplicate wgets
               printf STDERR "get_cached_posts: found alias link(s): %s\n", join(' ', @{$config->{''}->{'link'}}) if $debug_p;
               for my $link (@{$config->{''}->{'link'}}) {
                  if (!defined $posts->{$link}) {
                     $posts->{$link} = $article;
                  } else {
                     warn "get_cached_posts: warning: $dirent.url: Link $link already defined (ignored)\n";
                  }
               }
            }
            # The format previously had a third %s with no matching argument
            printf STDERR "get_cached_posts: read %s -> %s\n", $dirent, $article->{'link'} if $debug_p;
         } else {
            printf STDERR "get_cached_posts: ignoring %s\n", $dirent if $debug_p;
         }
      }
      closedir $dir;
   } else {
      print STDERR "warning: $posts_cache: opendir: $!\n" if $debug_p;
      mkdir $posts_cache;
   }
   return $posts;
}

# Download one post with wget, store it in the posts cache and parse it.
#
# Parameter: $link - URL of the post as found in the index page.
# Returns the structure produced by parse_cached_post.
#
# The page is first saved as "<pid>.tmp" in the posts cache and then renamed
# to the post ID found in the page. If the canonical link in the page differs
# from $link, a companion "<id>.url" file recording $link is written so that
# the post is not downloaded again.
# Dies if wget cannot be started or if no post ID is found in the page (the
# temporary file is then left in place for inspection).
# NOTE(review): with $dry_run_p set nothing is downloaded, so the parse step
# fails on the missing temporary file unless one happens to exist - confirm
# whether dry-run is expected to stop here.
sub get_post ($) {
   my($link) = @_;
   my $tmpname = "$posts_cache/$$.tmp";
   my @cmd = (
      'wget',
      '-U', $ua_identity,
      '-O', $tmpname,
      ($debug_p? '-d': ()),
      $link,
   );
   mkdir $posts_cache unless -d $posts_cache;
   if ($dry_run_p) {
      # Octal escapes stand for ") (" and must be in double quotes to be interpreted
      print STDERR "Pretending to execute command: [(", join("\051 \050", @cmd), ")]\n";
   } else {
      my $pid = fork;
      die "$cmd[0]: fork: $!\n" unless defined $pid;
      if (!$pid) {
         exec { $cmd[0] } @cmd;
         die "$cmd[0]: exec: $!\n";
      }
      waitpid($pid, 0);
      my $st = $? >> 8;		# exit status of wget, not its process ID
      print STDERR "get_post: command: [(", join("\051 \050", @cmd), ")] returned $st\n" if $debug_p;
   }
   my $article = parse_cached_post(get_file_contents($tmpname));
   if ($article && %$article && defined $article->{'p'}) {
      my $newname = sprintf('%s/%s', $posts_cache, $article->{'p'});
      print STDERR "Saving post as $newname...\n"; # XYZZY
      # "or", not "||": the latter binds to the last argument and never warns
      rename($tmpname, $newname) or warn "Cannot rename $tmpname to $newname: $!\n";

      # Sometimes the link (and title) in the index doesn't match the same in the actual post. This avoids duplicate wgets
      if ($link ne $article->{'link'}) {
         open(my $output, '>', $tmpname) or die "$tmpname: creat: $!\n";
         print $output "link=$link\n";
         close $output;
         rename($tmpname, "$newname.url") or warn "Cannot rename $tmpname to $newname.url: $!\n";
      }
   } else {
      die "Something is wrong...page ID not detected in page, please look at $tmpname to debug\n";
   }
   return $article;
}

# Render a fragment of post HTML as plain text.
#
# Parameters:
#   $s - HTML fragment
#   $w - width of the rule drawn for <hr> (default 79)
# Returns the text: <br> and paragraph tags become line breaks, em/span/strong
# tags are dropped, images are replaced by their alt text (or removed), <h4>
# headings are underlined with dashes, and links are replaced by their text.
# Links whose text differs from their target are listed after the text under
# "Links:", sorted by URL so that the output is reproducible.
sub convert_to_plain_text ($;$) {
   my($s, $w) = @_;
   $w = 79 unless defined $w;
   my %links;
   my $hr = '-' x $w;
   $s =~ s/<br\s*\/?>(?:<br\s*\/?>)*/\n/sgi;
   $s =~ s/<hr\s*\/?>(?:<br\s*\/?>)*/$hr\n/sgi;
   $s =~ s/<\/p>\s*<p\b[^<>]*>/\n  /sgi;
   $s =~ s/<(?:\/?p\b[^<>]*)>/\n/sgi;
   $s =~ s/<\/?(?:em|span|strong)\b[^<>]*>//sgi;
   $s =~ s/<img\b[^<>]*alt="([^"]+)"[^<>]*>/$1/sgi;	# $1, not \1, on the replacement side
   $s =~ s/<img\b[^<>]*>//sgi;
   $s =~ s{<h4\b[^<>]*>((?:(?!<\/h4>).)+)<\/h4>}{
            "\n" . $1 . "\n" . ('-' x length $1);
         }sgie;
   $s =~ s{<a\b[^<>]*?href="([^"]+)"[^<>]*>((?:(?!<\/a>).)+)<\/a>}{
            $links{$1} = $2 unless $1 eq $2 || $1 eq "mailto:$2" || $1 eq "http://$2";
            $2;
         }sgie;
   $s =~ s/^\s*//s;
   $s =~ s/\s*$//s;
   $s =~ s/\n{3,}/\n\n/sg;
   if (%links) {
      $s .= "\n\nLinks:";
      $s .= join('', map { sprintf("\n%s: %s", $links{$_}, $_) } sort keys %links);
   }
   return $s;
}

# Undo the light encoding applied to text in the feed/pages: decimal
# character references ("&#8211;") become the character itself, "&amp;"
# becomes "&", and a CDATA wrapper around the whole string is removed.
# Other entities are left as they are.
sub decode_akimbo_rss_encoded_text ($) {
   my($text) = @_;
   local($_, $`, $&, $', $1, $2, $3);
   $text =~ s{&#(\d+);}{chr $1}sge;
   $text =~ s{&amp;}{&}sg;
   if ($text =~ /^<!\[CDATA\[(.*?)\]\]>$/s) {
      $text = $1;
   }
   return $text;
}

# Convert a free-form deadline as written in a post into a sortable string.
#
# Parameters:
#   $s0      - deadline text, e.g. "Friday, June 4, 2021 at 5pm ET"
#   $pubdate - optional publication date ("YYYY-..."); its year is used when
#              the deadline text has none
# Returns:
#   undef                        for "by email only"
#   "Ongoing" / "Until filled"   for the corresponding phrases
#   the input unchanged          if it already is an ISO 8601 timestamp with offset
#   "YYYY-MM-DD[THH:MM[:SS]][ (TZ)]" otherwise, TZ being one of AT ET MT NT PT YT
# If no date can be recognised a warning is printed and whatever is left of
# the text (plus any time and zone found) is returned.
#
# The text is consumed piece by piece: time zone, "(TBD)", day of week, time
# of day, and finally the date itself (English and French month names).
sub decode_date ($;$) {
   my($s0, $pubdate) = @_;
   my $default_year;
   $default_year = $1 if defined $pubdate && $pubdate =~ /^(\d{4})-/;
   my $tz;
   my $dow;	# parsed but currently not part of the result
   my $tbd;	# FIXME NOT USED
   my $time;
   my($year, $month, $day);
   # NOTE(review): the second alternative below and two of the dash
   # alternatives look identical to their neighbours in this copy; they may
   # originally have been different Unicode characters - confirm.
   my $sp = "(?:\\s| )";
   my $dash = '(?:\s*-\s*|\s*–\s*|\s*–\s*|\s*\&#8211;\s*|\s+and\s+)';
   my $s = $s0;
   my $not_a_date_p = $s =~ /^(?:by email only)$/i; # No, I'm not kidding you
   if ($not_a_date_p) {
      $s = undef;
   } else {
      local($`, $&, $', $1, $2);
      $s =~ s/^(?:Application|Submission)s? (?:deadline is|must be received by) //;
      $s =~ s/^CV’s reviewed starting //;
      if ($s =~ /^(?:Ongoing(?: application cycle)?|applications accepted on an ongoing basis\.)$/i) { # XXX sigh...
         $s = 'Ongoing';
      } elsif ($s =~ /^(?:Open until filled|Until position is filled)$/i) { # XXX sigh...
         $s = 'Until filled';
      } elsif ($s !~ /^\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d[-\+]\d\d:\d\d$/) {
         # NOTE: If you add a new timezone, you need to modify determine_urgency
         # (The Atlantic pattern is case-sensitive; with /i it would match the word "at".)
         if ($s =~ /\s+A[S]?T\b|\.?\s*\(A[S]?T\)|\s+\(Atlantic\)|\s+Atlantic\s+time\b/) {
            ($s, $tz) = ("$`$'", 'AT');
         } elsif ($s =~ /\s+E[DS]?T\b|\.?\s*\((?:E[DS]?T|HNE)\)|\s+\(Eastern\)|\s+Eastern\s+Time\b/i) {
            ($s, $tz) = ("$`$'", 'ET');
         } elsif ($s =~ /\s+M[DS]?T\b|\.?\s*\(M[DS]?T\)|\s+\(Mountain\)|\s+Mountain\s+Time\b/i) {
            ($s, $tz) = ("$`$'", 'MT');
         } elsif ($s =~ /\s+N[DS]?T\b|\.?\s*\(N[DS]?T\)|\s+\(Newfoundland(?:(?:\s+Daylight)?\s+Time)?\)|\s+Newfoundland\s+Time\b/i) {
            ($s, $tz) = ("$`$'", 'NT');
         } elsif ($s =~ /\s+P[DS]?T\b|\.?\s*\(P[DS]?T\)|\s+\(Pacific\)|\s+Pacific\s+Time\b/i) {
            ($s, $tz) = ("$`$'", 'PT');
         } elsif ($s =~ /\s+Y[S]?T\b|\.?\s*\(Y[S]?T\)|\s+\(Yukon\)|\s+Yukon\s+Time\b/i) {
            ($s, $tz) = ("$`$'", 'YT');
         }
         if ($s =~ /\(TBD\)/) {
            ($s, $tbd) = ("$`$'", 1);
         }
         # Day of week: U M T W R F S
         if ($s =~ /\bSun(?:day)?,?\s+/i) {
            ($s, $dow) = ("$`$'", 'U');
         } elsif ($s =~ /\bMon(?:day)?,?\s+/i) {
            ($s, $dow) = ("$`$'", 'M');
         } elsif ($s =~ /\bTue(?:s(?:day)?)?,?\s+/i) {
            ($s, $dow) = ("$`$'", 'T');
         } elsif ($s =~ /\bWed(?:nes(?:day)?)?,?\s+/i) {
            ($s, $dow) = ("$`$'", 'W');
         } elsif ($s =~ /\bThu(?:r(?:s(?:day)?)?)?,?\s+/i) {
            ($s, $dow) = ("$`$'", 'R');
         } elsif ($s =~ /\bFri(?:day)?,?\s+/i) {
            ($s, $dow) = ("$`$'", 'F');
         } elsif ($s =~ /\bSat(?:ur(?:day)?)?,?\s+/i) {
            ($s, $dow) = ("$`$'", 'S');
         }
         # Drop "at"/"by"/"@" (or the start of a "between X and" range) in front of a time
         $s =~ s/(?:^|\.?\s+)(?:(?:@|at|by)\s+|between\s+\d{1,2}(?::\d\d)?\s*(?:[ap]\.m\.|[AP]M)?$dash)(?=\d)/ /i;
         $s =~ s/\s*\.\s*$//;
         if ($s =~ /^(?:[A-Z][a-z]+$sp+\d{1,2},$sp+\d{1,2}(?::\d\d)?(?:$sp*[ap]\.?m\.?)?)$sp+$dash$sp+([A-Z][a-z]+$sp+\d{1,2},$sp+\d{1,2}(?::\d\d)?(?:$sp*[ap]\.?m\.?)?)$/) { # two date/time ranges?
            $s = $1;
         }
         my $sp_optional_dash = "(?:$sp+|$sp*\@|$sp+&#8211;$sp+)";
         # Time of day; "$1 != 12 && $1" maps 12 a.m. to hour 0
         if ($s =~ /$sp_optional_dash(\d+)(?::(\d\d))?$sp*(?:a\.m|AM)\b/i) {
            ($s, $time) = ("$`$'", sprintf('%02d:%02d', $1 != 12 && $1, $2));
         } elsif ($s =~ /$sp_optional_dash(\d+)(?::(\d\d))?$sp*(?:p\.m|PM)\b/i) {
            ($s, $time) = ("$`$'", sprintf('%02d:%02d', ($1 % 12) + 12, $2));
         } elsif ($s =~ /$sp*\((\d+)(?::(\d\d))?$sp*(?:p\.m|PM)\)/i) {
            ($s, $time) = ("$`$'", sprintf('%02d:%02d', ($1 % 12) + 12, $2));
         } elsif ($s =~ /^(\d+)(?::(\d\d)(?::(\d\d))?)?$sp*(?:a\.m\.|AM)(?:,\s+|\s+(?:at|by|on)\s+)?/i) {
            ($s, $time) = ($', sprintf('%02d:%02d', $1 != 12 && $1, $2));
            $time .= sprintf(':%02d', $3) if defined $3;
         } elsif ($s =~ /^(\d+)(?::(\d\d)(?::(\d\d))?)?$sp*(?:p\.m\.|PM)(?:,\s+|\s+(?:at|by|on)\s+)?/i) {
            ($s, $time) = ($', sprintf('%02d:%02d', ($1 % 12) + 12, $2));
            $time .= sprintf(':%02d', $3) if defined $3;
         } elsif ($s =~ /(?:\s+(?:@|at|by)|,)\s+(?:12\s+)?noon\b/i || $s =~ /^(?:12\s+)?[Nn]oon\s+/i) {
            ($s, $time) = ("$`$'", '12:00');
         } elsif ($s =~ /(?:\s+(?:@|at|by)|,)\s+(?:12\s+)?midnight\b/i || $s =~ /^(?:12\s+)?[Mm]idnight\s+/i) {
            ($s, $time) = ("$`$'", '00:00');
         }
         $s =~ s/^\.\s*//;
         $s =~ s/^(?:by|on)\s+//;
         $s =~ s/\s*[\.,:\@]\s*$//;
         $s =~ s/\s+(?:to):?$//;
         $s =~ s/\s+$//;
         # What is left should be the date in one of the layouts below
         if ($s =~ /^(?:le\s+)?(\d{1,2})(?:st|nd|rd|th|er|e)?$sp+([A-Z][A-Za-zéû]+\.?)[,\/]?$sp+(\d{4})$/i) {
            ($year, $month, $day) = ($3, $2, $1);
         } elsif ($s =~ /^([A-Z][A-Za-z]+\.?)$sp+(\d{1,2})(?:st|nd|rd|th)?[,\/]?$sp+(\d{4})$/i) {
            ($year, $month, $day) = ($3, $1, $2);
         } elsif ($s =~ /^(\d{4})$sp+([A-Z][A-Za-z]+\.?)$sp+(\d{1,2})(?:st|nd|rd|th)?$/i) {
            ($year, $month, $day) = ($1, $2, $3);
         } elsif ($s =~ /^(\d{1,2})(?:st|nd|rd|th)?$sp+([A-Z][A-Za-z]+\.?)$/i && defined $default_year) {
            ($year, $month, $day) = ($default_year, $2, $1);
         } elsif ($s =~ /^([A-Z][A-Za-z]+\.?)$sp+(\d{1,2})(?:st|nd|rd|th)?$/i && defined $default_year) {
            ($year, $month, $day) = ($default_year, $1, $2);
         } elsif ($s =~ /^(\d{1,2})$sp+([a-zéû]+\.?)$sp+(\d{4})(?:(?:$sp+$dash)?$sp+(\d{1,2})$sp*h(?:$sp*(\d{2}))?)?$/i) { # French
            ($year, $month, $day, $time) = ($3, $2, $1, ($4? sprintf('%02d:%02d', $4, $5): $time));
         }
         $month = 1 if $month =~ /^(?:Jan(?:uary|\.)?|janvier)$/i;
         $month = 2 if $month =~ /^(?:Feb(?:ruary|\.)?|février)$/i;
         $month = 3 if $month =~ /^(?:Mar(?:ch|\.)?|mars)$/i;
         $month = 4 if $month =~ /^(?:Apr(?:il|\.)?|avril)$/i;
         $month = 5 if $month =~ /^(?:May|mai)$/i;
         $month = 6 if $month =~ /^(?:Jun(?:e|\.)?|juin)$/i;
         $month = 7 if $month =~ /^(?:Jul(?:y|\.)?|juillet)$/i;
         $month = 8 if $month =~ /^(?:Aug(?:ust|\.)?|août)$/i;
         $month = 9 if $month =~ /^(?:Sep(?:t(?:ember|\.)?|\.)?|septembre)$/i;
         $month = 10 if $month =~ /^(?:Oct(?:ober|\.)?|octobre)$/i;
         $month = 11 if $month =~ /^(?:Nov(?:ember|\.)?|novembre)$/i;
         $month = 12 if $month =~ /^(?:Dec(?:ember|\.)?|décembre)$/i;
         warn "Warning: Cannot parse date \"$s0\" (left: \"$s\"; default_year=$default_year)\n" unless defined $year;
         $s = sprintf('%04d-%02d-%02d', $year, $month, $day) if defined $year;
         $s .= "T$time" if defined $time;
         #$s .= " ($dow)" if defined $dow;
         $s .= " ($tz)" if defined $tz;
      }
   }
   return $s;
}

# Split a cached index page into one record per listing.
#
# Parameter: $data - array reference with the lines of the cached page.
# Returns an array reference of hash references, or undef if no listing was
# found. Two input formats are handled:
#   - RSS (first line contains "<?xml"): each <item> with a <guid> yields a
#     record with guid, title, link, pubDate, dc:creator, description,
#     content:encoded, category (array) and, if any text-deadline elements
#     are present, deadline (sorted array of decoded dates);
#   - HTML listing page: a line-oriented state machine yields records with
#     type, pubDate, link, title and, where present, venue-link and venue.
sub break_data_into_articles ($) {
   my($data) = @_;
   my $it;
   if ($data->[0] =~ /<\?xml/) {
      # RSS - In theory this is all good, but Akimbo doesn't encode real categories (e.g., calls) in the RSS, so it's of limited use
      print STDERR "break_data_into_articles: analyzing XML index...\n" if $debug_p > 1;
      for my $s (split(/\s*<item>(?:\s*<\/item>\s*)?/s, join("\n", @$data))) {
         $s =~ s/^.*<channel>.*<item>$//s;			# first item
         $s =~ s/<\/item>\s*<\/channel>\s*<\/rss>\s*$//s;	# last item
         $s =~ s/^\s*//s;
         $s =~ s/\s*$//s;
         if ($s =~ /<guid\b[^<>]*>(.*?)<\/guid>/s) {
            my $node = {'guid' => decode_akimbo_rss_encoded_text($1)};
            for my $field ('title', 'link', 'pubDate', 'dc:creator', 'description', 'content:encoded') {
               $node->{$field} = decode_akimbo_rss_encoded_text($1) if $s =~ /<$field\b[^<>]*>(.*?)<\/$field>/s;
            }
            for my $field ('category') {
               push @{$node->{$field}}, decode_akimbo_rss_encoded_text($1) while $s =~ /<$field\b[^<>]*>(.*?)<\/$field>/sg;
            }
            # Collect the distinct decoded deadlines. (They used to be stored
            # under a composite key and then looked up under 'date', which
            # always produced an empty list.)
            my %dates;
            while ($node->{'content:encoded'} =~ /<(\w+)[^<>]*?class="text-deadline"[^<>]*>([^<>]+)<\/\1>/sg) {
               my $decoded_deadline = decode_date($2, $node->{'pubDate'});
               $dates{$decoded_deadline} = 1 if defined $decoded_deadline;
            }
            $node->{'deadline'} = [sort keys %dates] if %dates;
            print STDERR "DEBUG: ($s)\n" if $debug_p > 2;
            print STDERR Dumper($node) if $debug_p > 1;
            push @$it, $node;
         }
      }
   } else {
      # The assumption here is this is from an actual listing page
      print STDERR "break_data_into_articles: analyzing HTML index...\n" if $debug_p > 1;
      # Calls only: https://akimbo.ca/listings/?fwp_listing_type=calls
      # States: 0 = waiting for a card body, 1 = reading type/date/title,
      # 4 = after the title, 5 = inside the venue block
      my $state = 0;
      my $node;
      my $insert_node = sub {
         print STDERR Dumper($node) if $debug_p > 1;
         push @$it, $node;
         $node = undef;
      };
      for my $s (@$data) {
         print STDERR "break_data_into_articles: state=$state, s=($s)\n" if $debug_p > 1;
         if ($state == 0) {
            if ($s =~ /<div class=(['"])(?:(?!\1).)*\bcard-body\b(?:(?!\1).)*\1>/) {
               $state = 1;
            }
         } elsif ($state == 1) {
            if ($s =~ /<a href='.*?fwp_listing_type=(\w+)'[^<>]*>([^<>]+)<\/a>/) {
               # FIXME This should apparently be multi-valued because I got a type = Exhibitions from the Calls for Submissions listing (!)
               print STDERR "break_data_into_articles: got type\n" if $debug_p > 2;
               $node->{'type'} = $1;
            } elsif ($s =~ /<time\b[^<>]*?\bdatetime='([^'']+)'>([^<>]+)<\/time>/) {
               print STDERR "break_data_into_articles: got pubDate\n" if $debug_p > 2;
               $node->{'pubDate'} = decode_date($1);
            } elsif ($s =~ /<h3[^<>]+><a href='([^'']+)'[^<>]*>([^<>]+)<\/a><\/h3>/) {
               print STDERR "break_data_into_articles: got link, title\n" if $debug_p > 2;
               $node->{'link'} = $1;
               $node->{'title'} = decode_akimbo_rss_encoded_text($2);
               $state = 4;
            }
         } elsif ($state == 4) {
            if ($s =~ /^\s*<div[^<>]*>Venue<\/div>\s*$/) {
               $state = 5;
            } elsif ($s =~ /<time class='timeago'/) { # XXX posting publication date - sorry, this is really a hack
               $insert_node->() if defined $node;
               $state = 1;
            }
         } elsif ($state == 5) {
            if ($s =~ /^\s*<div[^<>]*><a href='([^'']+)'[^<>]*>(.*?)<\/a><\/div>\s*$/) {
               print STDERR "break_data_into_articles: got venue-link, venue\n" if $debug_p > 2;
               $node->{'venue-link'} = $1;
               push @{$node->{'venue'}}, $2;
            } elsif ($s =~ /^\s*<div[^<>]*>(.*?)<\/div>\s*$/) {
               print STDERR "break_data_into_articles: got venue\n" if $debug_p > 2;
               push @{$node->{'venue'}}, $1;
            } elsif ($s =~ /^\s*<\/div>\s*$/) {
               $state = 4;
            }
         }
         # A new card always closes the record in progress
         if ($s =~ /<div class='card'>/) {
            $insert_node->() if defined $node;
            $state = 0;
         }
      }
      $insert_node->() if defined $node;
   }
   return $it;
}

# Build the complete article database.
#
# Steps:
#   1. load all cached posts;
#   2. read every cached index page and collect the listings in it (empty
#      cache files are deleted);
#   3. download every listed post that is not cached yet, pausing before each;
#   4. copy the fields of each post over the listing with the same link;
#   5. unless $disable_multi_deadline_hack_p is set, split a post whose
#      deadlines carry labels into one article per deadline;
#   6. per article, drop deadlines made redundant by a more precise variant
#      (same date with a time zone, or same date with a time) and store the
#      earliest remaining one as 'effective-deadline' ('N/A' if none).
#
# Returns a hash reference of articles keyed by link (split articles get an
# artificial key).
# Dies if the listings cache cannot be read; errors from get_post propagate.
sub merge_data () {

   # Read cached posts
   my $posts = get_cached_posts();
   my $articles;

   # Read cached index pages into articles list
   opendir(my $dir, $listings_cache) or die "$listings_cache: opendir: $!\n";
   while (defined(my $dirent = readdir $dir)) {
      my $input = "$listings_cache/$dirent";
      next unless -f $input;
      if (-z $input) {
         print STDERR "merge_data: $input: removing empty file\n" if $debug_p;
         unlink $input;
      } else {
         my $data = get_file_contents($input);
         if (defined $data) {
            # break_data_into_articles returns undef when a page has no listings
            for my $article (@{ break_data_into_articles($data) || [] }) {
               $articles->{$article->{'link'}} = $article;
            }
         }
      }
   }
   closedir $dir;

   # Check if any posts in index pages aren't yet cached
   my @need_to_fetch;
   for my $link (keys %$articles) {
      if (!$link) {
         print STDERR "merge_data: article has no link: no link ", Dumper($articles->{$link}) if $debug_p;
      } elsif (defined $posts->{$link}) {
         print STDERR "merge_data: article already in posts database: $link\n" if $debug_p;
      } else {
         print STDERR "Need to fetch $link", (defined $articles->{$link}->{'p'}? sprintf(' as %s', $articles->{$link}->{'p'}): ''), "...\n";
         push @need_to_fetch, $link;
      }
   }
   printf "Need to fetch %d post(s):\n", scalar @need_to_fetch if @need_to_fetch;

   # Fetch all posts that need to be fetched
   for my $link (@need_to_fetch) {
      insert_random_delay();
      my $article = get_post($link);
      $posts->{$article->{'link'}} = $article;
   }

   # Merge posts into articles list
   for my $link (keys %$posts) {
      for my $key (keys %{$posts->{$link}}) {
         $articles->{$link}->{$key} = $posts->{$link}->{$key};
      }
   }

   # Go through the articles and check for multiple deadlines; if it looks like we know what the subevents are split the article =P
   if (!$disable_multi_deadline_hack_p) {
      my $new_articles = {};
      for my $i (keys %$articles) {
         my $article = $articles->{$i};
         # An article known only from the index has no deadline list at all
         my $deadlines = $article->{'deadline'} || [];
         if (join('', @$deadlines) =~ /\t/s) {
            my $new_article = {};
            my $j = 0;
            for my $deadline (@$deadlines) {
               my($date, $label);
               ($date, $label) = ($1, $2) if $deadline =~ /^(.*?)\t(.*)$/;
               my $k = sprintf('%s\t#%s', $i, $date);
               for my $h (keys %$article) {
                  if ($h eq 'p') {		# article ID
                     $new_article->{$h} = sprintf('%s%s', $article->{$h}, chr(97 + $j));
                  } elsif ($h eq 'title') {
                     $new_article->{$h} = sprintf('%s – %s', sanitize_title($article->{$h}, $deadlines), $label);
                  } elsif ($h eq 'deadline') {
                     $new_article->{$h} = [$date];
                  } elsif ($h eq 'link') {	# link is used as a unique key so we must artificially make split keys
                     $new_article->{$h} = sprintf('%s#%s', $article->{$h}, chr(97 + $j));
                  } else {
                     $new_article->{$h} = $article->{$h};
                  }
               }
               $new_articles->{$k} = {%$new_article}; # NOTE must dereference then construct, otherwise we reuse the same object -> chaos
               $j += 1;
            }
         } else {
            $new_articles->{$i} = $article;
         }
      }
      $articles = $new_articles;
   }

   # Do a couple of things on the deadlines
   for my $link (keys %$articles) {
      my $article = $articles->{$link};
      my %deadlines = map { $_ => $_ } @{$article->{'deadline'} || []};

      # If we find one deadline with a time zone code and another one without, remove the one without
      # ($` is the part of the deadline in front of the matched suffix)
      for my $deadline (keys %deadlines) {
         delete $deadlines{$`} if $deadline =~ / \([A-Z]+\)$/ && defined $deadlines{$`};
      }

      # If we find one deadline with a time and another one without, remove the one without
      for my $deadline (keys %deadlines) {
         delete $deadlines{$`} if $deadline =~ /T\d\d:\d\d/ && defined $deadlines{$`};
      }
      $articles->{$link}->{'deadline'} = [sort keys %deadlines];

      # For each set of deadlines, calculate an "effective deadline", for sorting
      $article->{'effective-deadline'} = !%deadlines? 'N/A': $articles->{$link}->{'deadline'}->[0];
   }
   return $articles;
}

# Work out how soon the next deadline of an article is.
#
# Parameter: $deadlines - array reference of deadline strings as produced by
#                         decode_date (may be undef or empty).
# Returns, for the first deadline that is not in the past:
#   - the number of days from now (fractional) for a dated deadline,
#   - the string "1 (Ongoing)" (numerically 1) for "Ongoing" and
#     "Until filled" style deadlines.
# If every dated deadline is in the past, the (negative) value for the last
# one is returned; undef if no deadline could be interpreted.
#
# Deadlines are interpreted in the local time zone of the machine; a time
# zone code is applied as an offset from Eastern time, i.e. this assumes the
# machine itself runs on Eastern time.
sub determine_urgency ($) {
   my($deadlines) = @_;
   # Fractional days matter below: with integer arithmetic a deadline missed
   # by less than a day would compare as "not yet past".
   no integer;
   # Hours by which each zone is behind Eastern time (negative = ahead). A
   # deadline at 17:00 PT happens at 20:00 ET, hence the offset is added.
   my %hours_behind_eastern = (
      'PT' => 3,
      'MT' => 2,
      'YT' => 2,	# FIXME this is wrong because YT has no DST
      'CT' => 1,
      'AT' => -1,	# FIXME this is wrong because AT has no DST
      'NT' => -1.5,
   );
   my $t0 = time;
   my $urgency;
   for my $deadline (@$deadlines) {
      if ($deadline =~ /^(\d\d\d\d)-(\d\d)-(\d\d)(?:T(\d\d):(\d\d)(?::(\d\d))?)?(?: \(([A-Z]+)\))?$/) {
         my($year, $month, $day, $hour, $minute, $second, $tz) = ($1, $2, $3, $4, $5, $6, $7);
         my $t = mktime($second + 0, $minute + 0, $hour + 0, $day, $month - 1, $year - 1900);
         # XXX Adjust for time zone differences =P
         if (defined $tz && exists $hours_behind_eastern{$tz}) {
            $t += 3600 * $hours_behind_eastern{$tz};
         }
         $urgency = ($t - $t0)/86400;
      } elsif ($deadline =~ /^Ongoing$/) { # XXX sigh
         $urgency = "1 (Ongoing)";
      } elsif ($deadline =~ /^(?:Until filled|Open until filled|Until position is filled)$/) { # XXX sigh
         # "Until filled" is the form decode_date actually produces
         $urgency = "1 (Ongoing)";
      }
   last if defined $urgency && $urgency >= 0;
   }
   return $urgency;
}

# Lay a deadline out in fixed-width columns for the listing.
#
# A deadline of the form "YYYY-MM-DD[THH:MM[:SS]][ (TZ)]" becomes
# "YYYY-MM-DD HH:MM TZ", with blanks in place of a missing time or zone
# (seconds are not shown). Any other text is passed through, except that
# with $disable_multi_deadline_hack_p set everything from the first tab on
# (the marshalled label) is cut off.
sub format_deadline ($) {
   my($deadline) = @_;
   if (my @part = $deadline =~ /^(\d\d\d\d)-(\d\d)-(\d\d)(?:T(\d\d):(\d\d)(?::(\d\d))?)?(?: \(([A-Z]+)\))?$/) {
      my($year, $month, $day, $hour, $minute, undef, $tz) = @part;
      my @clock = defined $hour? ($hour, ':', $minute): ('', '', '');
      my $zone = defined $tz? $tz: '';
      $deadline = sprintf('%4s-%2s-%2s %2s%s%2s %-2s', $year, $month, $day, @clock, $zone);
   }
   if ($disable_multi_deadline_hack_p && $deadline =~ /\t/) {
      $deadline = $`; # remove marshalled label
   }
   return $deadline;
}

# Shorten a title for the listing: a leading "Call for/to Applicants|
# Applications|Artists|Artwork|Submissions:" is dropped, and a title of the
# form "X is hiring ... position(s): ..." is reduced to X. Other titles are
# returned unchanged.
sub shorten_title ($) {
   my($title) = @_;
   my @patterns = (
      qr/^(?:Call\s+(?:for|to)\s+(?:Applicants|Applications|Artists|Artwork|Submissions):\s+)(.*)$/i,
      qr/^([^:]+)\s+is\s+hiring\s+[\w\s]+\s+positions?:\s+(.*)$/i,
   );
   for my $pattern (@patterns) {
      return $1 if $title =~ $pattern;
   }
   return $title;
}

# Print one post as plain text: underlined title, body, categories as
# hashtags, and the link.
#
# Parameters:
#   $p        - post ID as shown in the listing (e.g. "12345", or "12345a"
#               for a post that was split by deadline)
#   $articles - hash reference returned by merge_data
# The ID is compared as a string first. If nothing matches and $p is purely
# numeric, the first split article of that post (ID followed by one letter)
# is shown instead. Candidates are examined in sorted key order so that the
# choice is reproducible.
# Dies with "No post numbered ..." if no article matches.
sub display_post ($$) {
   my($p, $articles) = @_;
   my @candidates = grep { defined $_->{'p'} } map { $articles->{$_} } sort keys %$articles;
   my($article) = grep { $_->{'p'} eq $p } @candidates;
   if (!defined $article && $p =~ /^\d+$/) {
      ($article) = grep { $_->{'p'} =~ /^\Q$p\E[a-z]$/ } @candidates;
   }
   die "No post numbered $p\n" unless defined $article;
   my $w = length $article->{'title'};
   printf "\n";
   printf "%s\n", $article->{'title'};
   printf "%s\n", ('=' x $w);
   printf "\n%s\n", convert_to_plain_text($article->{'content:encoded'}, $w);
   printf "\n";
   printf "%s\n", join(' ', map { "#$_" } @{$article->{'category'}});
   printf "%s\n", $article->{'link'};
}

# Print the table of articles (Type, Deadline, Urgency, ID, Title), most
# urgent first.
#
# Parameter: $articles - hash reference returned by merge_data.
# An article is listed if it has at least one deadline and its ID has no
# "ignore" entry in the rc file (or $almost_all_p is set), or if $all_p is
# set. Articles whose deadline passed more than three days ago are left out
# unless $almost_all_p or $all_p is set, and each ID is listed only once.
# Further deadlines of an article are printed on continuation lines.
# The Urgency column shows "(past)", "TODAY", "tomorrow", a number of days or
# weeks, "ongoing" for open-ended deadlines and "unknown" when no deadline
# could be interpreted.
# Nothing at all (not even the header) is printed if no article qualifies.
sub display_listing ($) {
   my($articles) = @_;
   my $config = read_ini_file($rc_path);
   my $fmt = "%-5.5s  %-21s  %-8s  %-6s  %s\n";
   my %seen;
   my @output;
   # Create the output but don't show it yet
   for my $key (sort { $articles->{$a}->{'effective-deadline'} cmp $articles->{$b}->{'effective-deadline'} } keys %$articles) {
      my $article = $articles->{$key};
      my $deadlines = $article->{'deadline'};
      my $p = $article->{'p'};
      my $has_deadline = $deadlines && @$deadlines;
      my $ignored = defined $config->{$p} && defined $config->{$p}->{'ignore'};
      next unless ($has_deadline && (!$ignored || $almost_all_p)) || $all_p;

      my $urgency = determine_urgency($deadlines);
      my $display_urgency;
      no integer;
      if (!defined $urgency) {
         $display_urgency = 'unknown';
      } elsif ($urgency =~ /\(Ongoing\)$/) {
         $display_urgency = 'ongoing';
      } elsif ($urgency < 0) {
         $display_urgency = '(past)';
      } elsif ($urgency < 1) {
         $display_urgency = 'TODAY';
      } elsif ($urgency < 2) {
         $display_urgency = 'tomorrow';
      } elsif ($urgency < 14) {
         $display_urgency = sprintf('%d day%s', $urgency, $urgency == 1? '': 's');
      } else {
         $display_urgency = sprintf('%d week%s', $urgency/7, $urgency/7 == 1? '': 's');
      }
      if (($urgency > -3 || $almost_all_p || $all_p) && !defined $seen{$article->{'p'}}) {
         my $text = sprintf($fmt, $article->{'type'}, format_deadline($article->{'effective-deadline'}), $display_urgency, $article->{'p'}, shorten_title($article->{'title'}));
         if ($deadlines && @$deadlines > 1) {
            # Continuation lines for the second and later deadlines
            for my $n (1 .. $#$deadlines) {
               $text .= sprintf($fmt, '', format_deadline($deadlines->[$n]), '', '', '');
            }
         }
         push @output, [$urgency, $text];
         $seen{$article->{'p'}} = 1 if $article->{'p'};
      }
   }
   # If any output has actually been created, display it
   if (@output) {
      printf $fmt, 'Type', 'Deadline', 'Urgency', 'ID', 'Title';
      for my $s (sort { $a->[0] <=> $b->[0] } @output) {
         print $s->[1];
      }
   }
}

# Main program: parse the command line, refresh the caches, then either show
# the posts whose IDs were given as arguments or, without arguments, the
# listing of all current calls.
Getopt::Long::Configure('no_auto_abbrev');
exit(1) unless GetOptions(
   'all|a' => \$all_p,
   'almost-all|A' => \$almost_all_p,
   'debug|d' => sub { $debug_p += 1; },	# may be repeated for more detail
   'disable-multi-deadline-hack' => \$disable_multi_deadline_hack_p,
   'dry-run|n' => \$dry_run_p,
);

fix_cache_paths();
get_feed();
my $data = merge_data();
if (!@ARGV) {
   display_listing($data);
} else {
   for my $id (@ARGV) {
      display_post($id, $data);
   }
}