#!/usr/bin/perl

# locale-gen
#
# Generates a glibc locale archive from templates, potentially limiting itself
# to a set of locales defined by the admin, typically within /etc/locale.gen.

use v5.36;

use Cwd qw(getcwd);
use Errno qw(ENOENT);
use Fcntl qw(SEEK_SET);
use File::Spec::Functions qw(canonpath catfile catdir splitpath);
use File::Temp qw(tempdir);
use Getopt::Long ();

# Formally stable as of v5.40; sufficiently functional in both v5.36 and v5.38.
use experimental qw(try);

# Determine the basename of the presently compiling script.
my $PROGRAM;
BEGIN { $PROGRAM = (splitpath(__FILE__))[-1]; }

my $VERSION = '3.4';

my $DEFERRED_SIGNAL = '';
my $PID = $$;
my $TEMPDIR;

# Unset BASH_ENV for security reasons. Even as sh(1), bash acts upon it.
delete $ENV{'BASH_ENV'};

# Protect against the inheritance of an unduly restrictive umask.
umask 0022;

{
	# Determine the locale directory, as reported by localedef(1).
	my $locale_dir = get_locale_dir();

	# Infer the path of a Gentoo Prefix environment, if any.
	my $gentoo_prefix = '';
	if (defined $locale_dir) {
		$gentoo_prefix = detect_gentoo_prefix($locale_dir);
		if (length $gentoo_prefix) {
			$locale_dir =~ s/^\Q$gentoo_prefix//;
		}
	}

	# Collect any supported options and option-arguments.
	my %opt = parse_opts($gentoo_prefix, @ARGV);
	my $prefix = $opt{'prefix'} // $gentoo_prefix;

	# Ensure that locale/charmap files are opened relative to the prefix.
	# It is especially important to do so in situations where the value of
	# $prefix is neither <slash> nor an empty string.
	$ENV{'I18NPATH'} = catdir($prefix, '/usr/share/i18n');

	# For the directory to be unknown strongly implies the absence of glibc.
	if (! defined $locale_dir) {
		die "$PROGRAM: Aborting because the OS does not appear to use GNU libc\n";
	}

	# Honour the --quiet option.
	if ($opt{'quiet'} && ! open *STDOUT, '>/dev/null') {
		die "Can't direct STDOUT to /dev/null: $!";
	}

	# Ensure that the C.UTF-8 locale is made available.
	my @locales = ([ 'C', 'UTF-8', 'C.UTF-8' ]);

	# Compose a list of up to two configuration files to be read.
	my @config_files = select_config_files($prefix, %opt);

	# Collect the locales that are being requested for installation.
	push @locales, read_config($prefix, @config_files);

	# Compose a dictionary of installed locales for the --update option.
	my %installed_by;
	if ($opt{'update'}) {
		# If localedef(1) originates from a Gentoo Prefix environment,
		# the prefix will already have been hard-coded by the utility.
		my $explicit_prefix = length $gentoo_prefix ? undef : $prefix;
		%installed_by = map +( $_ => 1 ), list_locales($explicit_prefix);
	}

	# Filter out locales that are duplicates or that are already installed.
	my %requested_by;
	my $i = 0;
	while ($i <= $#locales) {
		my $canonical = $locales[$i][2];
		my $normal = normalize($canonical);
		if ($requested_by{$normal}++ || $installed_by{$normal}) {
			splice @locales, $i, 1;
		} else {
			++$i;
		}
	}

	# If a non-actionable update was requested, proceed no further.
	if (! @locales) {
		print "All of the requested locales are presently installed.\n";
		exit;
	}

	# A proxy check is justified because compilation may take a long time.
	check_archive_dir($prefix, $locale_dir);

	# Create a temporary directory and switch to it.
	$TEMPDIR = enter_tempdir($prefix);

	# Compile the selected locales.
	generate_locales($opt{'jobs'}, @locales);

	# Integrate the newly compiled locales into the system's locale archive.
	my $size = do {
		my @canonicals = map +( $_->[2] ), @locales;
		generate_archive($prefix, $gentoo_prefix, $locale_dir, $opt{'update'}, @canonicals);
	};

	my $total = scalar @locales + scalar %installed_by;
	printf "Successfully installed an archive containing %d locale%s, of %s MiB in size.\n",
		$total, plural($total), round($size / 2 ** 20);
}

sub get_locale_dir () {
	my $stdout = qx{ LC_ALL=C localedef --help 2>/dev/null };
	if ($? == 0 && $stdout =~ m/\hlocale path\h*:\s+(\/[^:]+)/) {
		return canonpath($1);
	} elsif (($? & 0x7F) == 0) {
		# The child terminated normally (in the sense of WIFEXITED).
		return undef;
	} else {
		throw_child_error('localedef');
	}
}

sub detect_gentoo_prefix ($path) {
	if ($path !~ s/\/usr\/lib\/locale\z//) {
		die "Can't handle unexpected locale directory of '$path'";
	} elsif (length $path && -e "$path/etc/gentoo-release") {
		return $path;
	} else {
		return '';
	}
}

sub parse_opts ($known_prefix, @args) {
	my @options = (
		[ 'config|c=s' => "The file containing the chosen locales (default: $known_prefix/etc/locale.gen)" ],
		[ 'all|A'      => 'Select all locales, ignoring the config file' ],
		[ 'update|u'   => 'Skip any chosen locales that are already installed', ],
		[ 'jobs|j=i'   => 'Maximum number of localedef(1) instances to run in parallel' ],
		[ 'prefix|p=s' => 'The prefix of the root filesystem' ],
		[ 'quiet|q'    => 'Only show errors' ],
		[ 'version|V'  => 'Output version information and exit' ],
		[ 'help|h'     => 'Display this help and exit' ]
	);

	# Parse the provided arguments.
	my $parser = Getopt::Long::Parser->new;
	$parser->configure(qw(posix_default bundling_values no_ignore_case));
	my %opt;
	{
		# Decorate option validation errors while also not permitting
		# for more than one to be reported.
		local $SIG{'__WARN__'} = sub ($error) { die "$PROGRAM: $error" };
		$parser->getoptionsfromarray(\@args, \%opt, map +( $_->[0] ), @options);
	}

	# If either --help or --version was specified, exclusively attend to it.
	if ($opt{'help'}) {
		show_usage(@options);
		exit;
	} elsif ($opt{'version'}) {
		show_version();
		exit;
	}

	# Validate the options and option-arguments.
	if ($opt{'all'} && exists $opt{'config'}) {
		die "$PROGRAM: The --all and --config options are mutually exclusive\n";
	} elsif (length $opt{'prefix'} && $opt{'prefix'} !~ m/^\//) {
		die "$PROGRAM: The --prefix option must specify either a null string or an absolute path\n";
	}

	# Assign values for unspecified options that need them.
	if (! exists $opt{'jobs'} || $opt{'jobs'} < 1) {
		$opt{'jobs'} = get_nprocs() || 1;
	}

	# Replace the special <hyphen-minus> operand with "/dev/stdin".
	if (exists $opt{'config'} && $opt{'config'} eq '-') {
		$opt{'config'} = '/dev/stdin';
	}

	return %opt;
}

sub select_config_files ($prefix, %opt) {
	my $path1 = catfile($prefix, '/etc', 'locale.gen');
	my $path2 = catfile($prefix, '/usr/share/i18n', 'SUPPORTED');
	return do {
		if (exists $opt{'config'}) {
			$opt{'config'};
		} elsif ($opt{'all'}) {
			$path2;
		} elsif (exists $ENV{'LOCALEGEN_CONFIG'}) {
			$ENV{'LOCALEGEN_CONFIG'};
		} else {
			$path1, $path2;
		}
	};
}

sub show_usage (@options) {
	print "Usage: locale-gen [OPTION]...\n\n";
	my $pipe;
	if (! open $pipe, "| column -t -s \037") {
		exit 1;
	}
	for my $row (@options) {
		my ($spec, $description) = $row->@*;
		my ($long, $short) = split /[|=]/, $spec;
		printf {$pipe} "-%s, --%s\037%s\n", $short, $long, $description;
	}
	close $pipe;
	print "\nSee also: locale-gen(8), locale.gen(5)\n";
}

sub show_version () {
	print <<~EOF;
	locale-gen $VERSION
	Copyright 2024 Kerin Millar <kfm\@plushkava.net>
	License GPL-2.0-only <https://spdx.org/licenses/GPL-2.0-only.html>
	EOF
}

sub list_locales ($prefix) {
	if (! defined(my $pid = open my $pipe, '-|')) {
		die "Can't fork: $!";
	} elsif ($pid == 0) {
		run_localedef($prefix, '--list-archive');
	} else {
		chomp(my @locales = readline $pipe);
		if (-1 == waitpid($pid, 0) || $? != 0) {
			die "$PROGRAM: Can't obtain a list of the presently installed locales\n";
		}
		return @locales;
	}
}

sub normalize ($canonical) {
	# This is similar to the normalize_codeset() function of localedef(1).
	if ($canonical !~ m/(?<=\.)[^@]+/p) {
		die "Can't normalize " . render_printable($canonical);
	} else {
		# en_US.UTF-8 => en_US.utf8
		# de_DE.ISO-8859-15@euro => de_DE.iso885915@euro
		my $codeset = lc ${^MATCH} =~ tr/0-9A-Za-z//cdr;
		return ${^PREMATCH} . $codeset . ${^POSTMATCH};
	}
}

sub read_config ($prefix, @paths) {
	# Compose a dictionary of locale names known to be valid.
	my %locale_by = map +( $_ => 1 ), get_valid_locales($prefix);

	# Compose a dictionary of character maps known to be valid.
	my %charmap_by = map +( $_ => 1 ), get_valid_charmaps($prefix);

	# Iterate over the given paths and return the first non-empty list of
	# valid locale declarations that can be found among them, if any.
	for my $i (keys @paths) {
		my $path = $paths[$i];
		my $fh;
		try {
			$fh = fopen($path);
		} catch ($e) {
			# Disregard open(2) errors concerning non-existent files
			# unless there are no more paths to be tried.
			if ($! == ENOENT && $i < $#paths) {
				next;
			} else {
				die $e;
			}
		}
		my @locales = parse_config($fh, $path, \%locale_by, \%charmap_by);
		if (my $count = scalar @locales) {
			printf "Found %d locale declaration%s in '%s'.\n",
				$count, plural($count), $path;
			return @locales;
		}
	}

	# For no locales to have been discovered at this point is exceptional.
	my $path_list = render_printable(scalar @paths == 1 ? $paths[0] : \@paths);
	die "$PROGRAM: No locale declarations were found within $path_list\n";
}

sub get_valid_locales ($prefix) {
	my $cmd = qq{ find . ! -path . -prune ! -path '*\n*' -type f -exec grep -lxF LC_IDENTIFICATION {} + };
	my $top = catdir($prefix, '/usr/share/i18n/locales');
	my $pwd = getcwd();
	if (! chdir $top) {
		die "$PROGRAM: Can't chdir to '$top': $!\n";
	} elsif (! (my @paths = readpipe $cmd) || $? != 0) {
		die "$PROGRAM: Failed to compose a list of valid locale names from '$top'\n";
	} elsif (defined $pwd && ! chdir $pwd) {
		die "$PROGRAM: Can't chdir to '$pwd': $!\n";
	} else {
		chomp @paths;
		return map +( (splitpath($_))[-1] ), @paths;
	}
}

sub get_valid_charmaps ($prefix) {
	my $top = catdir($prefix, '/usr/share/i18n/charmaps');
	if (! opendir my $dh, $top) {
		die "$PROGRAM: Can't open '$top' for reading: $!\n";
	} elsif (! (my @names = map +( -f "$top/$_" ? s/\.(gz|bz2)\z//nr : () ), readdir $dh)) {
		die "$PROGRAM: Failed to compose a list of valid character maps from '$top'\n";
	} else {
		return @names;
	}
}

sub parse_config ($fh, $path, $locale_by, $charmap_by) {
	# Set up a helper routine to throw for validation errors.
	my $thrower = sub ($error, $line) {
		die sprintf "%s: %s at %s[%d]: %s\n",
			$PROGRAM, $error, $path, $., render_printable($line);
	};

	my @locales;
	while (my $line = readline $fh) {
		# Skip comments and blank lines. Note that \h will match only
		# " " and "\t", since the input stream is not being decoded.
		next if $line =~ m/^\h*($|#)/n;

		# Permit comments trailing locale declarations.
		$line =~ s/\h\K#\h.*//;

		# Expect for two fields, separated by horizontal whitespace.
		my @fields;
		chomp $line;
		if (2 != (@fields = split /\h+/, trim_line($line), 3)) {
			$thrower->('Malformed locale declaration', $line);
		}

		# Extract the specified locale and character map. Upon success,
		# a canonicalised representation of the locale is also returned.
		my ($locale, $codeset, $charmap, $canonical) = parse_entry(@fields);

		# Validate both locale and character map before accepting.
		if (! $locale_by->{$locale}) {
			$thrower->('Invalid locale', $line);
		} elsif (defined $codeset && $codeset ne $charmap) {
			$thrower->('Mismatching codeset/charmap', $line);
		} elsif (! $charmap_by->{$charmap}) {
			$thrower->('Invalid charmap', $line);
		} else {
			push @locales, [ $locale, $charmap, $canonical ];
		}
	}

	return @locales;
}

sub parse_entry ($locale, $charmap) {
	my $canonical;
	my $codeset;
	if (2 == (my @fields = split /@/, $locale, 3)) {
		# de_DE@euro ISO-8859-15 => de_DE.ISO-8859-15@euro
		$canonical = sprintf '%s.%s@%s', $fields[0], $charmap, $fields[1];
	} elsif (2 == (@fields = split /\./, $locale, 3)) {
		# en_US.UTF-8 UTF-8 => en_US.UTF-8
		($locale, $codeset) = @fields;
		$canonical = "$locale.$codeset";
	} elsif (1 == @fields) {
		# en_US ISO-8859-1 => en_US.ISO-8859-1
		$canonical = "$locale.$charmap";
	}
	return $locale, $codeset, $charmap, $canonical;
}

sub check_archive_dir ($prefix, $locale_dir) {
	my $archive_dir = local $ENV{'DIR'} = catdir($prefix, $locale_dir);

	# Quietly attempt to create the directory if it does not already exist.
	system q{ mkdir -p -- "$DIR" 2>/dev/null };

	# Check whether the directory exists and can be modified by the EUID.
	if (! utime undef, undef, $archive_dir) {
		my $username = get_username();
		die "$PROGRAM: Aborting because '$username' can't modify '$archive_dir': $!\n";
	}
}

sub enter_tempdir ($prefix) {
	# Given that /tmp might be a tmpfs, prefer /var/tmp so as to avoid
	# undue memory pressure.
	my $dir = catdir($prefix, '/var/tmp');
	if (! -d $dir) {
		$dir = File::Spec->tmpdir;
	}
	my $tmpdir = tempdir('locale-gen.XXXXXXXXXX', 'DIR' => $dir);
	if (! chdir $tmpdir) {
		die "$PROGRAM: Can't chdir to '$tmpdir': $!\n";
	} else {
		return $tmpdir;
	}
}

sub generate_locales ($workers, @locales) {
	# Trap SIGINT and SIGTERM so that they may be handled gracefully.
	my $handler = sub ($signal) { $DEFERRED_SIGNAL ||= $signal };
	local @SIG{'INT', 'TERM'} = ($handler, $handler);

	my $total = scalar @locales;
	if ($total < $workers) {
		$workers = $total;
	}
	printf "Compiling %d locale%s with %d worker%s ...\n",
		$total, plural($total), $workers, plural($workers);

	my $num_width = length $total;
	my %status_by;
	for my $i (keys @locales) {
		# Ensure that the number of concurrent workers is bounded.
		if ($i >= $workers) {
			my $pid = wait;
			last if 0 != ($status_by{$pid} = $?);
		}

		my ($locale, $charmap, $canonical) = $locales[$i]->@*;
		printf "[%*d/%d] Compiling locale: %s\n",
			$num_width, $i + 1, $total, $canonical;

		# Fork and execute localedef(1) for locale compilation.
		if (! defined(my $pid = fork)) {
			warn "Can't fork: $!";
			last;
		} elsif ($pid == 0) {
			@SIG{'INT', 'TERM'} = ('DEFAULT', 'DEFAULT');
			compile_locale($locale, $charmap, $canonical);
		}
	} continue {
		last if $DEFERRED_SIGNAL;
	}

	# Reap any subprocesses that remain.
	if ($workers > 1) {
		print "Waiting for active workers to finish their jobs ...\n";
	}
	while (-1 != (my $pid = wait)) {
		$status_by{$pid} = $?;
	}

	# Abort if any of the collected status codes are found to be non-zero.
	# In the case that one subprocess was interrupted by a signal while
	# another exited non-zero, the resulting diagnostic shall allude to the
	# signal. Such determinism is achieved by sorting the values.
	for my $status (sort { $a <=> $b } values %status_by) {
		throw_child_error('localedef', $status);
	}

	if ($DEFERRED_SIGNAL) {
		# The signal shall be propagated by the END block.
		exit;
	} elsif (scalar %status_by != $total) {
		die "$PROGRAM: Aborting because not all of the selected locales were compiled\n";
	}
}

sub compile_locale ($locale, $charmap, $canonical) {
	my $output_dir = "./$canonical";
	my @args = ('--no-archive', '-i', $locale, '-f', $charmap, '--', $output_dir);
	run_localedef(undef, @args);
}

sub generate_archive ($prefix, $gentoo_prefix, $locale_dir, $do_update, @canonicals) {
	# Create the temporary subdir that will contain the new locale archive.
	my $output_dir = catdir('.', $gentoo_prefix, $locale_dir);
	run('mkdir', '-p', '--', $output_dir);

	# Determine the eventual destination path of the archive.
	my $final_path = catfile($prefix, $locale_dir, 'locale-archive');
	print "The location of the archive shall be '$final_path'.\n";

	# If --update was specified, make a copy of the existing archive.
	if ($do_update && -e $final_path) {
		run('cp', '--', $final_path, "$output_dir/");
	}

	# Integrate all of the compiled locales into the new locale archive.
	my $total = scalar @canonicals;
	printf "Adding %d locale%s to the locale archive ...\n", $total, plural($total);
	my $stderr = fopen('stderr.log', '+>');
	redirect_stderr($stderr, sub {
		my @args = ('--quiet', '--add-to-archive', '--replace', '--', @canonicals);
		run_localedef('.', @args);
	});

	# Propagate the diagnostics and errors raised by localedef(1), if any.
	seek $stderr, 0, SEEK_SET;
	my $i = 0;
	while (my $line = readline $stderr) {
		warn $line;
		++$i;
	}
	close $stderr;

	# Check the status code first.
	throw_child_error('localedef');

	# Sadly, the exit status of GNU localedef(1) is nigh on useless in the
	# case that the --add-to-archive option is provided. If anything was
	# printed to STDERR at all, act as if the utility had exited 1.
	if ($i > 0) {
		throw_child_error('localedef', 1 << 8);
	}

	# The process of replacing the old archive must not be interrupted.
	local @SIG{'INT', 'TERM'} = ('IGNORE', 'IGNORE');

	# Move the newly minted archive into the appropriate filesystem. Use
	# mv(1), since there is a chance of crossing a filesystem boundary.
	my $interim_path = "$final_path.$$";
	run('mv', '--', catfile($output_dir, 'locale-archive'), $interim_path);

	# Atomically replace the old archive.
	if (! rename $interim_path, $final_path) {
		{
			local $!;
			unlink $interim_path;
		}
		die "$PROGRAM: Can't rename '$interim_path' to '$final_path': $!\n";
	}

	# Return the size of the archive, in bytes.
	if (! (my @stat = stat $final_path)) {
		die "$PROGRAM: Can't stat '$final_path': $!\n";
	} else {
		return $stat[7];
	}
}

sub run_localedef ($archive_prefix, @args) {
	# Incorporate the --prefix option, if requested. Its only effect is to
	# cause localedef(1) to prepend the option-arg to the archive path.
	if (length $archive_prefix) {
		unshift @args, '--prefix', $archive_prefix;
	}

	# Prevent the --verbose option from being potentially implied.
	delete local $ENV{'POSIXLY_CORRECT'};

	# Execute localedef(1). Don't fork if doing so from a child process.
	my @cmd = ('localedef', @args);
	if ($$ == $PID) {
		system @cmd;
	} elsif (! exec @cmd) {
		exit 1;
	}
}

sub fopen ($path, $mode = '<') {
	if (! open my $fh, $mode, $path) {
		die "$PROGRAM: Can't open '$path': $!\n";
	} elsif (! -f $fh && canonpath($path) !~ m/^\/dev\/(null|stdin)\z/n) {
		die "$PROGRAM: Won't open '$path' because it is not a regular file\n";
	} else {
		return $fh;
	}
}

sub get_nprocs () {
	chomp(my $nproc = qx{ { nproc || getconf _NPROCESSORS_CONF; } 2>/dev/null });
	return $nproc;
}

sub plural ($int) {
	return $int == 1 ? '' : 's';
}

sub redirect_stderr ($stderr, $callback) {
	if (! open my $old_stderr, '>&', *STDERR) {
		die "Can't dup STDERR to a new file descriptor: $!";
	} elsif (! open *STDERR, '>&', $stderr) {
		my $fileno = fileno $stderr;
		die "Can't dup file descriptor #$fileno to STDERR: $!";
	} else {
		$callback->();
		open *STDERR, '>&=', $old_stderr;
	}
}

sub render_printable ($value) {
	require JSON::PP;
	return JSON::PP->new->ascii->space_after->encode($value)
}

sub run ($cmd, @args) {
	system $cmd, @args;
	throw_child_error($cmd);
}

sub throw_child_error ($cmd, $status = $?) {
	if ($status == -1) {
		# The program could not be started. Since Perl will already
		# have printed a warning, no supplemental diagnostic is needed.
		exit 1;
	} elsif ($status != 0) {
		my $fate = ($status & 0x7F) ? 'interrupted by a signal' : 'unsuccessful';
		die "$PROGRAM: Aborting because the execution of '$cmd' was $fate\n";
	}
}

sub trim_line ($line) {
	$line =~ s/^\h+//;
	$line =~ s/\h+$//;
	return $line;
}

sub get_username () {
	local $!;
	return getpwuid($>) // $ENV{'LOGNAME'};
}

sub round ($number) {
	# Evaluation conveniently trims insignificant trailing zeroes.
	return eval(sprintf '%.2f', $number);
}

END {
	if ($$ == $PID) {
		if (length $TEMPDIR) {
			local $?;
			system 'rm', '-r', '--', $TEMPDIR;
		}

		# The default SIGINT and SIGTERM handlers are suppressed by
		# generate_locales. The former is especially important, per
		# http://www.cons.org/cracauer/sigint.html.
		if ($DEFERRED_SIGNAL) {
			kill $DEFERRED_SIGNAL, $$;
		}
	}
}
