#!/usr/bin/perl my $VERSION = 0.2; package Fileinfo; use strict; use warnings; use File::Basename (); use File::Spec; use File::Listing; *pathjoin = sub{ File::Spec->join(@_) }; # constructor. # new Fileinfo( $local_path ) # new Fileinfo( $remote_path, $ls_entry, $ftp) sub new { my $f = bless +{}, (ref($_[0])? ref(shift) : shift); my($path,$ls_entry, $ftp)= @_; $f->path( $path ); $f->ftp($ftp); $f->size($f->mtime($f->mode($f->exists(0)))); $f->type('?'); $f->alphadog(1); if (defined $ls_entry){ # It's an `ls' entry returned from parse_dir as # [ $name, $type, $size, $mtime, $mode ] $f->type($ls_entry->[1] || '?'); $f->size($ls_entry->[2]); $f->mtime($ls_entry->[3]); $f->mode($ls_entry->[4]); $f->exists(1); } else{ # It's a local path. my @s = lstat $f->path; if (@s){ $f->type ( -f _ ? 'f': -d _ ? 'd': -l _ ? 'l': '?'); $f->size($s[7]); $f->mtime($s[9]); $f->mode($s[2]); $f->exists(1); } } $f->dirname(File::Basename::dirname($f->path)); $f->basename(File::Basename::basename($f->path)); $f->{'ls'} = []; # Don't call method here. return $f; } # create accessors for my $attr (qw( path dirname basename size mtime mode type exists has_ls ftp alphadog)) { no strict 'refs'; *{ __PACKAGE__ . "::" . $attr} = sub { $_[0]->{$attr} = $_[1] if @_ > 1; $_[0]->{$attr} }; } sub is_dir { shift->type eq 'd' } sub is_link { shift->type eq 'l' } sub is_file { shift->type eq 'f' } sub to_string { my $path = shift->path; defined($path) ? $path : 'undef' }; sub ls { my $f = shift; return $f->{'ls'} unless $f->is_dir; if (defined($f->path) && $f->is_dir && not $f->has_ls){ $f->{'ls'} = []; if (defined $f->ftp){ # remote for my $ls_entry (parse_dir($f->ftp->dir($f->path))){ next if $ls_entry->[0] =~ /^\.\.?$/; push @{$f->{'ls'}}, new Fileinfo( pathjoin($f->path, $ls_entry->[0]), $ls_entry, $f->ftp); } $f->has_ls(1); } else{ # local opendir DIR, $f->path or ::fail("Could not opendir ". $f->path); while (my $dir_entry = readdir DIR){ next if $dir_entry =~ /^\.\.?$/; push @{$f->{'ls'}}, new Fileinfo( pathjoin($f->path, $dir_entry)); } close DIR; $f->has_ls(1); } } return $f->{'ls'}; } # --------------------------------------------------------------------------- # Main # --------------------------------------------------------------------------- package main; use strict; use warnings; use POSIX qw(setuid setgid); use URI; use Getopt::Long; use Pod::Usage; use Net::FTP; use File::Path (); use File::Find; use File::Basename (); use File::Spec; use File::Listing; use constant { Defer_It => 0, Skip_It => 1, Fetch_It => 2, Delete_It => 4, }; *pathjoin = sub{ File::Spec->join(@_) }; ## OPTIONS my ($opt_verbose, $opt_allow_root) = (0,0); my ($opt_existing, $opt_delete, $opt_debug) = (0,0,0); my ($opt_passive_ftp, $opt_squish, $opt_recurse) = (0,0,0); my $opt_maxdepth = 2**31; my ($opt_setuid, $opt_setgid); my %known_mirrors = ( 'cpan' => \&opt_cpan, 'gnu' => \&opt_gnu ); ## REGEXPS LISTS my (@rx_paths,@rx_basenames); my (@rx_not_paths,@rx_not_basenames); ## PARSE COMMAND LINE GetOptions( 'existing' => \$opt_existing, 'delete' => \$opt_delete, 'passive-ftp!' => \$opt_passive_ftp, 'verbose!' => \$opt_verbose, 'debug!' => \$opt_debug,, 'version' => sub { print "mirror-tightwad $VERSION\n"; exit 0 }, 'squish!' => \$opt_squish, 'allow-root!' => \$opt_allow_root, 'setuid=s' => \$opt_setuid, 'setgid=s' => \$opt_setgid, 'maxdepth=i' => \$opt_maxdepth, 'recurse!' => \$opt_recurse, 'mirror=s' => sub{ fail("Unknown mirror '$_[1]'") unless exists $known_mirrors{$_[1]}; $known_mirrors{$_[1]}->(); }, 'path=s' => sub{ opt_regexp($_[1],\@rx_paths,\@rx_not_paths) }, 'basename=s' => sub{ opt_regexp($_[1],\@rx_basenames,\@rx_not_basenames) }, ) && scalar(@ARGV) == 2 or pod2usage( -exitval => 1, -verbose => 1); $opt_verbose = 1 if $opt_debug; my $uri = new URI($ARGV[0]); $uri->scheme eq 'ftp' or fail("Only ftp:// URIs are supported."); my $local_root = new Fileinfo($ARGV[1]); ## DEFINE THE RULES my (@dir_rules,@file_rules); ## Dir rule: existing push @dir_rules, sub { $_[1]->exists ? Defer_It : Skip_It } if $opt_existing; ## Dir rule: regexps push @dir_rules, sub { return Skip_It if @rx_not_paths && scalar grep { $_[0]->path =~ $_ } @rx_not_paths; return Fetch_It if @rx_paths && scalar grep { $_[0]->path =~ $_ } @rx_paths; Defer_It; }; ## File rule: existing #push @file_rules, sub { $_[1]->exists ? Defer_It : Skip_It } if $opt_existing; ## File rule: regexps push @file_rules, sub { return Skip_It if @rx_not_basenames && scalar grep{$_[0]->basename =~ $_} @rx_not_basenames; return Skip_It if @rx_not_paths && scalar grep {$_[0]->path =~ $_} @rx_not_paths; return Fetch_It if @rx_basenames && scalar grep { $_[0]->basename =~ $_ } @rx_basenames; return Skip_It if @rx_basenames; return Fetch_It if @rx_paths && scalar grep { $_[0]->path =~ $_} @rx_paths; return Skip_It if @rx_paths; Defer_It; }; ## Running as root must be explictly allowed if ( $> == 0 ) { ## we're running as root. if (not $opt_allow_root){ if (not defined($opt_setuid) or not defined($opt_setgid)){ fail("Refusing to run as root. Use --allow-root or --setuid=USER --setgid=GROUP"); } my $uid = $opt_setuid =~ /^\d+$/ ? $opt_setuid : getpwnam $opt_setuid; my $gid = $opt_setgid =~ /^\d+$/ ? $opt_setgid : getgrnam $opt_setgid; verbose("Setting gid to $gid\n"); POSIX::setgid($gid); verbose("Setting uid to $uid\n"); POSIX::setuid($uid); } else{ verbose("running as superuser"); } } debug("uri: " . $uri->as_string ); debug("uri_path: " . $uri->path ); debug("local_root: " . $local_root->path); ## CONNECT verbose("Opening " . ($opt_passive_ftp ? 'passive ' : ' ') . $uri->scheme . " connection to " . $uri->host . " port " . $uri->port); my $ftp = Net::FTP->new($uri->host, Passive => $opt_passive_ftp) or fail($@); ## LOGIN verbose("Login"); $ftp->login or fail("Could not login"); ## CHDIR verbose("Cwd " . $uri->path); $ftp->cwd($uri->path) or fail("Could not cwd to remote path: " . $uri->path); ## Pretend that we got a dir list entry for the remote root ## -- because dir("..") will not work at the top level. my $remote_pwd = $ftp->pwd; my $remote_root = new Fileinfo($remote_pwd, [ $remote_pwd , 'd', 1, time(), 0555 ], $ftp); ftp_dir_walk($ftp, $remote_root, $local_root, 0); verbose("exiting"); exit 0; ###################################################################### ###################################################################### # FTP recursive descent. sub ftp_dir_walk { my ($ftp, $remote_dir, $local_dir, $depth) = @_; #debug("ftp_dir_walk( " . $remote_dir->path .", ". $local_dir->path . ")\n"); verbose("Reading directory " . $remote_dir->path ); debug("Local directory is " . $local_dir->path ); my $entries = $remote_dir->ls; build_squishes($entries); my %touch; # local_basename => [remote,local] ## Call file/link rules foreach my $new_remote_file ( grep { $_->is_file || $_->is_link } @$entries) { my $new_local_file = new Fileinfo(pathjoin($local_dir->path, $new_remote_file->basename)); $touch{$new_local_file->basename} = [$new_remote_file, $new_local_file] if should_fetch_file($new_remote_file, $new_local_file); } ## Delete files not in the touch hash. if ($opt_delete ){ for (grep{$_->is_file||$_->is_link } @{$local_dir->ls}){ if (not exists $touch{$_->basename}){ verbose("unlinking " . $_->path ); unlink $_->path; } } } ## Touch the files we want rsync to update for my $a (values %touch){ touch_file(@$a); } ## Maxdepth cutoff return if $depth >= $opt_maxdepth; ## Recurse return unless $opt_recurse; for my $new_remote_dir ( grep {$_->is_dir} @$entries ){ my $new_local_dir = new Fileinfo(pathjoin($local_dir->path, $new_remote_dir->basename)); ## Call dir rules my $action = apply_rules($new_remote_dir,$new_local_dir,\@dir_rules, Fetch_It); next if $action == Skip_It; ## Recurse ftp_dir_walk ($ftp, $new_remote_dir, $new_local_dir, $depth+1); } } ## Make the big decision. sub should_fetch_file { my ($R, $L) = @_; ## Don't even look at it unless it passes the regexp rules return 0 unless apply_rules($R, $L, \@file_rules, Skip_It) == Fetch_It; ## Squish unless it is the alpha dog. return 0 if $opt_squish && not $R->alphadog; ## Apply the 'existing' constraint. if ($opt_existing){ # TODO } ## Ok return 1; } sub touch_file { my ($remote_file, $local_file) = @_; ## Create the local file if it doesn't exist. rsync will do the actual fetch. return if $local_file->exists; File::Path::mkpath($local_file->dirname, ($opt_verbose? 1:0), 0777); verbose("Touching file " . $local_file->path ); open TOUCH, ">" . $local_file->path or fail("Could not create local file " . $local_file->path); print TOUCH $remote_file->size, "\n"; close TOUCH; ## Now set the timestamp to a long, long time ago. my $mtime = $remote_file->mtime - (365 * 24 * 60 * 60); # about one year ago utime $mtime, $mtime, $local_file->path; } sub delete_file { my ($remote_file, $local_file) = @_; } ## Applies the regexp rules and returns what to do. ## Returns undef for default action. ## apply_rules($remote,$local, $rules_list) sub apply_rules { my ($remote_file, $local_file, $rules, $default) = @_; RULE: foreach my $r (@$rules){ SWITCH: for ($r->($remote_file, $local_file)){ next unless defined($_) && /^\d+$/; if($_ == Skip_It) { return $_ } if($_ == Fetch_It){ return $_ } } } return $default; } ## Find the most recent packages in a list of files sub build_squishes { my ($ls) = @_; return unless $opt_squish; my %squishes; # package => [ Fileinfo ... ] FILE: for my $f (grep { $_->is_file } @$ls){ for my $rx (@rx_basenames){ next unless $f->basename =~ $rx; # package must be in $1 my $package = $1; push @{$squishes{$package}}, $f; next FILE; } } # Sort the squishes by mtime for my $pkg (keys %squishes){ $squishes{$pkg} = [ sort { $b->mtime <=> $a->mtime } @{$squishes{$pkg}} ]; } ## Mark the alphadog and unmark the rest for my $a ( values %squishes){ for my $f (@$a){ $f->alphadog(0); } $a->[0]->alphadog(1); } } ## Callback for --FOO=regexp options. sub opt_regexp { my ($expr, $list, $notlist) = @_; if ( $expr =~ /^\!/){ $list = $notlist; $expr =~ s/^\!//; } ## Syntax check the expression eval { 'foo' =~ m/$expr/ }; fail("Invalid regular expression: $expr\n$@\n") if $@; push @$list, qr/$expr/; } # callback for option --cpan sub opt_cpan { my $ext = qr/\btar\.(?:gz|bz2)/; my $rev = qr/\d+[\d\.]*/; push @rx_basenames, qr/^ ([\w-]+) -? $rev \. $ext $/x; $opt_squish = 1; $opt_recurse = 1; } # callback for option --gnu sub opt_gnu { my $ext = qr/\btar\.(?:gz|bz2)/; my $rev = qr/\d+[\d\.]*/; push @rx_basenames, qr/^ ([\w-]+) -? $rev \. $ext $/x; $opt_squish = 1; $opt_recurse = 1; } sub verbose { print_safe(\*STDOUT, @_) if $opt_verbose } sub debug { print_safe(\*STDOUT, @_) if $opt_debug } sub print_safe { my $fh = shift; return unless scalar( my @a = map { defined($_)?$_:'undef' } @_); print $fh (my $s = join ' ', @a); print $fh "\n" if $s !~ m/\n$/; } sub fail{ print_safe(\*STDERR,@_); exit(1) } 1 __END__ =head1 NAME mirror-tightwad - Create a subset of a mirror =head1 SYNOPSIS mirror-tightwad [OPTIONS] ftp://remote.host/path/ /local/path/ =head1 DESCRIPTION mirror-tightwad is a utility to mirror only a subset of a large site. The goal is to save hard disk space by only mirroring what you want. mirror-tightwad is designed to be a pre-processor for rsync. First you run mirror-tightwad, then you run rsync (or wget, etc.). The first argument is a URL for the remote server. Currently only ftp is supported. The second argument is the local mirror directory. By default, mirror-tightwad assumes you are a total tightwad, and will not mirror anything at all. You have to explicitly ask for recursion, and provide regular expressions to match against the files you want to grab. See L. =head1 PREREQUISITES This script requires the following modules: C, C, C, C, C, C, C, C, C, C, C, C, C =head1 SQUISHING Let's say the remote directory /pub/gnu/autoconf contains, autoconf-2.55.tar.bz2 autoconf-2.55.tar.gz autoconf-2.56.tar.bz2 autoconf-2.56.tar.gz autoconf-2.57.tar.bz2 <--- The most recent. autoconf-2.57.tar.gz but you just want to mirror whichever one is the most recent. You can do this with option B<--squish>, which tells mirror-tightwad that all the C files in a directory belong to the same logical package, and to grab only the most recent file in that package. The files are squished according to the pattern defined by B<--basename>, which is matched against the basename of the file. The first set of capturing parenthesis is the package name. The above filenames could be squished like this: mirror-tightwad --basename='^(\w+)-[\d.]+\.tar\.(?:gz|bz2)$' ... Since the first set of capturing parenthesis is C<(\w+)>, this pattern would squish the above filenames into the package ``autoconf''. Then, the file with most recent modification time will be chosen, and the rest of the files in the autoconf package will be ignored. =head1 PATTERNS All patterns are Perl 5 regular expressions, with one extra helping of sugar: if the first character is '!', then the matching will be negated. Perl 5 regular expression syntax is not documented here. See L, L, and L. All negated patterns are attempted first, then normal patterns. If any negated pattern succeeds, then no further matches are attempted. Remember to use C<^> and C<$> (or C<\A> and C<\z>) where needed. =head1 OPTIONS =over =item B<--allow-root> Allows mirror-tightwad to run as superuser. If this option is not given, then mirror-tightwad refuses to run as superuser... but see B<--setuid>. =item B<--basename=REGEXP> Considers only the remote file whose basename matches I. Multiple B<--basename> options may be specified. Matching stops on the first successful match. If REGEXP starts with '!', then the match is negated. All negated patterns are attempted before any normal matches. Don't forget to use C<^> and C<$> (or C<\A> and C<\z>) where needed. =item B<--debug> Prints a lot of information useful for debugging. =item B<--delete> Deletes local files that are not scheduled to be mirrored. This is useful for deleting older versions of tarballs as new versions arrive. =item B<--existing> Only consider files or directories that already exist locally. =item B<--maxdepth=NUM> Set the maximum recursion depth to NUM. The default is very large. =item B<--mirror=NAME> Shortcut for setting options for well-known mirrors. NAME can be one of I, or I. Feel free to override the settings by providing additional options. For example, C<--mirror=GNU> is similar to specifying, --recurse --squish --basename='^([\w-]+)-\d+[\d\.]*\.(tar\.(?:gz|bz2))$' =item B<--passive-ftp> Enables passive FTP mode. Useful if you are behind a firewall. =item B<--path=REGEXP> Considers only the remote files whose full path matches I. Multiple B<--basename> options may be specified. Matching stops on the first successful attempt. If REGEXP starts with '!', then the match is negated. All negated patterns are attempted before any normal matches. Don't forget to use C<^> and C<$> (or C<\A> and C<\z>) where needed. =item B<--recurse> Enables directory recursion. =item B<--setuid=USER> Sets the user id before proceeding. I can be a name or numeric user id. =item B<--setgid=GROUP> Sets the group id before proceeding. I can be a name or numeric group id. =item B<--squish> Enable squishing defined by the first capturing parenthesis in B<--basename>. See L. =item B<--verbose> Prints details about what mirror-tightwad is doing. =item B<--version> Prints version information and exit. =back =head1 EXAMPLES =over =item 1. Get the latest GNU tarballs (saves about ???MB) mirror-tightwad --gnu --delete ftp://ftp.gnu.org/pub/gnu/ /mirrors/gnu/ rsync -av --existing --delete ftp.gnu.org::pub/gnu/ /mirrors/gnu/ =item 2. Same as above, but don't get the manuals. mirror-tightwad --gnu --delete --path='!/Manuals/' ftp://ftp.gnu.org/pub/gnu/ /mirrors/gnu/ rsync -av --existing --delete ftp.gnu.org::pub/gnu/ /mirrors/gnu/ =item 3. This time, only scan the the first level of directories. mirror-tightwad --gnu --delete --maxdepth=1 --path='!/Manuals/' ftp://ftp.gnu.org/pub/gnu/ /mirrors/gnu/ rsync -av --existing --delete ftp.gnu.org::pub/gnu/ /mirrors/gnu/ =item 4. Only mirror what you already have mirror-tightwad --existing --basename='.+' ftp://host/stuff/ ~/stuff/ rsync -av --existing --delete ftp://host/stuff ~/stuff =back =head1 TROUBLESHOOTING Did you forget B<--recurse>? Did you specify a B<--basename> pattern? Behind a firewall? Then use B<--passive-ftp>. =head1 BUGS Option --existing doesn't squish. Option --quota is not yet implemented. =pod SCRIPT CATEGORIES Networking UNIX/System_administration =head1 AUTHOR John Millaway =cut vim:set ft=perl ai si et ts=4 sts=4 sw=4 tw=0: