#! /usr/bin/perl -w
# -*- cperl -*-

# e2recover.in -- try to automatically undelete files from an ext2 file system
#
# Aaron Crane <aaronc@pobox.com>
#
# This file is part of `e2recover', a suite of programs which assist in
# recovering deleted files from ext2 file systems.  `e2recover' is
# Copyright  1997, 1998, 1999 Aaron Crane <aaronc@pobox.com>.
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
# more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

use strict;
use Getopt::Long;
use POSIX;
use integer;

sub long_usage {
    print <<"EOF";
Usage: $0 [OPTION]... [LSDEL-FILES]...

Attempt to recover deleted files from an ext2 file system, using the output
(in LSDEL-FILES) from the "lsdel" command in debugfs.  Standard input is
read if no file names are given or on a file name of \`-\'.  Recovered files
are written to the appropriate temporary directory, with names like
\`e2rec.PID.DEV.INUM\'.  Uses environment variables \$FSGRAB and \$DEBUGFS
to find those programs, or looks in \$PATH if they are unset.

  -b, --block-size=BLOCKSIZE   the file system has blocks of BLOCKSIZE
                                  bytes (default: 1024)
  -d, --device=DEVICE          the file system is on DEVICE
                                  (default: /dev/hda1)
  -g, --guess-indirects        try to recover files with zeroed indirect
                                  blocks by assuming that there was no
                                  fragmentation in that file
  -t, --tmpdir=TMPDIR,         write recovered files to TMPDIR
      --tempdir=TMPDIR            (default: \${TMPDIR:-/tmp})
      --help                   display this help and exit
      --version                display version information and exit

BLOCKSIZE may have an optional multiplier suffix: w for 2, b for 512,
k for 1024, m for 1Meg.  BLOCKSIZE must be an exact multiple of 512 bytes.

See <URL:http://pobox.com/~aaronc/tech/e2-undel/> for more information
(including a description of how it works its magic, and the situations in
which it fails).

Report bugs to <aaronc\@pobox.com>.
EOF
    exit 0
}

sub show_version {
    print <<'EOF';
e2recover 1.0
Copyright (C) 1998, 1999 Aaron Crane
e2recover comes with NO WARRANTY, to the extent permitted by law.
You may modify and redistribute copies of e2recover under the terms
of the GNU General Public License.  For more information about these
matters, see the file named COPYING.
EOF
    exit 0;
}

sub short_usage {
    print STDERR "Try `$0 --help' for more information\n";
    exit 1;
}

my $fsgrab  = $ENV{FSGRAB}  || 'fsgrab';
my $debugfs = $ENV{DEBUGFS} || 'debugfs';

sub set_details {
    my ($euid, $atime, $mtime, $mode, $owner, $group, $file) = @_;
    utime $atime, $mtime, $file
      or print "  can't set access and modification time on $file: $!\n";
    chmod $mode, $file
      or print "  can't set mode of $file to ", sprintf("0%o", $mode), ": $!\n";
    if ($euid == 0) {
	chown $owner, $group, $file
	  or print "  can't set ownership of $file to $owner:$group: $!\n";
    }
}

sub push_compressed_block_list_entry {
    my ($listref, $start, $end) = @_;
    push @$listref, [$start, $end - $start + 1]
      if $start != -1;
}

sub compress_block_list {
    # We'll put the compressed list in here
    my @new_list;
    # The starting block number and last block number for contiguous sequences
    my ($start, $last) = (-1, -1);
    foreach (@_) {
	if ($_ == $last + 1) {
	    # It's part of the entry we're building, so just add it in
	    $last = $_;
	} else {
	    # Doesn't fit into an existing sequence, so push the sequence we
	    # have, and start over for next time
	    push_compressed_block_list_entry \@new_list, $start, $last;
	    $start = $_;
	}
	$last = $_;
    }
    # And write out the last entry we were building
    push_compressed_block_list_entry \@new_list, $start, $last;
    return @new_list;
}

sub remove_indirects {
    my ($fs_blocksize, $listref) = @_;
    my @list;
    my @block_number_in_a_block = (1 .. $fs_blocksize / 4);

    # Keep the direct data blocks
    push @list, splice(@$listref, 0, 12);
    return @list unless @$listref;

    # Drop the inode's indirect block
    shift @$listref;
    return @list unless @$listref;

    # Keep the data blocks from that indirect block
    push @list, splice(@$listref, 0, $fs_blocksize / 4);
    return @list unless @$listref;

    # Drop the inode's doubly-indirect block
    shift @$listref;
    return @list unless @$listref;

    # Process indirect blocks in the doubly-indirect
    foreach (@block_number_in_a_block) {
	# Drop the indirect block from the inode's doubly-indirect
	shift @$listref;
	return @list unless @$listref;

	# Keep the data blocks from that indirect block
	push @list, splice(@$listref, 0, $fs_blocksize / 4);
	return @list unless @$listref;
    }

    # Drop the inode's triply-indirect block
    shift @$listref;
    return @list unless @$listref;

    # Process doubly-indirect blocks in the triply-indirect
    foreach (@block_number_in_a_block) {
	# Drop the doubly-indirect block
	shift @$listref;
	return @list unless @$listref;

	# Process indirect blocks in the doubly-indirect
	foreach (@block_number_in_a_block) {
	    # Drop the indirect block
	    shift @$listref;
	    return @list unless @$listref;

	    # Keep the data blocks
	    push @list, splice(@$listref, 0, $fs_blocksize / 4);
	    return @list unless @$listref;
	}
    }

    # Fancy that!  It was the largest file that could possibly be stored in
    # an ext2 fs with that block size.  Well, either that or I screwed up
    # somewhere...
    return @list;
}

sub run_it {
    #print 'running: [', join('] [', @_), "]\n";
    system @_;
}

sub minus_m_arg {
    my ($size, $fs_blocksize, $blocks) = @_;
    my $len = $fs_blocksize * $blocks;
    $size %= $fs_blocksize;
    if ($size) {
	$len -= $fs_blocksize - $size;
	return ('-m', $len);
    }
    return ();
}

# This function should be called with all the blocks that make up a given
# file.
sub run_fsgrab {
    my ($recovery_name, $size, $device, $fs_blocksize, @blocks) = @_;
    my @cmdlist = ($fsgrab, '-b', $fs_blocksize, '-o', $recovery_name);

    # Compress the block list for efficiency in terms of number of processes forked
    @blocks = compress_block_list (@blocks);

    # Do the first one; no append
    my $cblref = shift(@blocks);
    splice @$cblref, 1, 0, '-c';
    push @$cblref, minus_m_arg($size, $fs_blocksize, $$cblref[2])
      if @blocks == 0;
    run_it @cmdlist, '-s', @$cblref, $device;

    # We'll need append on the rest
    push @cmdlist, '-a';

    # Do the remainder.  Don't forget -m MAXIMUM on the last one.
    my ($i, $max) = (0, $#blocks);
    foreach $cblref (@blocks) {
	splice @$cblref, 1, 0, '-c';
	push @$cblref, minus_m_arg($size, $fs_blocksize, $$cblref[2])
	  if $i == $max;
	run_it @cmdlist, '-s', @$cblref, $device;
	$i++;
    }
}

sub push_indirect {
    my ($listref, $block_count, $fs_blocksize, $indirect) = @_;
    push @$listref, $indirect;
    push @$listref, ($indirect + 1 .. $indirect + $fs_blocksize / 4);
}

sub push_indirect2 {
    my ($listref, $block_count, $fs_blocksize, $indirect2) = @_;
    push @$listref, $indirect2;
    my $indirect = $indirect2;
    foreach (1 .. $fs_blocksize / 4) {
	push_indirect $listref, $block_count, $fs_blocksize, ++$indirect;
	return if @$listref >= $block_count;
	$indirect += $fs_blocksize / 4;
    }
}

sub push_indirect3 {
    my ($listref, $block_count, $fs_blocksize, $indirect3) = @_;
    push @$listref, $indirect3;
    my $indirect2 = $indirect3;
    foreach (1 .. $fs_blocksize / 4) {
	push_indirect2 $listref, $block_count, $fs_blocksize, ++$indirect2;
	return if @$listref >= $block_count;
	$indirect2 += $fs_blocksize / 4;
    }
}

# This Getopt::Long module is pretty convenient.  The only problem is that
# it considers a lone `-' to be an option (whose name is the empty string)
# rather than an argument (namely, stdin).  My fix for this is kind of a
# kludge.  Before doing the getopt, I replace any argument consisting of a
# single dash with a string containing only a null character.  This can't
# have been passed by the user on the command line, because command lines
# (and environment variables) can't contain null characters.  Finally I
# translate such arguments back into a single minus so they will be open()ed
# correctly.

foreach (@ARGV) { s/^-$/\000/ }

Getopt::Long::config(qw(auto_abbrev permute bundling no_getopt_compat));

my $device = "/dev/hda1";
my $tmpdir = ($ENV{TMPDIR} or "/tmp");
my $guess_indirects = 0;
my $fs_blocksize = 1024;
my $requested_help = 0;
my $requested_version = 0;

GetOptions("d|device=s"                   => \$device,
	   "t|tmpdir|tempdir=s"           => \$tmpdir,
	   "g|guess-indirects"            => \$guess_indirects,
	   "b|block-size|blocksize=s"     => \$fs_blocksize,
	   "h|help"                       => \$requested_help,
	   "V|version"                    => \$requested_version)
  or short_usage();

show_version() if $requested_version;
long_usage()   if $requested_help;

$device =~ s:^:/dev/: if (! -e $device) and $device !~ m:^/dev:;
$tmpdir ||= '.';
$tmpdir = '//' if $tmpdir eq '/'; # POSIX allows leading // to enter a different name space

if ($fs_blocksize !~ m/^(\d+)([mkbw]?)$/i) {
    warn "$0: block size must be numeric (with an optional multiplier)\n";
    short_usage();
}
else {
    $fs_blocksize = $1;
    if ($2 ne "") {
	my %hash = ('m' => (1024 * 1024),
		    'k' =>  1024,
		    'b' =>   512,
		    'w' =>     2);
	$fs_blocksize *= $hash{lc($2)};
    }
}

if ($fs_blocksize % 512 != 0) {
    warn "$0: the blocksize must be an exact multiple of 512\n";
    short_usage();
}

if (@ARGV) {
    foreach (@ARGV) { s/^\000$/-/ }
}
else {
    @ARGV = ('-');
}

my $euid = POSIX::geteuid();
print "not running as root -- unable to set ownership of recovered files\n"
  if $euid != 0;

Lsdel:
while (<>) {
    chomp;
    if    (m/^debugfs:  /) { next Lsdel }
    elsif (m/^ *Inode/)    { next Lsdel }
    else {
	s.^\s+..;		# remove leading white space
	s./\s*. .;		# turn the `N/N blocks' stuff into separate fields

	if (scalar(split()) != 11) {
	    print "ignoring unrecognised line: $_\n";
	    next Lsdel;
	}

	my ($inode, $owner, $mode, $size, $free_blocks, $total_blocks) = split(/\s+/, $_, 11);
	$mode = oct("0$mode");

	if (($mode & 0770000) != 0100000) {
	    printf "skipping non-regular file (inode=$inode, mode=0%o)\n", $mode;
	    next Lsdel;
	}

	if ($free_blocks < $total_blocks) {
	    print "can't recover inode $inode because some of its blocks have been re-used\n";
	    next Lsdel;
	}

	open STAT, "echo stat '<$inode>' | $debugfs $device |";
	my ($stat, $group, $block_count, $atime, $mtime, @blocks);
      Stat:
	while (defined ($stat = <STAT>)) {
	    chomp($stat);
	    foreach ($stat) {
		if    (m/^\s*$/)       { next Stat }
		elsif (m/^debugfs:/)   { next Stat }
		elsif (m/^file acl:/i) { next Stat }
		elsif (m/^fragment:/i) { next Stat }
		elsif (m/^[cd]time:/i) { next Stat }
		elsif (m/^links:\s*\d+\s*blockcount:\s*(\d+)/i) {
		    $block_count = $1 / ($fs_blocksize / 512);
		}
		elsif (m/^atime:\s*(0x[a-f0-9]+)/i) {
		    $atime = oct($1);
		}
		elsif (m/^mtime:\s*(0x[a-f0-9]+)/i) {
		    $mtime = oct($1);
		}
		elsif (m/^user:\s*\d+\s*group:\s*(\d+)\s*size:\s*(\d+)/i) {
		    $group = $1;
		    $size = $2;
		}
		elsif (m/^blocks:/i) {
		    while (defined ($stat = <STAT>)) {
			last Stat if $stat =~ m/^total:/i;
			push @blocks, split(/\s+/, $stat);
		    }
		}
		else {
		    print "ignoring inode with unrecognised debugfs stat output: $stat\n";
		    next Lsdel;
		}
	    }
	}
	close STAT;

	my $recovery_name = ("$tmpdir/e2rec.$$."
			     . (($device =~ m:^.*/(.*)$:) ? $1 : $device)
			     . ".$inode");

	if ($block_count < @blocks) {
	    print +("skipping inode $inode -- found ", scalar @blocks, " blocks, but ",
		    "told $block_count!\n");
	    next Lsdel;
	}
	elsif ($block_count == @blocks) {
	    # We found all the blocks that belonged to the file.  Of those,
	    # we can simply take all the data blocks and have run_fsgrab()
	    # write them out to our recovery file.
	    @blocks = remove_indirects ($fs_blocksize, \@blocks);
	    run_fsgrab ($recovery_name, $size, $device, $fs_blocksize, @blocks);
	    set_details $euid, $atime, $mtime, $mode, $owner, $group, $recovery_name;
	    print "probably recovered inode $inode to $recovery_name\n"
	}
	else {			# $block_count > @blocks
	    # damn, we have to hope for no fragmentation and guess at the indirect blocks
	    unless ($guess_indirects) {
		print +("need --guess-indirects mode to recover inode $inode (blocks: ",
			"counted ", scalar @blocks, ", told $block_count)\n");
		next Lsdel;
	    }

	    if (@blocks > 15) {
		print +("I can't make guesses about inode $inode (blocks: counted ",
			scalar @blocks, "(!), told $block_count)\n");
		next Lsdel;
	    }

	    my @guessed_blocks = ();

	    push @guessed_blocks, splice(@blocks, 0, 12);
	    my ($indirect, $indirect2, $indirect3) = splice(@blocks, 0, 3);

	    if (defined $indirect) {
		push_indirect \@guessed_blocks, $block_count, $fs_blocksize, $indirect;
	    }

	    if (defined $indirect2) {
		push_indirect2 \@guessed_blocks, $block_count, $fs_blocksize, $indirect2;
	    }

	    if (defined $indirect3) {
		push_indirect3 \@guessed_blocks, $block_count, $fs_blocksize, $indirect3;
	    }

	    # Truncate the list of guessed blocks to as many as we think we should have
	    splice @guessed_blocks, $block_count;

	    @guessed_blocks = remove_indirects ($fs_blocksize, \@guessed_blocks);
	    run_fsgrab ($recovery_name, $size, $device, $fs_blocksize, @guessed_blocks);
	    set_details $euid, $atime, $mtime, $mode, $owner, $group, $recovery_name;

	    if (@blocks == 0) {
		print "possibly recovered inode $inode to $recovery_name (using guesswork)\n";
		my $rec_size = -s $recovery_name;
		print "  oops: expected size $size, actually $rec_size for inode $inode\n"
		  if $size != $rec_size;
	    }
	    else {
		print +("I can't handle inodes with some indirect blocks ",
			"zeroed and some left untouched; $recovery_name is ",
			"probably garbled\n");
	    }
	}
    }
}
