=head1 NAME

EPrints::Plugin::Search::Xapian

=head1 PARAMETERS

=over 4

=item lang

Override the default language used for stemming.

=item stopwords

An array reference of stop words to use (defaults to English).

=back

=head1 METHODS

=over 4

=cut

package EPrints::Plugin::Search::Xapian;

use EPrints::Plugin::Search;

@ISA = qw( EPrints::Plugin::Search );

use strict;

# Freely available stopword list.  This stopword
# list provides a nice balance between coverage
# and size.
our @STOPWORDS = qw(
	a
	about
	above
	across
	after
	again
	against
	all
	almost
	alone
	along
	already
	also
	although
	always
	among
	an
	and
	another
	any
	anybody
	anyone
	anything
	anywhere
	are
	area
	areas
	around
	as
	ask
	asked
	asking
	asks
	at
	away
	b
	back
	backed
	backing
	backs
	be
	became
	because
	become
	becomes
	been
	before
	began
	behind
	being
	beings
	best
	better
	between
	big
	both
	but
	by
	c
	came
	can
	cannot
	case
	cases
	certain
	certainly
	clear
	clearly
	come
	could
	d
	did
	differ
	different
	differently
	do
	does
	done
	down
	down
	downed
	downing
	downs
	during
	e
	each
	early
	either
	end
	ended
	ending
	ends
	enough
	even
	evenly
	ever
	every
	everybody
	everyone
	everything
	everywhere
	f
	face
	faces
	fact
	facts
	far
	felt
	few
	find
	finds
	first
	for
	four
	from
	full
	fully
	further
	furthered
	furthering
	furthers
	g
	gave
	general
	generally
	get
	gets
	give
	given
	gives
	go
	going
	good
	goods
	got
	great
	greater
	greatest
	group
	grouped
	grouping
	groups
	h
	had
	has
	have
	having
	he
	her
	here
	herself
	high
	high
	high
	higher
	highest
	him
	himself
	his
	how
	however
	i
	if
	important
	in
	interest
	interested
	interesting
	interests
	into
	is
	it
	its
	itself
	j
	just
	k
	keep
	keeps
	kind
	knew
	know
	known
	knows
	l
	large
	largely
	last
	later
	latest
	least
	less
	let
	lets
	like
	likely
	long
	longer
	longest
	m
	made
	make
	making
	man
	many
	may
	me
	member
	members
	men
	might
	more
	most
	mostly
	mr
	mrs
	much
	must
	my
	myself
	n
	necessary
	need
	needed
	needing
	needs
	never
	new
	new
	newer
	newest
	next
	no
	nobody
	non
	noone
	not
	nothing
	now
	nowhere
	number
	numbers
	o
	of
	off
	often
	old
	older
	oldest
	on
	once
	one
	only
	open
	opened
	opening
	opens
	or
	order
	ordered
	ordering
	orders
	other
	others
	our
	out
	over
	p
	part
	parted
	parting
	parts
	per
	perhaps
	place
	places
	point
	pointed
	pointing
	points
	possible
	present
	presented
	presenting
	presents
	problem
	problems
	put
	puts
	q
	quite
	r
	rather
	really
	right
	right
	room
	rooms
	s
	said
	same
	saw
	say
	says
	second
	seconds
	see
	seem
	seemed
	seeming
	seems
	sees
	several
	shall
	she
	should
	show
	showed
	showing
	shows
	side
	sides
	since
	small
	smaller
	smallest
	so
	some
	somebody
	someone
	something
	somewhere
	state
	states
	still
	still
	such
	sure
	t
	take
	taken
	than
	that
	the
	their
	them
	then
	there
	therefore
	these
	they
	thing
	things
	think
	thinks
	this
	those
	though
	thought
	thoughts
	three
	through
	thus
	to
	today
	together
	too
	took
	toward
	turn
	turned
	turning
	turns
	two
	u
	under
	until
	up
	upon
	us
	use
	used
	uses
	v
	very
	w
	want
	wanted
	wanting
	wants
	was
	way
	ways
	we
	well
	wells
	went
	were
	what
	when
	where
	whether
	which
	while
	who
	whole
	whose
	why
	will
	with
	within
	without
	work
	worked
	working
	works
	would
	x
	y
	year
	years
	yet
	you
	young
	younger
	youngest
	your
	yours
	z
);

sub new
{
	my( $class, %params ) = @_;

	my $self = $class->SUPER::new( %params );

	$self->{name} = "xapian";
	$self->{search} = [qw( simple/* )];
	$self->{result_order} = 1; # whether to default to showing by engine result order
	if( defined $self->{session} )
	{
		$self->{lang} = $self->{session}->config( "defaultlanguage" );
	}
	$self->{stopwords} = \@STOPWORDS;

	$self->{disable} = !EPrints::Utils::require_if_exists( "Search::Xapian" );
	
	return $self;
}

=item $stemmer = $plugin->stemmer()

Returns a L<Search::Xapian::Stem> for the default language.

=cut

sub stemmer
{
	my( $self ) = @_;

	my $langid = $self->param( "lang" );

        my $stemmer;
        eval { $stemmer = Search::Xapian::Stem->new( $langid ); };
        if( $@ || UNIVERSAL::isa( $stemmer, "Search::Xapian::Error" ) )
        {
		$self->{session}->log( "'$langid' is not a supported Xapian stem language, using English instead" );
		$stemmer = Search::Xapian::Stem->new( 'en' );
	}

	return $stemmer;
}

=item $stopper = $plugin->stopper()

Returns a L<Search::Xapian::SimpleStopper> for C<stopwords>.

=cut

sub stopper
{
	my( $self ) = @_;

	return Search::Xapian::SimpleStopper->new(
			@{$self->param( "stopwords" )}
		);
}

sub from_cache
{
	my( $self ) = @_;

	return 0;
}

sub from_form
{
	my( $self ) = @_;

	$self->{q} = $self->{session}->param( $self->{basename}."q" );

	return ();
}

sub from_string_fields
{
	my( $self, $fields, %opts ) = @_;

	$self->{q} = @$fields ? (split /:/, $fields->[0], 5)[4] : "";
}

sub serialise_fields
{
	my( $self ) = @_;

	return( join ':',
		'q',
		'',
		'ALL',
		'IN',
		$self->{q} );
}

sub is_blank
{
	my( $self ) = @_;

	return !EPrints::Utils::is_set( $self->{q} );
}

sub clear
{
	my( $self ) = @_;

	undef $self->{q};
}

sub execute
{
	my( $self ) = @_;

	my $session = $self->{session};

	my $path = $session->config( "variables_path" ) . "/xapian";
	my $xapian = Search::Xapian::Database->new( $path );

	my $qp = Search::Xapian::QueryParser->new( $xapian );
	$qp->set_stemmer( $self->stemmer );
	$qp->set_stopper( $self->stopper );
	$qp->set_stemming_strategy( Search::Xapian::STEM_SOME() );
	$qp->set_default_op( Search::Xapian::OP_AND() );

	for(@{$self->{search_fields}->[0]->{meta_fields}})
	{
		$qp->add_prefix( $_, "$_:" );
	}

	my $query = Search::Xapian::Query->new( "_dataset:".$self->{dataset}->base_id );
	for(
		@{$self->{dataset}->{filters}||[]},
		@{$self->{filters}||[]},
	   )
	{
		my $fieldname = $_->{meta_fields}->[0];
		$query = Search::Xapian::Query->new(
			Search::Xapian::OP_AND(),
			$query,
			Search::Xapian::Query->new( $fieldname . ':' . $_->{value} )
		);
	}
	if( EPrints::Utils::is_set( $self->{q} ) )
	{
		$query = Search::Xapian::Query->new(
			Search::Xapian::OP_AND(),
			$query,
			$qp->parse_query( $self->{q},
				Search::Xapian::FLAG_PHRASE() |
				Search::Xapian::FLAG_BOOLEAN() | 
				Search::Xapian::FLAG_LOVEHATE() | 
				Search::Xapian::FLAG_WILDCARD()
			)
		);
	}
	my $enq = $xapian->enquire( $query );

	if( $self->{custom_order} )
	{
		my $sorter = Search::Xapian::MultiValueSorter->new;
		for(split /\//, $self->{custom_order})
		{
			next if $_ eq "byrelevance";
			my $reverse = $_ =~ s/^-// ? 1 : 0;
			my $key = join '.',
				$self->{dataset}->base_id,
				$_,
				$session->{lang}->get_id;
			my $idx = $xapian->get_metadata( $key );
			if( !length($idx) )
			{
				$session->log( "Search::Xapian can't sort by $key: unknown sort key" );
				next;
			}
			$sorter->add( $idx, $reverse );
		}
		$enq->set_sort_by_key_then_relevance( $sorter );
	}

	return EPrints::Plugin::Search::Xapian::ResultSet->new(
		session => $session,
		dataset => $self->{dataset},
		enq => $enq,
		count => $enq->get_mset( 0, $xapian->get_doccount )->get_matches_estimated,
		ids => [],
		limit => $self->{limit} );
}

sub render_description
{
	my( $self ) = @_;

	return $self->{session}->make_text( $self->{q} );
}

sub render_conditions_description
{
	my( $self ) = @_;

	return $self->html_phrase( 'results:title', 
		q => $self->{repository}->make_text( $self->{q} ) 
	);
}

package EPrints::Plugin::Search::Xapian::ResultSet;

our @ISA = qw( EPrints::List );

sub _get_records
{
	my( $self, $offset, $size, $ids_only ) = @_;

	$offset = 0 if !defined $offset;

	if( defined $self->{limit} )
	{
		if( $offset > $self->{limit} )
		{
			$size = 0;
		}
		elsif( !defined $size || $offset+$size > $self->{limit} )
		{
			$size = $self->{limit} - $offset;
		}
	}

	my @ids;
	if( defined $size )
	{
		@ids = grep { length($_) } map { $_->get_document->get_data } $self->{enq}->matches( $offset, $size );
	}
	else
	{
		# retrieve matches 1000 ids at a time
		my $num_results = -1;
		while( $num_results != scalar @ids )
		{
			$num_results = scalar @ids;
			push @ids, grep { length($_) } map { $_->get_document->get_data } $self->{enq}->matches( $offset, 1000 );
			$offset += 1000;

		}
	}

	return $ids_only ? \@ids : $self->{session}->get_database->get_dataobjs( $self->{dataset}, @ids );
}

sub count
{
	my( $self ) = @_;

	return (defined $self->{limit} && $self->{limit} < $self->{count}) ?
		$self->{limit} :
		$self->{count};
}

sub reorder
{
	my( $self, $new_order ) = @_;

	$self->{ids} = $self->ids;

	return $self->SUPER::reorder( $new_order );
}

1;

=back

=head1 COPYRIGHT

=for COPYRIGHT BEGIN

Copyright 2022 University of Southampton.
EPrints 3.4 is supplied by EPrints Services.

http://www.eprints.org/eprints-3.4/

=for COPYRIGHT END

=for LICENSE BEGIN

This file is part of EPrints 3.4 L<http://www.eprints.org/>.

EPrints 3.4 and this file are released under the terms of the
GNU Lesser General Public License version 3 as published by
the Free Software Foundation unless otherwise stated.

EPrints 3.4 is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with EPrints 3.4.
If not, see L<http://www.gnu.org/licenses/>.

=for LICENSE END

