[Patches] [PATCH] Bug 5166 - Reintroduce a zebraqueue daemon, make several rebuild_zebra.pl functions a lib

koha-patchbot at kohaaloha.com koha-patchbot at kohaaloha.com
Thu Dec 22 03:05:03 NZDT 2011


From: Tomas Cohen Arazi <tomascohen at gmail.com>
Date: Fri, 25 Nov 2011 19:14:12 -0300
Subject: [PATCH] Bug 5166 - Reintroduce a zebraqueue daemon, make several rebuild_zebra.pl functions a lib

Refactorized. Removed per-language databases scripts. Created atomicupdate file to
be introduced later in updatedatabase.pl by the QA/RM.

Is was refactored against origin/master.

Anything I could do to help on this bug just let me know so there are no further delays.
Regards
To+
---
 C4/Catalog/Zebra.pm                                |  474 ++++++++++++++++++
 .../bug_5166-zebraqueuedaemon_is_back.pl           |   14 +
 installer/data/mysql/sysprefs.sql                  |    5 +
 .../en/modules/admin/preferences/searching.pref    |   33 ++
 misc/bin/koha-zebraqueue-ctl.sh                    |    6 +-
 misc/bin/zebraqueue_daemon.pl                      |  518 +++-----------------
 6 files changed, 604 insertions(+), 446 deletions(-)
 create mode 100644 C4/Catalog/Zebra.pm
 create mode 100644 installer/data/mysql/atomicupdate/bug_5166-zebraqueuedaemon_is_back.pl

diff --git a/C4/Catalog/Zebra.pm b/C4/Catalog/Zebra.pm
new file mode 100644
index 0000000..458694c
--- /dev/null
+++ b/C4/Catalog/Zebra.pm
@@ -0,0 +1,474 @@
+package C4::Catalog::Zebra;
+#
+# This file is part of Koha.
+#
+# Koha is free software; you can redistribute it and/or modify it under the
+# terms of the GNU General Public License as published by the Free Software
+# Foundation; either version 2 of the License, or (at your option) any later
+# version.
+#
+# Koha is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
+# A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
+# Suite 330, Boston, MA  02111-1307 USA
+
+# Derived from rebuild_zebra.pl (2005-08-11) Paul Poulain and others
+# Rewriten 02/03/2011 by Tomas Cohen Arazi (tomascohen at gmail.com)
+#                      Universidad Nacional de Cordoba / Argentina
+
+# Library for managing updates in zebra, usually from zebraqueue
+
+use strict;
+use warnings;
+use C4::Context;
+use Getopt::Long;
+use File::Temp qw/ tempdir /;
+use File::Path;
+use Time::HiRes qw(time);
+use C4::Biblio;
+use C4::AuthoritiesMarc;
+
+use vars qw($VERSION @ISA @EXPORT);
+
+BEGIN {
+	# set the version for version checking
+	$VERSION = 0.01;
+
+	require Exporter;
+	@ISA = qw(Exporter);
+	@EXPORT = qw(
+		&UpdateAuths
+		&UpdateBiblios
+		&UpdateAuthsAndBiblios
+		&IndexZebraqueueRecords
+	);
+}
+
+
+=head1 NAME
+
+C4::Catalog::Zebra
+
+Comment:
+	This should be used when merging the rest of the rebuild_zebra.pl indexing logic
+	my $nosanitize				= (C4::Context->preference('ZebraNoSanitize')) ? 1 : 0;
+
+
+=head2 UpdateAuths
+
+  ( $num_records_updated ) = &UpdateAuths ();
+
+returns the number of updated+deleted authority records 
+
+=cut
+
+sub UpdateAuths
+{
+	# Update authorities
+	return IndexZebraqueueRecords('authority');
+}
+
+=head2 UpdateBiblios
+
+  ( $num_records_updated ) = &UpdateBiblios ();
+
+returns the number of updated+deleted biblio records 
+
+=cut
+
+sub UpdateBiblios
+{
+	# Update authorities
+	return IndexZebraqueueRecords('biblio');
+}
+
+=head2 UpdateAuthsAndBiblios
+
+  ( $num_records_updated ) = &UpdateAuthsAndBiblios ();
+
+returns the number of updated+deleted authority and biblio records 
+
+=cut
+
+sub UpdateAuthsAndBiblios
+{
+	my $ret;
+	# Update authorities
+	$ret = UpdateAuths();
+
+	# Update biblios
+	$ret += UpdateBiblios();
+
+	return $ret;
+}
+
+=head2 IndexZebraqueueRecords
+
+  ( $num_records_updated ) = &IndexZebraqueueRecords ($record_type);
+
+returns the number of updated+deleted $record_type records 
+
+Comment :
+$record_type can be either 'biblio' or 'authority'
+
+=cut
+
+sub IndexZebraqueueRecords
+{
+	my ($record_type) = @_;
+	my $as_xml			= (C4::Context->preference('ZebraUseXml')) ? 1 : 0;
+	my $noxml			= ($as_xml) ? 0 : 1;
+	my $record_format	= ($as_xml) ? 'marcxml' : 'iso2709' ;
+
+	my ($num_records_updated,$num_records_deleted);
+
+	$num_records_deleted = (IndexZebraqueueByAction('deleted',$record_type,$record_format,$as_xml,$noxml)||0);
+	$num_records_updated = (IndexZebraqueueByAction('updated',$record_type,$record_format,$as_xml,$noxml)||0);
+
+	return $num_records_deleted + $num_records_updated;
+}
+
+=head2 IndexZebraqueueByAction
+
+  ( $num_records_updated ) = &IndexZebraqueueByAction ($action,$record_type,
+														$record_format,$as_xml,$noxml);
+
+returns the number of updated+deleted $record_type records 
+
+Comment :
+$record_type can be 'biblio' or 'authority'
+$record_format can be 'marcxml' or 'iso2709'
+$action can be 'updated' or 'deleted'
+$as_xml and $noxml are maintained for legacy reasons, one is enough. They
+indicate whether to use marcxml for indexing in zebra or iso2709. They should
+all be deduced from C4::Context->preference('ZebraUseXml').
+
+=cut
+
+sub IndexZebraqueueByAction
+{
+	my ($action,$record_type,$record_format,$as_xml,$noxml) = @_;
+	my ($num_records_exported,$ret,$zaction);
+
+	if ($action eq 'updated' or $action eq 'deleted') {
+		# get records by action
+		my $entries = select_zebraqueue_records($record_type, $action);
+		# Create tmp dir
+		my $directory = File::Temp->newdir();
+
+		# get records from zebraqueue, export to file for zebraidx
+		if ($action eq 'updated') {
+			$zaction = 'update';
+			$num_records_exported = export_marc_records_from_list($record_type, 
+												$entries, "$directory", $as_xml, $noxml);
+		} else {	
+			# $action eq 'deleted'
+			$zaction = 'delete';
+			$num_records_exported = generate_deleted_marc_records($record_type,
+												$entries, "$directory", $as_xml);
+		}
+
+		if ($num_records_exported) {
+			# log export
+			my $time = localtime(time);
+			print "$time $num_records_exported $record_type record(s) exported for $zaction\n";
+			# TODO error handling / and better logging
+			$ret = DoIndexing($record_type,$zaction,"$directory",$record_format);
+			if ($ret) {
+				print "$time $num_records_exported $record_type record(s) $action\n";
+				mark_zebraqueue_batch_done($entries);
+				print "$time $num_records_exported $record_type record(s) marked done in zebraqueue\n";
+			}
+			# /TODO
+		}
+	} else {
+		# Wrong action
+		$ret = -1;
+	}
+
+	return $ret;
+}
+
+
+sub select_zebraqueue_records {
+	my ($record_type, $update_type) = @_;
+
+	my $dbh = C4::Context->dbh;
+	my $server = ($record_type eq 'biblio') ? 'biblioserver' : 'authorityserver';
+	my $op = ($update_type eq 'deleted') ? 'recordDelete' : 'specialUpdate';
+
+	my $sth = $dbh->prepare(<<'SQL');
+		SELECT id, biblio_auth_number 
+		FROM zebraqueue
+		WHERE server = ?
+		AND   operation = ?
+		AND   done = 0
+		ORDER BY id DESC;
+SQL
+
+	$sth->execute($server, $op);
+	my $entries = $sth->fetchall_arrayref({});
+}
+
+sub mark_zebraqueue_batch_done {
+	my ($entries) = @_;
+
+	my $dbh = C4::Context->dbh;
+
+	$dbh->{AutoCommit} = 0;
+	my $sth = $dbh->prepare("UPDATE zebraqueue SET done = 1 WHERE id = ?");
+	$dbh->commit();
+	foreach my $id (map { $_->{id} } @$entries) {
+		$sth->execute($id);
+	}
+	$dbh->{AutoCommit} = 1;
+}
+
+sub export_marc_records_from_list {
+	my ($record_type, $entries, $directory, $as_xml, $noxml) = @_;
+	my $verbose_logging = (C4::Context->preference('ZebraqueueVerboseLogging')) ? 1 : 0;
+
+	my $num_exported = 0;
+	open (OUT, ">:utf8 ", "$directory/exported_records") or die $!;
+	my $i = 0;
+	my %found = ();
+	foreach my $record_number ( map { $_->{biblio_auth_number} }
+								grep { !$found{ $_->{biblio_auth_number} }++ }
+								@$entries ) {
+		print "." if ( $verbose_logging );
+		print "\r$i" unless ($i++ %100 or !$verbose_logging);
+		my ($marc) = get_corrected_marc_record($record_type, $record_number, $noxml);
+		if (defined $marc) {
+			# FIXME - when more than one record is exported and $as_xml is true,
+			# the output file is not valid XML - it's just multiple <record> elements
+			# strung together with no single root element.  zebraidx doesn't seem
+			# to care, though, at least if you're using the GRS-1 filter.  It does
+			# care if you're using the DOM filter, which requires valid XML file(s).
+			print OUT ($as_xml) ? $marc->as_xml_record() : $marc->as_usmarc();
+			$num_exported++;
+		}
+	}
+	print "\nRecords exported: $num_exported\n" if ( $verbose_logging );
+	close OUT;
+	return $num_exported;
+}
+
+sub generate_deleted_marc_records {
+	my ($record_type, $entries, $directory, $as_xml) = @_;
+	my $verbose_logging = (C4::Context->preference('ZebraqueueVerboseLogging')) ? 1 : 0;
+
+	my $num_exported = 0;
+	open (OUT, ">:utf8 ", "$directory/exported_records") or die $!;
+	my $i = 0;
+	foreach my $record_number (map { $_->{biblio_auth_number} } @$entries ) {
+		print "\r$i" unless ($i++ %100 or !$verbose_logging);
+		print "." if ( $verbose_logging );
+
+		my $marc = MARC::Record->new();
+		if ($record_type eq 'biblio') {
+			fix_biblio_ids($marc, $record_number, $record_number);
+		} else {
+			fix_authority_id($marc, $record_number);
+		}
+		if (C4::Context->preference("marcflavour") eq "UNIMARC") {
+			fix_unimarc_100($marc);
+		}
+
+		print OUT ($as_xml) ? $marc->as_xml_record() : $marc->as_usmarc();
+		$num_exported++;
+	}
+	print "\nRecords exported: $num_exported\n" if ( $verbose_logging );
+	close OUT;
+	return $num_exported;
+}
+
+sub get_corrected_marc_record {
+	my ($record_type, $record_number, $noxml) = @_;
+
+	my $marc = get_raw_marc_record($record_type, $record_number, $noxml); 
+
+	if (defined $marc) {
+		fix_leader($marc);
+		if ($record_type eq 'biblio') {
+			my $succeeded = fix_biblio_ids($marc, $record_number);
+			return unless $succeeded;
+		} else {
+			fix_authority_id($marc, $record_number);
+		}
+		if (C4::Context->preference("marcflavour") eq "UNIMARC") {
+			fix_unimarc_100($marc);
+		}
+	}
+
+	return $marc;
+}
+
+sub get_raw_marc_record {
+	my ($record_type, $record_number, $noxml) = @_;
+	my $dbh = C4::Context->dbh;
+
+	my $marc; 
+	if ($record_type eq 'biblio') {
+		if ($noxml) {
+			my $fetch_sth = $dbh->prepare_cached("SELECT marc FROM biblioitems WHERE biblionumber = ?");
+			$fetch_sth->execute($record_number);
+			if (my ($blob) = $fetch_sth->fetchrow_array) {
+				$marc = MARC::Record->new_from_usmarc($blob);
+				$fetch_sth->finish();
+			} else {
+				return; # failure to find a bib is not a problem -
+						# a delete could have been done before
+						# trying to process a record update
+			}
+		} else {
+			eval { $marc = GetMarcBiblio($record_number,1); };
+			if ($@) {
+				# here we do warn since catching an exception
+				# means that the bib was found but failed
+				# to be parsed
+				warn "error retrieving biblio $record_number";
+				return;
+			}
+		}
+	} else {
+		eval { $marc = GetAuthority($record_number); };
+		if ($@) {
+			warn "error retrieving authority $record_number";
+			return;
+		}
+	}
+	return $marc;
+}
+
+sub fix_leader {
+    # FIXME - this routine is suspect
+    # It blanks the Leader/00-05 and Leader/12-16 to
+    # force them to be recalculated correct when
+    # the $marc->as_usmarc() or $marc->as_xml() is called.
+    # But why is this necessary?  It would be a serious bug
+    # in MARC::Record (definitely) and MARC::File::XML (arguably) 
+    # if they are emitting incorrect leader values.
+    my $marc = shift;
+
+    my $leader = $marc->leader;
+    substr($leader,  0, 5) = '     ';
+    substr($leader, 10, 7) = '22     ';
+    $marc->leader(substr($leader, 0, 24));
+}
+
+sub fix_biblio_ids {
+	# FIXME - it is essential to ensure that the biblionumber is present,
+	#         otherwise, Zebra will choke on the record.  However, this
+	#         logic belongs in the relevant C4::Biblio APIs.
+	my $marc = shift;
+	my $biblionumber = shift;
+	my $biblioitemnumber;
+	my $dbh = C4::Context->dbh;
+
+	if (@_) {
+		$biblioitemnumber = shift;
+	} else {    
+		my $sth = $dbh->prepare(
+			"SELECT biblioitemnumber FROM biblioitems WHERE biblionumber=?");
+		$sth->execute($biblionumber);
+		($biblioitemnumber) = $sth->fetchrow_array;
+		$sth->finish;
+		unless ($biblioitemnumber) {
+			warn "failed to get biblioitemnumber for biblio $biblionumber";
+			return 0;
+		}
+	}
+
+	# FIXME - this is cheating on two levels
+	# 1. C4::Biblio::_koha_marc_update_bib_ids is meant to be an internal function
+	# 2. Making sure that the biblionumber and biblioitemnumber are correct and
+	#    present in the MARC::Record object ought to be part of GetMarcBiblio.
+	#
+	# On the other hand, this better for now than what rebuild_zebra.pl used to
+	# do, which was duplicate the code for inserting the biblionumber 
+	# and biblioitemnumber
+	C4::Biblio::_koha_marc_update_bib_ids($marc, '', $biblionumber, $biblioitemnumber);
+
+	return 1;
+}
+
+sub fix_authority_id {
+	# FIXME - as with fix_biblio_ids, the authid must be present
+	#         for Zebra's sake.  However, this really belongs
+	#         in C4::AuthoritiesMarc.
+	my ($marc, $authid) = @_;
+	unless ($marc->field('001') and $marc->field('001')->data() eq $authid){
+		$marc->delete_field($marc->field('001'));
+		$marc->insert_fields_ordered(MARC::Field->new('001',$authid));
+	}
+}
+
+sub fix_unimarc_100 {
+	# FIXME - again, if this is necessary, it belongs in C4::AuthoritiesMarc.
+	my $marc = shift;
+
+	my $string;
+	if ( length($marc->subfield( 100, "a" )) == 35 ) {
+		$string = $marc->subfield( 100, "a" );
+		my $f100 = $marc->field(100);
+		$marc->delete_field($f100);
+	}
+	else {
+		$string = POSIX::strftime( "%Y%m%d", localtime );
+		$string =~ s/\-//g;
+		$string = sprintf( "%-*s", 35, $string );
+	}
+	substr( $string, 22, 6, "frey50" );
+	unless ( length($marc->subfield( 100, "a" )) == 35 ) {
+		$marc->delete_field($marc->field(100));
+		$marc->insert_grouped_field(MARC::Field->new( 100, "", "", "a" => $string ));
+	}
+}
+
+=head2 DoIndexing
+
+  ( $error_code ) = &DoIndexing($record_type,$op,$record_dir,$record_format);
+
+returns the corresponding zebraidx error code
+
+Comment :
+$record_type can be 'biblio' or 'authority'
+$zaction can be 'delete' or 'update'
+$record_dir is the directory where the exported records are
+$record_format can be 'marcxml' or 'iso2709'
+
+=cut
+
+sub DoIndexing {
+	my ($record_type, $zaction, $record_dir, $record_format) = @_;
+	my $zebra_server	= ($record_type eq 'biblio') ? 'biblioserver' : 'authorityserver';
+	my $zebra_db_name	= ($record_type eq 'biblio') ? 'biblios' : 'authorities';
+	my $zebra_config	= C4::Context->zebraconfig($zebra_server)->{'config'};
+	my $zebra_db_dir	= C4::Context->zebraconfig($zebra_server)->{'directory'};
+	my $noshadow		= (C4::Context->preference('ZebraNoshadow')) ? '-n' : '';
+	my $zebraidx_log_opt		= " -v none,fatal ";
+
+	# TODO better error handling!!
+	system("zebraidx -c $zebra_config $zebraidx_log_opt $noshadow -g $record_format -d $zebra_db_name $zaction $record_dir");
+	system("zebraidx -c $zebra_config $zebraidx_log_opt -g $record_format -d $zebra_db_name commit") unless $noshadow;
+	# /TODO
+	
+	return 1;
+}
+
+
+END { }
+
+1;
+__END__
+
+=head1 AUTHOR
+
+Koha Development Team <http://koha-community.org/>
+
+Tomas Cohen Arazi tomascohen at gmail.com
+
+=cut
diff --git a/installer/data/mysql/atomicupdate/bug_5166-zebraqueuedaemon_is_back.pl b/installer/data/mysql/atomicupdate/bug_5166-zebraqueuedaemon_is_back.pl
new file mode 100644
index 0000000..e7e92df
--- /dev/null
+++ b/installer/data/mysql/atomicupdate/bug_5166-zebraqueuedaemon_is_back.pl
@@ -0,0 +1,14 @@
+#! /usr/bin/perl
+use strict;
+use warnings;
+use C4::Context;
+my $dbh=C4::Context->dbh;
+
+
+$dbh->do("INSERT INTO `systempreferences` (variable,value,explanation,options,type) VALUES ('ZebraUseXml','1','Tell Zebra to use MARCXML instead of ISO2907 for indexing. Very important for libraries with records bigger than the allowed by ISO2907 (e.g. with lots of items in a single record).',NULL,'YesNo')");
+$dbh->do("INSERT INTO `systempreferences` (variable,value,explanation,options,type) VALUES ('ZebraNoshadow','0','Tell Zebra to use shadow records when updating. Prevents locking on records while updating the database. Refer to zebra documentation for more info on the drawbacks.',NULL,'YesNo')");
+$dbh->do("INSERT INTO `systempreferences` (variable,value,explanation,options,type) VALUES ('ZebraqueueVerboseLogging','0','Tell zebraqueue daemon to be more verbose on logging.',NULL,'YesNo')");
+$dbh->do("INSERT INTO `systempreferences` (variable,value,explanation,options,type) VALUES ('ZebraBiblioUpdateRatio','6','By default, tell zebraqueue daemon to search for updates every ZebraBiblioUpdateRatio*ZebraAuthUpdateRatio seconds in the biblios database.',NULL,'Integer')");
+$dbh->do("INSERT INTO `systempreferences` (variable,value,explanation,options,type) VALUES ('ZebraAuthUpdateRatio','10','By default, tell zebraqueue daemon to search for updates every ZebraAuthUpdateRatio seconds in the authorities  database.',NULL,'Integer')");
+
+print "Upgrade done (Add sysprefs to control zebraqueue_daemon scripts: ZebraUseXml, ZebraNoshadow, ZebraqueueVerboseLogging, ZebraBiblioUpdateRatio, ZebraAuthUpdateRatio)\n";
diff --git a/installer/data/mysql/sysprefs.sql b/installer/data/mysql/sysprefs.sql
index 128c9b2..465c14f 100755
--- a/installer/data/mysql/sysprefs.sql
+++ b/installer/data/mysql/sysprefs.sql
@@ -328,4 +328,9 @@ INSERT INTO systempreferences (variable,value,explanation,options,type) VALUES('
 INSERT INTO `systempreferences` (variable,value,explanation,options,type) VALUES ('OpacKohaUrl','1',"Show 'Powered by Koha' text on OPAC footer.",NULL,NULL);
 INSERT INTO `systempreferences` (variable,value,explanation,options,type) VALUES('EasyAnalyticalRecords','0','If on, display in the catalogue screens tools to easily setup analytical record relationships','','YesNo');
 INSERT INTO systempreferences (variable,value,explanation,options,type) VALUES('OpacShowRecentComments',0,'If ON a link to recent comments will appear in the OPAC masthead',NULL,'YesNo');
+INSERT INTO `systempreferences` (variable,value,explanation,options,type) VALUES ('ZebraUseXml','1','Tell Zebra to use MARCXML instead of ISO2907 for indexing. Very important for libraries with records bigger than the allowed by ISO2907 (e.g. with lots of items in a single record).',NULL,'YesNo');
+INSERT INTO `systempreferences` (variable,value,explanation,options,type) VALUES ('ZebraNoshadow','0','Tell Zebra to use shadow records when updating. Prevents locking on records while updating the database. Refer to zebra documentation for more info on the drawbacks.',NULL,'YesNo');
+INSERT INTO `systempreferences` (variable,value,explanation,options,type) VALUES ('ZebraqueueVerboseLogging','0','Tell zebraqueue daemon to be more verbose on logging.',NULL,'YesNo');
+INSERT INTO `systempreferences` (variable,value,explanation,options,type) VALUES ('ZebraBiblioUpdateRatio','6','By default, tell zebraqueue daemon to search for updates every ZebraBiblioUpdateRatio*ZebraAuthUpdateRatio seconds in the biblios database.',NULL,'Integer');
+INSERT INTO `systempreferences` (variable,value,explanation,options,type) VALUES ('ZebraAuthUpdateRatio','10','By default, tell zebraqueue daemon to search for updates every ZebraAuthUpdateRatio seconds in the authorities database.',NULL,'Integer');
 
diff --git a/koha-tmpl/intranet-tmpl/prog/en/modules/admin/preferences/searching.pref b/koha-tmpl/intranet-tmpl/prog/en/modules/admin/preferences/searching.pref
index 12075ec..ab62c5c 100644
--- a/koha-tmpl/intranet-tmpl/prog/en/modules/admin/preferences/searching.pref
+++ b/koha-tmpl/intranet-tmpl/prog/en/modules/admin/preferences/searching.pref
@@ -69,6 +69,39 @@ Searching:
                   yes: Include
                   no: "Don't include"
             - subdivisions for searches generated by clicking on subject tracings.
+        -
+            - pref: ZebraUseXml
+              type: boolean
+              choices:
+                  no: "Don't use"
+                  yes: Use
+            - MARCXML instead of ISO2907 for indexing in Zebra. Very important for libraries with records bigger than the allowed by ISO2907 (e.g. with lots of items in a single record).
+        -
+            - pref: ZebraNoshadow
+              type: boolean
+              choices:
+                  yes: "Don't use"
+                  no: Use
+            - shadow records when updating Zebra indexes. Prevents locking on records while updating the database. Refer to zebra documentation for more info on the drawbacks.
+        -
+            - Set to
+            - pref: ZebraAuthUpdateRatio
+              class: integer
+              default: 10
+            - seconds the interval for searching authority record updates.
+        -
+            - Set to
+            - pref: ZebraBiblioUpdateRatio
+              class: integer
+              default: 6
+            - x ZebraAuthUpdateRatio seconds the interval for searching record updates.
+        -
+            - pref: ZebraqueueVerboseLogging
+              type: boolean
+              choices:
+                no: "Don't use"
+                yes: Use
+            -  verbose logging in zebraqueue daemon.
     Search Form:
         -
             - Show checkboxes to search by
diff --git a/misc/bin/koha-zebraqueue-ctl.sh b/misc/bin/koha-zebraqueue-ctl.sh
index d3444f0..4f3b26b 100755
--- a/misc/bin/koha-zebraqueue-ctl.sh
+++ b/misc/bin/koha-zebraqueue-ctl.sh
@@ -23,15 +23,15 @@ fi
 case "$1" in
     start)
       echo "Starting Zebraqueue Daemon"
-      daemon --name=$NAME --errlog=$ERRLOG --stdout=$STDOUT --output=$OUTPUT --verbose=1 --respawn --delay=30 $OTHERUSER -- perl -I $PERL5LIB $ZEBRAQUEUE -f $KOHA_CONF 
+      daemon --name=$NAME --errlog=$ERRLOG --stdout=$STDOUT --output=$OUTPUT --verbose=1 --respawn --delay=30 $OTHERUSER -- perl -I $PERL5LIB $ZEBRAQUEUE
       ;;
     stop)
       echo "Stopping Zebraqueue Daemon"
-      daemon --name=$NAME --errlog=$ERRLOG --stdout=$STDOUT --output=$OUTPUT --verbose=1 --respawn --delay=30 $OTHERUSER --stop -- perl -I $PERL5LIB $ZEBRAQUEUE -f $KOHA_CONF 
+      daemon --name=$NAME --errlog=$ERRLOG --stdout=$STDOUT --output=$OUTPUT --verbose=1 --respawn --delay=30 $OTHERUSER --stop -- perl -I $PERL5LIB $ZEBRAQUEUE
       ;;
     restart)
       echo "Restarting the Zebraqueue Daemon"
-      daemon --name=$NAME --errlog=$ERRLOG --stdout=$STDOUT --output=$OUTPUT --verbose=1 --respawn --delay=30 $OTHERUSER --restart -- perl -I $PERL5LIB $ZEBRAQUEUE -f $KOHA_CONF 
+      daemon --name=$NAME --errlog=$ERRLOG --stdout=$STDOUT --output=$OUTPUT --verbose=1 --respawn --delay=30 $OTHERUSER --restart -- perl -I $PERL5LIB $ZEBRAQUEUE
       ;;
     *)
       echo "Usage: /etc/init.d/$NAME {start|stop|restart}"
diff --git a/misc/bin/zebraqueue_daemon.pl b/misc/bin/zebraqueue_daemon.pl
index 6181a94..94525d7 100755
--- a/misc/bin/zebraqueue_daemon.pl
+++ b/misc/bin/zebraqueue_daemon.pl
@@ -1,475 +1,107 @@
-#!/usr/bin/perl
+#!/usr/bin/perl -w
 
-# daemon to watch the zebraqueue and update zebra as needed
+# This file is part of Koha.
+#
+# Koha is free software; you can redistribute it and/or modify it under the
+# terms of the GNU General Public License as published by the Free Software
+# Foundation; either version 2 of the License, or (at your option) any later
+# version.
+#
+# Koha is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
+# A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
+# Suite 330, Boston, MA  02111-1307 USA
+
+# Writen 02/03/2011 by Tomas Cohen Arazi (tomascohen at gmail.com)
+#                      Universidad Nacional de Cordoba / Argentina
+
+# Daemon to watch the zebraqueue table and update zebra indexes as needed
 
 use strict;
-#use warnings; FIXME - Bug 2505
 BEGIN {
     # find Koha's Perl modules
     # test carefully before changing this
     use FindBin;
-    eval { require "$FindBin::Bin/kohalib.pl" };
+    eval { require "$FindBin::Bin/../kohalib.pl" };
 }
-
-use POE qw(Wheel::SocketFactory Wheel::ReadWrite Filter::Stream Driver::SysRW);
-use Unix::Syslog qw(:macros);
-
+use POE;
+use Time::HiRes qw(time);
 use C4::Context;
-use C4::Biblio;
-use C4::Search;
-use C4::AuthoritiesMarc;
-use XML::Simple;
-use POSIX;
-use utf8;
-
-
-# wait periods governing connection attempts
-my $min_connection_wait =    1; # start off at 1 second
-my $max_connection_wait = 1024; # max about 17 minutes
-
-# keep separate wait period for bib and authority Zebra databases
-my %zoom_connection_waits = (); 
-
-my $db_connection_wait = $min_connection_wait;
-
-# ZOOM and Z39.50 errors that are potentially
-# resolvable by connecting again and retrying
-# the operation
-my %retriable_zoom_errors = (
-    10000 => 'ZOOM_ERROR_CONNECT',
-    10001 => 'ZOOM_ERROR_MEMORY',
-    10002 => 'ZOOM_ERROR_ENCODE',
-    10003 => 'ZOOM_ERROR_DECODE',
-    10004 => 'ZOOM_ERROR_CONNECTION_LOST',
-    10005 => 'ZOOM_ERROR_INIT',
-    10006 => 'ZOOM_ERROR_INTERNAL',
-    10007 => 'ZOOM_ERROR_TIMEOUT',
-);
-
-# structure to store updates that have
-# failed and are to be retrieved.  The
-# structure is a hashref of hashrefs, 
-# e.g.,
-#
-# $postoned_updates->{$server}->{$record_number} = 1;
-#
-# If an operation is attempted and fails because
-# of a retriable error (see above), the daemon
-# will try several times to recover as follows:
-#
-# 1. close and reopen the connection to the
-#    Zebra server, unless the error was a timeout,
-#    in which case
-# 2. retry the operation
-#
-# If, after trying this five times, the operation still
-# fails, the daemon will mark the record number as
-# postponed, and try to process other entries in 
-# zebraqueue.  When an update is postponed, the 
-# error will be reported to syslog. 
-#
-# If more than 100 postponed updates are 
-# accumulated, the daemon will assume that 
-# something is seriously wrong, complain loudly,
-# and abort.  If running under the daemon(1) command, 
-# this means that the daemon will respawn.
-#
-my $num_postponed_updates = 0;
-my $postponed_updates = {};
-
-my $max_operation_attempts =   5;
-my $max_postponed_updates  = 100;
-
-# Zebra connection timeout
-my $zconn_timeout            =  30;
-my $zconn_timeout_multiplier = 1.5;
-my $max_zconn_timeout        = 120;
-
-my $ident = "Koha Zebraqueue ";
-
-my $debug = 0;
-Unix::Syslog::openlog $ident, LOG_PID, LOG_LOCAL0;
-
-Unix::Syslog::syslog LOG_INFO, "Starting Zebraqueue log at " . scalar localtime(time) . "\n";
-
-sub handler_start {
-
-    # Starts session. Only ever called once only really used to set an alias
-    # for the POE kernel
-    my ( $kernel, $heap, $session ) = @_[ KERNEL, HEAP, SESSION ];
-
-    my $time = localtime(time);
-    Unix::Syslog::syslog LOG_INFO, "$time POE Session ", $session->ID, " has started.\n";
-
-    # check status
-#    $kernel->yield('status_check');
-    $kernel->yield('sleep');
-}
-
-sub handler_sleep {
-
-    # can be used to slow down loop execution if needed
-    my ( $kernel, $heap, $session ) = @_[ KERNEL, HEAP, SESSION ];
-    use Time::HiRes qw (sleep);
-    Time::HiRes::sleep(0.5);
-    #sleep 1;
-    $kernel->yield('status_check');
-}
-
-sub handler_check {
-    # check if we need to do anything, at the moment just checks the zebraqueue, it could check other things too
-    my ( $kernel, $heap, $session ) = @_[ KERNEL, HEAP, SESSION ];
-    my $dbh = get_db_connection();
-    my $sth = $dbh->prepare("SELECT count(*) AS opcount FROM zebraqueue WHERE done = 0");
-    $sth->execute;
-    my $data = $sth->fetchrow_hashref();
-    if ($data->{'opcount'} > 0) {
-        Unix::Syslog::syslog LOG_INFO, "$data->{'opcount'} operations waiting to be run\n";
-        $sth->finish();
-        $dbh->commit(); # needed so that we get current state of zebraqueue next time
-                        # we enter handler_check
-        $kernel->yield('do_ops');
-    }
-    else {
-        $sth->finish();
-        $dbh->commit(); # needed so that we get current state of zebraqueue next time
-                        # we enter handler_check
-        $kernel->yield('sleep');
-    }
-}
-
-sub zebraop {
-    # execute operations waiting in the zebraqueue
-    my ( $kernel, $heap, $session ) = @_[ KERNEL, HEAP, SESSION ];
-    my $dbh = get_db_connection();
-    my $readsth = $dbh->prepare("SELECT id, biblio_auth_number, operation, server FROM zebraqueue WHERE done = 0 ORDER BY id DESC");
-    $readsth->execute();
-    Unix::Syslog::syslog LOG_INFO, "Executing zebra operations\n";
-
-    my $completed_updates = {};
-    ZEBRAQUEUE: while (my $data = $readsth->fetchrow_hashref()) {
-        warn "Inside while loop" if $debug;
-
-        my $id = $data->{'id'};
-        my $op = $data->{'operation'};
-        $op = 'recordDelete' if $op =~ /delete/i; # delete ops historically have been coded
-                                                  # either delete_record or recordDelete
-        my $record_number = $data->{'biblio_auth_number'};
-        my $server = $data->{'server'};
-
-        next ZEBRAQUEUE if exists $postponed_updates->{$server}->{$record_number};
-        next ZEBRAQUEUE if exists $completed_updates->{$server}->{$record_number}->{$op};
-
-        my $ok = 0;
-        my $record;
-        if ($op eq 'recordDelete') {
-            $ok = process_delete($dbh, $server, $record_number);
-        }
-        else {
-            $ok = process_update($dbh, $server, $record_number, $id);
-        }
-        if ($ok == 1) {
-            mark_done($dbh, $record_number, $op, $server);
-            $completed_updates->{$server}->{$record_number}->{$op} = 1;
-            if ($op eq 'recordDelete') {
-                $completed_updates->{$server}->{$record_number}->{'specialUpdate'} = 1;
-            }
-        }                            
-    }
-    $readsth->finish();
-    $dbh->commit();
-    $kernel->yield('sleep');
-}
-
-sub process_delete {
-    my $dbh = shift;
-    my $server = shift;
-    my $record_number = shift;
-
-    my $record;
-    my $ok = 0;
-    eval {
-        warn "Searching for record to delete" if $debug;
-        # 1st read the record in zebra, we have to get it from zebra as its no longer in the db
-        my $Zconn =  get_zebra_connection($server);
-        my $results = $Zconn->search_pqf( '@attr 1=Local-number '.$record_number);
-        $results->option(elementSetName => 'marcxml');
-        $record = $results->record(0)->raw();
-    };
-    if ($@) {
-        # this doesn't exist, so no need to wail on zebra to delete it
-        if ($@->code() eq 13) {
-            $ok = 1;
-        } else {
-            # caught a ZOOM::Exception
-            my $message = _format_zoom_error_message($@);
-            postpone_update($server, $record_number, $message);
-        }
-    } else {
-        # then, delete the record
-        warn "Deleting record" if $debug;
-        $ok = zebrado($record, 'recordDelete', $server, $record_number);
-    }
-    return $ok;
-}
-
-sub process_update {
-    my $dbh = shift;
-    my $server = shift;
-    my $record_number = shift;
-    my $id = shift;
-
-    my $record;
-    my $ok = 0;
-
-    warn "Updating record" if $debug;
-    # get the XML
-    my $marcxml;
-    if ($server eq "biblioserver") {
-        my $marc = GetMarcBiblio($record_number);
-        $marcxml = $marc->as_xml_record() if $marc;
-    } 
-    elsif ($server eq "authorityserver") {
-        $marcxml = C4::AuthoritiesMarc::GetAuthorityXML($record_number);
-    }
-    # check it's XML, just in case
-    eval {
-        my $hashed = XMLin($marcxml);
-    }; ### is it a proper xml? broken xml may crash ZEBRA- slow but safe
-    ## it's Broken XML-- Should not reach here-- but if it does -lets protect ZEBRA
-    if ($@) {
-        Unix::Syslog::syslog LOG_ERR, "$server record $record_number is malformed: $@";
-        mark_done_by_id($dbh, $id, $server);
-        $ok = 0;
-    } else {
-        # ok, we have everything, do the operation in zebra !
-        $ok = zebrado($marcxml, 'specialUpdate', $server, $record_number);
-    }
-    return $ok;
-}
+use C4::Catalog::Zebra;
 
-sub mark_done_by_id {
-    my $dbh = shift;
-    my $id = shift;
-    my $server = shift;
-    my $delsth = $dbh->prepare("UPDATE zebraqueue SET done = 1 WHERE id = ? AND server = ? AND done = 0");
-    $delsth->execute($id, $server);
-}
-
-sub mark_done {
-    my $dbh = shift;
-    my $record_number = shift;
-    my $op = shift;
-    my $server = shift;
-
-    my $delsth;
-    if ($op eq 'recordDelete') {
-        # if it's a deletion, we can delete every request on this biblio : in case the user
-        # did a modif (or item deletion) just before biblio deletion, there are some specialUpdate
-        # that are pending and can't succeed, as we don't have the XML anymore
-        # so, delete everything for this biblionumber
-        $delsth = $dbh->prepare_cached("UPDATE zebraqueue SET done = 1 
-                                        WHERE biblio_auth_number = ? 
-                                        AND server = ?
-                                        AND done = 0");
-        $delsth->execute($record_number, $server);
-    } else {
-        # if it's not a deletion, delete every pending specialUpdate for this biblionumber
-        # in case the user add biblio, then X items, before this script runs
-        # this avoid indexing X+1 times where just 1 is enough.
-        $delsth = $dbh->prepare("UPDATE zebraqueue SET done = 1 
-                                 WHERE biblio_auth_number = ? 
-                                 AND operation = 'specialUpdate'
-                                 AND server = ?
-                                 AND done = 0");
-        $delsth->execute($record_number, $server);
-    }
-}
 
-sub zebrado {
-    ###Accepts a $server variable thus we can use it to update  biblios, authorities or other zebra dbs
-    my ($record, $op, $server, $record_number) = @_;
+my $authUpdateRatio;
+my $biblioUpdateRatio;
+my $tickCounter;
 
-    unless ($record) {
-        my $message = "error updating index for $server $record $record_number: no source record";
-        postpone_update($server, $record_number, $message);
-        return 0;
-    }
 
-    my $attempts = 0;
-    my $ok = 0;
-    ATTEMPT: while ($attempts < $max_operation_attempts) {
-        $attempts++;
-        warn "Attempt $attempts for $op for $server $record_number" if $debug;
-        my $Zconn = get_zebra_connection($server);
+sub handler_start
+{
+	my ( $kernel, $heap, $session ) = @_[ KERNEL, HEAP, SESSION ];
+	my $time = localtime(time);
 
-        my $Zpackage = $Zconn->package();
-        $Zpackage->option(action => $op);
-        $Zpackage->option(record => $record);
+	print "$time Zebraqueue daemon started\n";
 
-        eval { $Zpackage->send("update") };
-        if ($@ && $@->isa("ZOOM::Exception")) {
-            my $message = _format_zoom_error_message($@);
-            my $error = $@->code();
-            if (exists $retriable_zoom_errors{$error}) {
-                warn "reattempting operation $op for $server $record_number" if $debug;
-                warn "last Zebra error was $message" if $debug;
-                $Zpackage->destroy();
-
-                if ($error == 10007 and $zconn_timeout < $max_zconn_timeout) {
-                    # bump up connection timeout
-                    $zconn_timeout = POSIX::ceil($zconn_timeout * $zconn_timeout_multiplier);
-                    $zconn_timeout = $max_zconn_timeout if $zconn_timeout > $max_zconn_timeout;
-                    Unix::Syslog::syslog LOG_INFO, "increased Zebra connection timeout to $zconn_timeout\n";
-                    warn "increased Zebra connection timeout to $zconn_timeout" if $debug;
-                }
-                next ATTEMPT;
-            } else {
-                postpone_update($server, $record_number, $message);
-            }
-        }
-        # FIXME - would be more efficient to send a ES commit
-        # after a batch of records, rather than commiting after
-        # each one - Zebra handles updates relatively slowly.
-        eval { $Zpackage->send('commit'); };
-        if ($@) {
-            # operation succeeded, but commit
-            # did not - we have a problem
-            my $message = _format_zoom_error_message($@);
-            postpone_update($server, $record_number, $message);
-        } else {
-            $ok = 1;
-            last ATTEMPT;
-        }
-    }
-
-    unless ($ok) {
-        my $message = "Made $attempts attempts to index $server record $record_number without success";
-        postpone_update($server, $record_number, $message);
-    }
-
-    return $ok; 
-}
+	# Initialize counter
+	$tickCounter  = 0;
 
-sub postpone_update {
-    my ($server, $record_number, $message) = @_;
-    warn $message if $debug;
-    $message .= "\n" unless $message =~ /\n$/;
-    Unix::Syslog::syslog LOG_ERR, $message;
-    $postponed_updates->{$server}->{$record_number} = 1;
+	# Get timer settings
+	$authUpdateRatio	= (C4::Context->preference("ZebraAuthUpdateRatio")||10);
+	$biblioUpdateRatio	= (C4::Context->preference("ZebraBiblioUpdateRatio")||6);
 
-    $num_postponed_updates++;
-    if ($num_postponed_updates > $max_postponed_updates) {
-        warn "exiting, over $max_postponed_updates postponed indexing updates";
-        Unix::Syslog::syslog LOG_ERR, "exiting, over $max_postponed_updates postponed indexing updates";
-        Unix::Syslog::closelog;
-        exit;
-    }
-}
+	# Log
+	my $authPrefsString = (C4::Context->preference("ZebraAuthUpdateRatio") ? 'syspref' : 'default');
+	print "$time Authorities update ratio (secs): $authUpdateRatio ($authPrefsString)\n";
+	my $biblioUpdateSecs = $biblioUpdateRatio * $authUpdateRatio;
+	my $biblioPrefsString = (C4::Context->preference("ZebraBiblioUpdateRatio") ? 'syspref' : 'default');
+	print "$time Biblios update ratio (secs): $biblioUpdateSecs ($biblioPrefsString)\n";
 
-sub handler_stop {
-    my $heap = $_[HEAP];
-    my $time = localtime(time);
-    Unix::Syslog::syslog LOG_INFO, "$time Session ", $_[SESSION]->ID, " has stopped.\n";
-    delete $heap->{session};
+	$kernel->delay(tick => $authUpdateRatio);
 }
 
-# get a DB connection
-sub get_db_connection {
-    my $dbh;
-
-    $db_connection_wait = $min_connection_wait unless defined $db_connection_wait;
-    while (1) {
-        eval {
-            # note that C4::Context caches the
-            # DB handle; C4::Context->dbh() will
-            # check that handle first before returning
-            # it.  If the connection is bad, it
-            # then tries (once) to create a new one.
-            $dbh = C4::Context->dbh();
-        };
 
-        unless ($@) {
-            # C4::Context->dbh dies if it cannot
-            # establish a connection
-            $db_connection_wait = $min_connection_wait;
-            $dbh->{AutoCommit} = 0; # do this to reduce number of
-                                    # commits to zebraqueue
-            return $dbh;
-        }
-
-        # connection failed
-        my $error = "failed to connect to DB: $DBI::errstr";
-        warn $error if $debug;
-        Unix::Syslog::syslog LOG_ERR, $error;
-        sleep $db_connection_wait;
-        $db_connection_wait *= 2 unless $db_connection_wait >= $max_connection_wait;
-    }
+sub handler_stop
+{
+	my $heap = $_[HEAP];
+	my $time = localtime(time);
+	# Log
+	print "$time Zebraqueue daemon stopped - POE Session ended\n";
+	delete $heap->{session};
 }
 
-# get a Zebra connection
-sub get_zebra_connection {
-    my $server = shift;
-
-    # start connection retry wait queue if necessary
-    $zoom_connection_waits{$server} = $min_connection_wait unless exists  $zoom_connection_waits{$server};
 
-    # try to connect to Zebra forever until we succeed
-    while (1) {
-        # what follows assumes that C4::Context->Zconn 
-        # makes only one attempt to create a new connection;
-        my $Zconn = C4::Context->Zconn($server, 0, 1, '', 'xml');
-        $Zconn->option('timeout' => $zconn_timeout);
+sub handler_tick
+{
+	my ( $kernel, $heap, $session ) = @_[ KERNEL, HEAP, SESSION ];
+	my $ret = 0;
+	$tickCounter  = $tickCounter + 1;
 
-        # it is important to note that if the existing connection
-        # stored by C4::Context has an error (any type of error)
-        # from the last transaction, C4::Context->Zconn closes
-        # it and establishes a new one.  Therefore, the
-        # following check will succeed if we have a new, good 
-        # connection or we're using a previously established
-        # connection that has experienced no errors.
-        if ($Zconn->errcode() == 0) {
-            $zoom_connection_waits{$server} = $min_connection_wait;
-            return $Zconn;
-        }
 
-        # connection failed
-        my $error = _format_zoom_error_message($Zconn);
-        warn $error if $debug;
-        Unix::Syslog::syslog LOG_ERR, $error;
-        sleep $zoom_connection_waits{$server};
-        $zoom_connection_waits{$server} *= 2 unless $zoom_connection_waits{$server} >= $max_connection_wait;
-    }
-}
-
-# given a ZOOM::Exception or
-# ZOOM::Connection object, generate
-# a human-reaable error message
-sub _format_zoom_error_message {
-    my $err = shift;
+	# Calculate if we have to update biblios too
+	# Check: biblioUpdateRatio ?= tickCounter
+	if ($biblioUpdateRatio == $tickCounter) {
+		# Update biblios and auths
+		$ret = C4::Catalog::Zebra::UpdateAuthsAndBiblios();
+		# Reset counter
+		$tickCounter  = 0;
+	} else {
+		# Update only auths
+		$ret = C4::Catalog::Zebra::UpdateAuths();
+	}
 
-    my $message = "";
-    if (ref($err) eq 'ZOOM::Connection') {
-        $message = $err->errmsg() . " (" . $err->diagset . " " . $err->errcode() . ") " . $err->addinfo();
-    } elsif (ref($err) eq 'ZOOM::Exception') {
-        $message = $err->message() . " (" . $err->diagset . " " .  $err->code() . ") " . $err->addinfo();
-    }
-    return $message; 
+	$kernel->delay(tick => $authUpdateRatio);
 }
 
 POE::Session->create(
-    inline_states => {
-        _start       => \&handler_start,
-        sleep        => \&handler_sleep,
-        status_check => \&handler_check,
-        do_ops       => \&zebraop,
-        _stop        => \&handler_stop,
-    },
+	inline_states => {
+		_start       => \&handler_start,
+		tick         => \&handler_tick,
+		_stop        => \&handler_stop,
+	},
 );
 
-# start the kernel
-$poe_kernel->run();
-
-Unix::Syslog::closelog;
-
-exit;
+POE::Kernel->run();
+exit 0;
-- 
1.7.0.4




More information about the Patches mailing list