Jump to content
Wikimedia Meta-Wiki

PDF doc search II

From Meta, a Wikimedia project coordination wiki
This article is considered of unknown usefulness and may be a candidate for deletion.
If you want to revive discussion regarding the subject, you may try using the talk page or start a discussion at Meta:Babel.

Introduction

[edit ]

I edited includes/SpecialSearch.php to make the uploaded PDFs searchable. This contribution is only suitable for a small number of PDFs. When creating the PDFs, don't enable text and image compression (e.g. in Acrobat Distiller), because the scripts can only read uncompressed text. SpecialSearch.php starts two shell scripts that use AWK (gawk). These scripts convert the PDFs into strings, and these strings can then be searched from the wiki search form. Put the scripts searchAllPDFs.sh and searchPDF.sh in your MediaWiki includes/ folder. Tested under mediawiki 1.54 and LAMP.

See PDF doc search for the older, unmaintained code.

Authors

[edit ]

Bernd Flunger and Armin Lanzinger User:elai

Installation

[edit ]
  1. In SpecialSearch.php copy and paste the contribution from Edit SpecialSearch.php .
  2. You have to edit this line
  3. $output = shell_exec('includes/searchAllPDFs.sh -i '. $s . ' /home/xyz/mediawiki/images'); (Note: change /home/xyz/mediawiki/images to your real server path.)
  4. Edit Special:Allmessages
  5. In MediaWiki namespace you have to add 2 words:
  6. Pdfmatches and Nopdfmatches (http://xyz/mediawiki/index.php/MediaWiki:Nopdfmatches and http://xyz/mediawiki/index.php/MediaWiki:Pdfmatches). For English, enter "Matches in PDFs" in Pdfmatches and "No matches in PDFs" in Nopdfmatches. For German, enter "Übereinstimmungen in PDFs" in Pdfmatches and "Keine Übereinstimmungen in PDFs" in Nopdfmatches.
  7. User rights
  8. The files searchAllPDFs.sh and searchPDF.sh have to be readable and executable by your Apache user.

Edit SpecialSearch.php

[edit ]

Put the content after line 199:
(Note: Contribution starts with ### line)
(Note: replace www.xyz.com with your domain and replace /home/xyz/ with your MediaWiki path)

 # Existing SpecialSearch.php code, shown only to locate the insertion point:
 # when a title-match result exists, print either the 'titlematches' heading
 # followed by the rendered matches, or the 'notitlematches' heading.
 if( $titleMatches ) {
 if( $titleMatches->numRows() ) {
 $wgOut->addWikiText( '==' . wfMsg( 'titlematches' ) . "==\n" );
 $wgOut->addHTML( $this->showMatches( $titleMatches ) );
 } else {
 $wgOut->addWikiText( '==' . wfMsg( 'notitlematches' ) . "==\n" );
 }
 }
######################################
######################################
#####
##### contribution PDF doc search II
#####
 # Runs includes/searchAllPDFs.sh over the upload directory with the current
 # search term ($term) and prints the matching PDFs through $wgOut:
 # a 'pdfmatches' heading, a hit count and an ordered list of links when at
 # least one PDF matched, otherwise only the 'nopdfmatches' heading.
 # PDFs whose directory path contains 'archiv' or 'temp' are not listed.
 $s = $term;
 # Adjust this path (and the www.xyz.com URLs below) to your installation.
 $pdfRoot = '/home/xyz/mediawiki/images';

 # The term comes from the search form, so it must be quoted for the shell;
 # unquoted it would allow arbitrary command execution.
 $output = shell_exec( 'includes/searchAllPDFs.sh -i ' . escapeshellarg( $s ) . ' ' . escapeshellarg( $pdfRoot ) );

 # One path per line. PREG_SPLIT_NO_EMPTY drops the empty element produced by
 # the trailing newline (and copes with shell_exec returning no output at all),
 # so no manual count correction is needed afterwards.
 $output_array = preg_split( '/[\n\r]+/', (string)$output, -1, PREG_SPLIT_NO_EMPTY );

 $ausgabe_array = array();
 foreach ( $output_array as $pdfPath ) {
  $filename = basename( $pdfPath );
  $dirname = dirname( $pdfPath );
  if ( strpos( $dirname, 'archiv' ) !== false || strpos( $dirname, 'temp' ) !== false ) {
   continue;
  }
  # Turn the file system path into the path below the images URL.
  $dirname = str_replace( $pdfRoot, "", $dirname );
  # File names end up inside HTML attributes and text, so escape them.
  $filename = htmlspecialchars( $filename );
  $dirname = htmlspecialchars( $dirname );
  $ausgabe_array[] = "<li>(<a href=\"http://www.xyz.com/mediawiki/index.php/Bild:$filename\">Beschreibung</a>) <a href=\"http://www.xyz.com/mediawiki/images$dirname/$filename\">$filename</a></li>";
 }

 $realn = count( $ausgabe_array );
 if ( $realn > 0 ) {
  $wgOut->addWikiText( '==' . wfMsg( 'pdfmatches' ) . "==\n" );
  # The search term is user input: escape it before echoing it back.
  $sHtml = htmlspecialchars( $s );
  $PDFsuchabfrage = "<p></p>\n Ihr Suchbegriff wurde <b>$realn</b> mal gefunden. Ihr Suchbegriff: <b>$sHtml</b> ";
  $wgOut->addHTML( "$PDFsuchabfrage<br /> " );
  # Open and close the list in the same branch so the markup stays balanced.
  $wgOut->addHTML( " <ol start='1' style='none'>" );
  foreach ( $ausgabe_array as $ausgabe ) {
   $wgOut->addHTML( "$ausgabe\n" );
  }
  $wgOut->addHTML( "</ol>" );
 } else {
  $wgOut->addWikiText( '==' . wfMsg( 'nopdfmatches' ) . "==\n" );
 }
###############################
###############################
 


searchAllPDFs.sh

[edit ]
 
#!/bin/bash
#---------------------------------------------------
# searchAllPDFs.sh - print the path of every PDF below a directory whose
# extracted text matches a search term.
#
# usage: searchAllPDFs.sh [-i] <what> [<where>]
#   -i       ignore case (handed on to searchPDF.sh)
#   <what>   what to search for
#   <where>  directory to search; defaults to ./
#
# Output: one matching file path per line on stdout.
# Exit:   1 when <what> is missing, otherwise the status of find.
#---------------------------------------------------
# searchPDF.sh is expected next to this script. Deriving the directory from
# $0 yields "includes" when called as includes/searchAllPDFs.sh and also
# works from any other working directory.
bindir=$(dirname "$0")
#---------------------------------------------------
ic=""
if [ "$1" = "-i" ]
then
 ic=$1
 shift
fi
#---------------------------------------------------
what=$1
if [ "$what" = "" ]
then
 # Usage goes to stderr so that a caller parsing stdout as a list of file
 # names never mistakes the help text for results.
 {
  echo ""
  echo "usage: $0 [-i] <what> [<where>]"
  echo " -i ... ignore case"
  echo " <what> ... what to search for"
  echo " <where> ... where (directory) to search for"
  echo ""
 } >&2
 exit 1
fi
shift
#---------------------------------------------------
dir=$1
if [ "$dir" = "" ]
then
 dir="./"
fi
#---------------------------------------------------
# -exec acts as a test: -print only fires for files where searchPDF.sh
# exits 0. All expansions are quoted so terms and paths containing spaces
# or glob characters are passed through unchanged; ${ic:+...} adds the
# flag only when it was given.
find "$dir" -name "*pdf" -exec "${bindir}/searchPDF.sh" ${ic:+"$ic"} "$what" {} \; -print
#---------------------------------------------------
 

searchPDF.sh

[edit ]
#!/bin/bash
#----------------------------------------------------------
# searchPDF.sh - test whether the text of one PDF matches a pattern.
#
# usage: searchPDF.sh [-i] <what> <file>
#   -i      ignore case
#   <what>  pattern handed to grep
#   <file>  PDF file to inspect
#
# Exit: 0 when the pattern was found, 1 otherwise (including bad usage).
#
# Text is taken from lines ending in ")Tj" or ")]TJ"; nothing is
# decompressed here, so only text stored in plain form can be found.
#----------------------------------------------------------
ic=""
if [ "$1" = "-i" ]
then
 ic=$1
 shift
fi
#----------------------------------------------------------
what=$1
file=$2
if [ "$what" = "" ] || [ "$file" = "" ]
then
 echo "usage: $0 [-i] <what> <file>" >&2
 exit 1
fi
#----------------------------------------------------------
# Earlier approach, kept for reference:
#strings $file | grep $ic $what |grep $ic -v "/$what" 1>/dev/null 2>&1
gawk 'BEGIN{
 n=0
}
{
 # Line of the form "(text)Tj": drop the leading "(" and the trailing ")Tj".
 if($0 ~ /[)]Tj$/){
  n++
  txt=substr($0,2,length($0)-4)
  print txt
  next
 }
 # Line of the form "[(seg1)...(segN)]TJ": drop "[(" and ")]TJ", then join
 # the segments.
 if($0 ~ /[)]\]TJ$/){
  n++
  txt=substr($0,3,length($0)-6)
  # Placeholders protect escaped "(", ")" and "\" while the segment
  # separators are removed; they are turned back into the plain
  # characters afterwards.
  ka="@klammer-auf@"
  kz="@klammer-zu@"
  kb="@backslash@"
  gsub(/\\\(/,ka,txt)
  gsub(/\\\)/,kz,txt)
  gsub(/\\\\/,kb,txt)
  # Remove everything from a closing parenthesis up to the next opening
  # one, i.e. whatever sits between two consecutive segments.
  gsub("[)][^(]+[(]","",txt)
  gsub(ka,"(",txt)
  gsub(kz,")",txt)
  gsub(kb,"\\",txt)
  print txt
  next
 }
}
END{
 if(n>0){
  exit 0
 }else{
  exit 1
 }
}' "$file" | grep -q ${ic:+"$ic"} -e "$what"
#----------------------------------------------------------
# The pipeline status is the status of grep; any failure is reported as 1.
if [ $? = 0 ]
then
 exit 0
else
 exit 1
fi
#----------------------------------------------------------
 

AltStyle によって変換されたページ (->オリジナル) /