Jump to content
Wikimedia Meta-Wiki

Wikix

From Meta, a Wikimedia project coordination wiki
This is an archived version of this page, as edited by Jmerkey (talk | contribs) at 20:13, 24 April 2007 (Source Code ). It may differ significantly from the current version .

Wikix is a C program that reads any XML dump provided by the Wikimedia Foundation, extracts the names of all images the dump references, and then generates a series of Bash or Bourne-shell style scripts which can be invoked to download those images from Wikimedia Commons and Wikipedia.

The program relies on curl, an automated web spider, to download the referenced images. The program will also convert text-based UTF-8 characters into actual UTF-8 strings for those dumps which may contain improperly formatted names for specific images. The program can be configured to generate 16 parallel scripts which will download all images from Wikipedia.

On a cable modem, the entire set of Wikipedia images can be downloaded in about 36 hours using this program.


Source Code

wikix.c

#include "platform.h"
#ifdef WINDOWS
#define strncasecmp strnicmp
#include "windows.h"
#include "winioctl.h"
#include "winuser.h"
#include "stdarg.h"
typedef UCHAR BYTE;
typedef USHORT WORD;
#include "stdio.h"
#include "stdlib.h"
#include "ctype.h"
#include "conio.h"
#endif
#ifdef LINUX
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <ctype.h>
#include <string.h>
//#include <ncurses.h>
#include <termios.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <pthread.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <net/if.h>
#include <stdio.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sched.h>
#include <ctype.h>
#include <openssl/md5.h>
#endif
/* Number of buckets in each hash table; also the modulus handed to shash(). */
#define NAME_HASH_SIZE 8192
/*
 * One entry in a hash bucket. Entries of a bucket form a doubly linked
 * list (see add_to_hash / remove_from_hash).
 */
typedef struct _hash
{
 struct _hash *next;   /* following entry in the bucket, or 0 at the tail */
 struct _hash *prior;  /* preceding entry in the bucket, or 0 at the head */
 unsigned long len;    /* number of bytes of text that are hashed/compared */
 char *text;           /* the stored name; learn()/imagename() point this at
                          storage allocated directly behind the struct */
} hash;
/*
 * One hash bucket: the two ends of a doubly linked chain of hash entries.
 * A hash table is an array of NAME_HASH_SIZE of these (see init_hash_list).
 */
typedef struct _hash_list {
 hash *head;  /* first entry of the chain, or NULL when the bucket is empty */
 hash *tail;  /* last entry of the chain; new entries are appended here */
} hash_list;
/* Not referenced by the code shown here (main() uses a local of the same name). */
unsigned char buffer[8192 * 4];
/* Values of the -i / -o / -r / -c options; written into the generated
   scripts as IMAGE=, OUTPUT=, IMAGEPATH= and COMMONSPATH=. */
unsigned char ImagePath[512];
unsigned char OutputPath[512];
unsigned char iPath[512];
unsigned char cPath[512];
/* MD5 digest of the image name and of its underscore variant. */
unsigned char md5_out[1024];
unsigned char md5_ulout[1024];
/* Scratch copy of the candidate image name taken from the input text. */
unsigned char wk[8192];
/* Shell-escaped forms of the name (final1) and its underscore variant (final2). */
unsigned char final1[4096];
unsigned char final2[4096];
/* Copy of the name with spaces replaced by underscores. */
unsigned char ulwk[4096];
/* Scratch copy of an "image..."/"map..." keyword fragment. */
unsigned char fwk[4096];
/* Entity-expanded name; only used when UNICODE_EXPANSION is defined. */
unsigned char expand[4096];
/* Not referenced by the code shown here. */
unsigned char html[4096];
/* Per-script output files opened in parallel (-p) mode; a name goes to the
   file selected by the high nibble of the first byte of its MD5 digest. */
FILE *fpl[16];
/* pmode: -p given (write 16 scripts); tree: -t given (copy from a local tree). */
int pmode = 0, tree = 0;
/* Table of names already processed by learn(). */
hash_list *learn_list_head = NULL;
/* Table of keyword fragments already seen by imagename(). */
hash_list *name_list_head = NULL;
/* Count of stored entries: incremented by learn()/imagename(),
   decremented by remove_from_hash(). */
int lobj = 0;
/* Log files image.log, reject.log and fragment.log, opened by main(). */
FILE *imagelog = NULL, *imagereject = NULL, *fragmentlog = NULL;
/*
 * shash - case-insensitive polynomial hash of a byte string.
 *
 * v    string to hash; hashing stops at the first NUL byte
 * len  maximum number of bytes of v to consume
 * M    modulus (number of buckets); must be non-zero when any byte is hashed
 *
 * Returns a value in [0, M). An empty string or len == 0 hashes to 0.
 *
 * Each byte is converted to unsigned char before being handed to tolower():
 * passing a negative plain char (any non-ASCII byte on platforms where char
 * is signed) to a <ctype.h> function is undefined behaviour.
 */
unsigned long shash(char *v, unsigned long len, unsigned long M)
{
    const unsigned long a = 127;
    unsigned long h = 0;
    unsigned long i;

    for (i = 0; i < len && *v; v++, i++)
        h = ((a * h) + (unsigned long)tolower((unsigned char)*v)) % M;
    return h;
}
/*
 * add_to_hash - append an entry to the tail of its bucket.
 *
 * top   hash table (array of NAME_HASH_SIZE buckets)
 * name  entry to link in; its text/len select the bucket
 *
 * Returns 0 on success, (unsigned long)-1 when top is NULL.
 */
unsigned long add_to_hash(hash_list *top, hash *name)
{
    unsigned long slot = shash(name->text, name->len, NAME_HASH_SIZE);
    hash_list *bucket;

    if (slot == (unsigned long) -1)
        return -1;
    if (!top)
        return -1;

    bucket = &top[slot];
    if (bucket->head)
    {
        /* non-empty chain: hook the entry behind the current tail */
        bucket->tail->next = name;
        name->prior = bucket->tail;
    }
    else
    {
        /* empty bucket: the entry becomes the head as well */
        bucket->head = name;
        name->prior = 0;
    }
    name->next = 0;
    bucket->tail = name;
    return 0;
}
/*
 * remove_from_hash - unlink an entry from its bucket.
 *
 * top   hash table (array of NAME_HASH_SIZE buckets)
 * name  entry to unlink; it is not freed here
 *
 * Decrements the global entry counter lobj when it is non-zero.
 * Returns 0 on success, (unsigned long)-1 when top is NULL.
 */
unsigned long remove_from_hash(hash_list *top, hash *name)
{
    unsigned long slot = shash(name->text, name->len, NAME_HASH_SIZE);
    hash_list *bucket;

    if (slot == (unsigned long) -1)
        return -1;
    if (!top)
        return -1;

    bucket = &top[slot];
    if (bucket->head != name)
    {
        /* interior or tail entry: bypass it in both directions */
        name->prior->next = name->next;
        if (bucket->tail == name)
            bucket->tail = name->prior;
        else
            name->next->prior = name->prior;
    }
    else
    {
        /* head entry: the successor (if any) becomes the new head */
        bucket->head = name->next;
        if (bucket->head)
            bucket->head->prior = NULL;
        else
            bucket->tail = NULL;
    }

    if (lobj)
        lobj--;
    return 0;
}
/*
 * free_hash - release both hash tables (learn_list_head and name_list_head).
 *
 * Every entry of every bucket is freed, then the bucket array itself, and
 * the corresponding global is reset to NULL. Tables that were never
 * allocated are skipped.
 */
void free_hash(void)
{
    hash_list **tables[2];
    int t, slot;

    tables[0] = &learn_list_head;
    tables[1] = &name_list_head;

    for (t = 0; t < 2; t++)
    {
        hash_list *table = *tables[t];

        if (!table)
            continue;

        for (slot = 0; slot < NAME_HASH_SIZE; slot++)
        {
            hash *node = table[slot].head;

            table[slot].head = table[slot].tail = 0;
            while (node)
            {
                hash *doomed = node;

                node = node->next;
                free((void *)doomed);
            }
        }
        free(table);
        *tables[t] = NULL;
    }
}
/*
 * init_hash_list - allocate the two zero-initialised hash tables.
 *
 * Sets the globals learn_list_head and name_list_head, each an array of
 * NAME_HASH_SIZE empty buckets.
 *
 * Returns learn_list_head on success. On allocation failure returns NULL
 * with both globals reset to NULL; a table that was already allocated is
 * released rather than leaked.
 */
hash_list *init_hash_list(void)
{
    learn_list_head = calloc(NAME_HASH_SIZE, sizeof *learn_list_head);
    name_list_head = calloc(NAME_HASH_SIZE, sizeof *name_list_head);

    if (!learn_list_head || !name_list_head)
    {
        free(learn_list_head);   /* free(NULL) is a no-op */
        free(name_list_head);
        learn_list_head = NULL;
        name_list_head = NULL;
        return NULL;
    }
    return learn_list_head;
}
/*
 * search_name_hash - look up a name in a hash table, ignoring case.
 *
 * top   hash table (array of NAME_HASH_SIZE buckets); must not be NULL
 * text  name to look for
 * len   number of bytes of text to hash and compare
 *
 * Returns the matching entry (same length, equal under strncasecmp) or
 * NULL when the bucket holds no such entry.
 */
hash *search_name_hash(hash_list *top, char *text, unsigned long len)
{
    hash *node;

    for (node = top[shash(text, len, NAME_HASH_SIZE)].head;
         node;
         node = node->next)
    {
        if (node->len != len)
            continue;
        if (strncasecmp(node->text, text, len) == 0)
            return node;
    }
    return NULL;
}
int learn(char *s, int len)
{
 register hash *name;
 name = search_name_hash(learn_list_head, s, len);
 if (name)
 return 1;
 name = malloc(sizeof(hash) + len + 2);
 if (!name)
 return 1;
 memset(name, 0, sizeof(hash) + len);
 name->text = (char *)((unsigned long)name + sizeof(hash));
 name->len = len;
 strncpy(name->text, s, len);
 if (add_to_hash(learn_list_head, name) == -1)
 {
 free(name);
 return 1;
 }
 lobj++;
 return 0;
}
int imagename(char *s, int len)
{
 register hash *name;
 name = search_name_hash(name_list_head, s, len);
 if (name)
 return 1;
 name = malloc(sizeof(hash) + len + 2);
 if (!name)
 return 1;
 memset(name, 0, sizeof(hash) + len);
 name->text = (char *)((unsigned long)name + sizeof(hash));
 name->len = len;
 strncpy(name->text, s, len);
 if (add_to_hash(name_list_head, name) == -1)
 {
 free(name);
 return 1;
 }
 lobj++;
 return 0;
}
/*
 * nprintf - write at most len characters of s to fp.
 *
 * Output stops early at the terminating NUL. Returns a pointer to the
 * first character that was not written; s itself is returned unchanged
 * when it is NULL or empty.
 */
unsigned char *nprintf(char *s, int len, FILE *fp)
{
    int written = 0;

    if (s == NULL || *s == '\0')
        return (unsigned char *)s;

    while (*s != '\0' && written < len)
    {
        putc(*s, fp);
        s++;
        written++;
    }
    return (unsigned char *)s;
}
/*
 * str8rchr - find the last occurrence in s of any of eight characters.
 *
 * The scan starts at the terminating NUL and walks back to s[0], so a
 * search character of 0 matches the terminator itself.
 *
 * Returns a pointer to the right-most matching character, or NULL when
 * none of c1..c8 occurs in s.
 */
unsigned char *str8rchr(const char * s, int c1, int c2, int c3, int c4,
                        int c5, int c6, int c7, int c8)
{
    const char wanted[8] = {
        (char)c1, (char)c2, (char)c3, (char)c4,
        (char)c5, (char)c6, (char)c7, (char)c8
    };
    const char *p = s + strlen(s);
    size_t k;

    for (;;)
    {
        for (k = 0; k < sizeof wanted; k++)
        {
            if (*p == wanted[k])
                return (unsigned char *)p;
        }
        /* stop at s[0]: stepping a pointer below the start of the string
           is undefined behaviour */
        if (p == s)
            break;
        --p;
    }
    return NULL;
}
/*
 * str5rchr - find the last occurrence in s of any of five characters.
 *
 * The scan starts at the terminating NUL and walks back to s[0], so a
 * search character of 0 matches the terminator itself.
 *
 * Returns a pointer to the right-most matching character, or NULL when
 * none of c1..c5 occurs in s.
 */
unsigned char *str5rchr(const char * s, int c1, int c2, int c3, int c4,
                        int c5)
{
    const char wanted[5] = {
        (char)c1, (char)c2, (char)c3, (char)c4, (char)c5
    };
    const char *p = s + strlen(s);
    size_t k;

    for (;;)
    {
        for (k = 0; k < sizeof wanted; k++)
        {
            if (*p == wanted[k])
                return (unsigned char *)p;
        }
        /* stop at s[0]: stepping a pointer below the start of the string
           is undefined behaviour */
        if (p == s)
            break;
        --p;
    }
    return NULL;
}
/*
 * strnstr - case-insensitive substring search.
 *
 * Returns a pointer to the first position in s1 at which s2 occurs when
 * compared with strncasecmp(), s1 itself when s2 is empty, or NULL when
 * s2 does not occur in s1.
 */
char *strnstr(const char * s1,const char * s2)
{
	int needle_len = strlen(s2);
	int remaining;

	if (needle_len == 0)
		return (char *) s1;

	/* slide over s1 for as long as the needle still fits in what is left */
	for (remaining = strlen(s1); remaining >= needle_len; remaining--, s1++) {
		if (strncasecmp(s1, s2, needle_len) == 0)
			return (char *) s1;
	}
	return NULL;
}
/*
 * File-name extensions that mark a string as an image/media name.
 *
 * strip_image_info() walks this table from the top and stops at the first
 * extension found anywhere in the candidate (case-insensitively, via
 * strnstr), so entries are grouped with the longest extensions first.
 * The numeric section comments give the extension length without the dot.
 */
unsigned char *imagetypes[]=
{
 // 7 characters
 ".svg+xml",
 ".xcf.bz2", 
 // 6 characters
 ".bitmap", 
 ".xcfbz2", 
 // 5 characters
 ".xcfgz", 
 ".alpha", 
 ".dicom", 
 ".matte", 
 ".xjtgz", 
 // 4 characters
 ".mask", 
 ".aifc",
 ".aiff",
 ".fits", 
 ".icon", 
 ".im24",
 ".im32", 
 ".jpeg", 
 ".midi", 
 ".mpeg", 
 ".xwav", 
 ".mpga", 
 ".tiff", 
 // 3 characters
 ".aif",
 ".als", 
 ".apm", 
 ".bmp", 
 ".bz2", 
 ".cel", 
 ".dcm", 
 ".eps", 
 ".fit", 
 ".flc", 
 ".fli", 
 ".gbr",
 ".gif", 
 ".gih", 
 ".gpb", 
 ".ico", 
 ".im1", 
 ".im8", 
 ".jpe", 
 ".jpg", 
 ".kar",
 ".mid",
 ".mov", 
 ".mp2",
 ".mp3", 
 ".mp4", 
 ".mpa",
 ".mpg", 
 ".ogg", 
 ".ogm", 
 ".pcc", 
 ".pcx", 
 ".pdf", 
 ".pdm", 
 ".pgm", 
 ".pix", 
 ".png", 
 ".pnm", 
 ".ppm",
 ".psd", 
 ".psp", 
 ".ras", 
 ".rgb",
 ".sgi", 
 ".svg", 
 ".swf",
 ".tga", 
 ".tif",
 ".tub", 
 ".wav", 
 ".wmf", 
 ".xbm", 
 ".xcf", 
 ".xjt", 
 ".xpm", 
 ".xwd", 
 ".pov", 
 ".wma", 
 ".dia", 
 ".fig", 
 ".jif", 
 ".pgn", 
 ".art", 
 // 2 characters
 ".bw", 
 ".ps", 
 ".g3", 
 ".js",
 ".rs", 
};
/*
 * shell_escape_name - copy src to dst with a backslash in front of every
 * character the generated shell scripts must not interpret (quotes,
 * back-tick, space, brackets, braces, parentheses, '&', '-' and ';').
 *
 * dst must have room for 2 * strlen(src) + 1 bytes.
 */
static void shell_escape_name(unsigned char *dst, const unsigned char *src)
{
    while (*src)
    {
        switch (*src)
        {
        case '\"': case '\'': case '`':
        case ' ':  case '(':  case ')':
        case '{':  case '}':  case '[':
        case ']':  case '&':  case '-':
        case ';':
            *dst++ = '\\';
            break;
        default:
            break;
        }
        *dst++ = *src++;
    }
    *dst = '\0';
}

/*
 * emit_copy_script - write the shell fragment used in tree (-t) mode: copy
 * the image from the $IMAGE tree to the $OUTPUT tree, trying the
 * underscore variant (dir2/name2) as well when cnvt is set.
 */
static void emit_copy_script(FILE *fp,
                             const unsigned char *dir1,
                             const unsigned char *name1, int cnvt,
                             const unsigned char *dir2,
                             const unsigned char *name2)
{
    fprintf(fp, "if [ -a $IMAGE./%s%s ]; then\n", dir1, name1);
    fprintf(fp, "\t/bin/mkdir -p $OUTPUT./%s\n", dir1);
    fprintf(fp, "\tcp -f $IMAGE./%s%s $OUTPUT./%s%s\n",
            dir1, name1, dir1, name1);
    fprintf(fp, "\techo ./%s%s copied to $OUTPUT./%s%s >> copied.log\n",
            dir1, name1, dir1, name1);
    if (cnvt)
    {
        fprintf(fp, "elif [ -a $IMAGE./%s%s ]; then\n", dir2, name2);
        fprintf(fp, "\t/bin/mkdir -p $OUTPUT./%s\n", dir2);
        fprintf(fp, "\tcp -f $IMAGE./%s%s $OUTPUT./%s%s\n",
                dir2, name2, dir2, name2);
        fprintf(fp, "\techo ./%s%s copied to $OUTPUT./%s%s >> copied.log\n",
                dir2, name2, dir2, name2);
    }
    fprintf(fp, "else\n");
    fprintf(fp, "\techo ./%s%s file not found >> failed.log\n", dir1, name1);
    fprintf(fp, "fi\n\n");
}

/*
 * emit_download_script - write the shell fragment used in download mode:
 * skip the image when it already exists, otherwise fetch it with curl from
 * $IMAGEPATH and then $COMMONSPATH, repeating both attempts with the
 * underscore variant (dir2/name2) when cnvt is set.
 */
static void emit_download_script(FILE *fp,
                                 const unsigned char *dir1,
                                 const unsigned char *name1, int cnvt,
                                 const unsigned char *dir2,
                                 const unsigned char *name2)
{
    fprintf(fp, "if [ -a $IMAGE./%s%s ]; then\n", dir1, name1);
    fprintf(fp, "\techo %s%s already exists >> exists.log\n", dir1, name1);
    if (cnvt)
    {
        fprintf(fp, "elif [ -a $IMAGE./%s%s ]; then\n", dir2, name2);
        fprintf(fp, "\techo %s%s already exists >> exists.log\n",
                dir2, name2);
    }
    fprintf(fp, "else\n");
    fprintf(fp, "\tcurl --retry 7 -f -O $IMAGEPATH./%s%s\n", dir1, name1);
    fprintf(fp, "\tif [ -a $IMAGE./%s ]; then\n", name1);
    fprintf(fp, "\t\t/bin/mkdir -p $OUTPUT./%s\n", dir1);
    fprintf(fp, "\t\t/bin/mv ./%s $OUTPUT./%s\n", name1, dir1);
    fprintf(fp, "\t\techo ./%s%s downloaded >> download.log\n", dir1, name1);
    fprintf(fp, "\telse\n");
    fprintf(fp, "\t\tcurl --retry 7 -f -O $COMMONSPATH./%s%s\n", dir1, name1);
    fprintf(fp, "\t\tif [ -a $IMAGE./%s ]; then\n", name1);
    fprintf(fp, "\t\t\t/bin/mkdir -p $OUTPUT./%s\n", dir1);
    fprintf(fp, "\t\t\t/bin/mv ./%s $OUTPUT./%s\n", name1, dir1);
    fprintf(fp, "\t\t\techo ./%s%s downloaded >> download.log\n",
            dir1, name1);
    fprintf(fp, "\t\telse\n");
    if (cnvt)
    {
        fprintf(fp, "\t\t\tcurl --retry 7 -f -O $IMAGEPATH./%s%s\n",
                dir2, name2);
        fprintf(fp, "\t\t\tif [ -a $IMAGE./%s ]; then\n", name2);
        fprintf(fp, "\t\t\t\t/bin/mkdir -p $OUTPUT./%s\n", dir2);
        fprintf(fp, "\t\t\t\t/bin/mv ./%s $OUTPUT./%s\n", name2, dir2);
        fprintf(fp, "\t\t\t\techo ./%s%s downloaded >> download.log\n",
                dir2, name2);
        fprintf(fp, "\t\t\telse\n");
        fprintf(fp, "\t\t\t\tcurl --retry 7 -f -O $COMMONSPATH./%s%s\n",
                dir2, name2);
        fprintf(fp, "\t\t\t\tif [ -a $IMAGE./%s ]; then\n", name2);
        fprintf(fp, "\t\t\t\t\t/bin/mkdir -p $OUTPUT./%s\n", dir2);
        fprintf(fp, "\t\t\t\t\t/bin/mv ./%s $OUTPUT./%s\n", name2, dir2);
        fprintf(fp, "\t\t\t\t\techo ./%s%s downloaded >> download.log\n",
                dir2, name2);
        fprintf(fp, "\t\t\t\telse\n");
        fprintf(fp, "\t\t\t\t\techo ./%s%s failed >> failed.log\n",
                dir1, name1);
        fprintf(fp, "\t\t\t\t\techo ./%s%s failed >> failed.log\n",
                dir2, name2);
        fprintf(fp, "\t\t\t\tfi\n");
        fprintf(fp, "\t\t\tfi\n");
    }
    else
    {
        fprintf(fp, "\t\t\techo ./%s%s failed >> failed.log\n", dir1, name1);
    }
    fprintf(fp, "\t\tfi\n");
    fprintf(fp, "\tfi\n");
    fprintf(fp, "fi\n\n");
}

/*
 * strip_image_info - extract one image name from wiki text and emit the
 * shell commands that fetch (or, in tree mode, copy) the image.
 *
 * s      text following an "image"/"map" keyword and its '=' or ':'
 * title  title of the page being scanned ("" when unknown); only used to
 *        prefix log lines
 *
 * The text is scanned up to the first '|', ']' or newline. Nested
 * "image..."/"map..." keywords are recorded with learn() and handled by a
 * recursive call. The text collected before the delimiter is cleaned of
 * XML escapes, cut off behind the first known extension from imagetypes[],
 * reduced to the part after the last ':', '/', '\\', '{' or newline, and
 * remembered with learn() so every name is emitted only once. Names
 * without a known extension are written to the reject log when they look
 * like "name.ext" with a 3 to 5 letter extension.
 *
 * Output goes to stdout, or in parallel mode to one of the 16 files in
 * fpl[], chosen by the high nibble of the first MD5 digest byte.
 *
 * Returns the position in the input at which the caller should resume.
 */
unsigned char *strip_image_info(unsigned char *s, char *title)
{
    unsigned char *p, *j;
    FILE *fp = stdout;
    unsigned char ch = '\0';          /* character preceding *s */

    while (*s && isspace(*s))
        s++;
    if (!strncasecmp((char *)s, "no image", 8))
        return s;

    p = s;
    while (*s)
    {
        /* a nested keyword such as "image2 =" or "map:" */
        if ((!strncasecmp((char *)s, "image", 5) ||
             !strncasecmp((char *)s, "map", 3)) && !isalnum(ch))
        {
            unsigned char *fragment = s, *end;

            if (!strncasecmp((char *)s, "image", 5))
                s += 5;
            else
                s += 3;
            if (*s)
            {
                while (*s && isalnum(*s))
                    s++;
                end = s;
                while (*s && isspace(*s))
                    s++;
                if (*s == '=' || *s == ':')
                {
                    size_t flen = (size_t)(end - fragment);

                    /* never copy more than the scratch buffer can hold */
                    if (flen >= sizeof(fwk))
                        flen = sizeof(fwk) - 1;
                    memset(fwk, 0, sizeof(fwk));
                    memmove(fwk, fragment, flen);
                    if (!learn((char *)fwk, (int)flen))
                    {
                        if (*title)
                            fprintf(fragmentlog, "[%s] %s\n", title, &fwk[0]);
                        else
                            fprintf(fragmentlog, "%s\n", &fwk[0]);
                        fflush(fragmentlog);
                    }
                    s++;
                    s = strip_image_info(s, title);
                    ch = '\0';
                }
            }
            continue;
        }

        /* end of the candidate name */
        if ((*s == '|') || (*s == ']') || (*s == '\n'))
        {
            size_t y;
            unsigned char *l;
            unsigned char dir1[32], dir2[32];
            unsigned char *lp, *blp;
            unsigned char *ulp, *bulp;
            /* leave room for the up to three bytes one pass can store */
            unsigned char *wk_limit = &wk[sizeof(wk) - 4];
            int cnvt = 0;
#ifdef UNICODE_EXPANSION
            int unicnvt = 0, invl = 0;
#endif

            dir2[0] = '\0';
            lp = &wk[0];
            j = lp;
            while (*p && (p < s) && (j < wk_limit))
            {
                // skip self referencing images
                if (!strncasecmp((char *)p, "{{", 2))
                    return s;
                /* strncmp stops at the terminator, unlike memcmp */
                if (!strncmp((char *)p, "&quot;", 6))
                {
                    p += 6;
                    *j++ = '\x22';
                }
                if (!strncmp((char *)p, "&amp;", 5))
                {
                    p += 5;
                    *j++ = '&';
                }
                /* drop an escaped tag: everything from &lt; through &gt; */
                if (!strncmp((char *)p, "&lt;", 4))
                {
                    p += 4;
                    while (*p)
                    {
                        if (!strncmp((char *)p, "&gt;", 4))
                        {
                            p += 4;
                            break;
                        }
                        p++;
                    }
                }
                if (*p == '\n')
                    p++;
                if (!strncmp((char *)p, "[[", 2))
                    break;
                *j++ = *p++;
            }
            *j = '\0';
            s++;

            /* cut the name off behind the first known extension */
            for (j = NULL, y = 0;
                 y < (sizeof(imagetypes) / sizeof(imagetypes[0])); y++)
            {
                j = (unsigned char *)strnstr((char *)lp,
                                             (char *)imagetypes[y]);
                if (j)
                {
                    j += strlen((char *)imagetypes[y]);
                    *j = '\0';
                    break;
                }
            }

            if (!j)
            {
                /* no known extension: log what still looks like a file */
                if (*lp && isalpha(*lp))
                {
                    unsigned char *dot =
                        (unsigned char *)strchr((char *)lp, '.');
                    unsigned char *base = lp;

                    if (dot)
                    {
                        unsigned char *ext = dot + 1, *ext_start, *sep;

                        if ((*ext != ' ') && isalpha(*ext))
                        {
                            sep = str8rchr((char *)base, ':', '/', '\\', '{',
                                           '\n', '&', '=', '>');
                            if (sep)
                                base = sep + 1;
                            ext_start = ext;
                            while (*ext)
                            {
                                if (!isalpha(*ext))
                                {
                                    *ext = '\0';
                                    break;
                                }
                                ext++;
                            }
                            if (*base &&
                                ((ext - ext_start) >= 3) &&
                                ((ext - ext_start) <= 5))
                            {
                                if (*title)
                                    fprintf(imagereject, "[%s] %s\n",
                                            title, base);
                                else
                                    fprintf(imagereject, "%s\n", base);
                                fflush(imagereject);
                            }
                        }
                    }
                }
                return s;
            }

            /* keep only what follows the last namespace/path separator */
            j = str5rchr((char *)lp, ':', '/', '\\', '{', '\n');
            if (j)
                lp = ++j;

            if (!*lp)
                return s;

            /* the name is copied into ulwk/expand and escaped (at most
               doubling its length) into final1/final2, so reject anything
               that cannot fit instead of overrunning those buffers */
            if (strlen((char *)lp) >= sizeof(final1) / 2)
                return s;

#ifdef UNICODE_EXPANSION
            // filename string extracted. convert xml control character tags
            // NOTE(review): the entity literals below were restored from a
            // copy of this file in which they had been decoded; the named
            // forms are assumed - confirm against the original source.
            l = &expand[0];
            ulp = lp;
            while (*ulp)
            {
                if (!strncasecmp((char *)ulp, "&amp;", 5))
                {
                    ulp += 5;
                    *l++ = '&';
                    continue;
                }
                if (!strncasecmp((char *)ulp, "&lt;", 4))
                {
                    ulp += 4;
                    *l++ = '<';
                    continue;
                }
                if (!strncasecmp((char *)ulp, "&gt;", 4))
                {
                    ulp += 4;
                    *l++ = '>';
                    continue;
                }
                if (!strncasecmp((char *)ulp, "&quot;", 6))
                {
                    ulp += 6;
                    *l++ = '\"';
                    continue;
                }
                if (!strncasecmp((char *)ulp, "&apos;", 6))
                {
                    ulp += 6;
                    *l++ = '\'';
                    continue;
                }
                if (!strncasecmp((char *)ulp, "&nbsp;", 6))
                {
                    ulp += 6;
                    *l++ = ' ';
                    continue;
                }
                /* en dash, in either spelling, becomes a plain hyphen; the
                   whole entity including its ';' is consumed */
                if (!strncasecmp((char *)ulp, "&ndash;", 7))
                {
                    ulp += 7;
                    *l++ = '-';
                    continue;
                }
                if (!strncasecmp((char *)ulp, "&#150;", 6))
                {
                    ulp += 6;
                    *l++ = '-';
                    continue;
                }
                /* any other character reference of the form &...; */
                if ((ulp[0] == '&') && (ulp[1] != '&'))
                {
                    unsigned char *sc =
                        (unsigned char *)strchr((char *)ulp, ';'), *slp;
                    unsigned char unicode[32];
                    unsigned short uni;

                    if (sc)
                    {
                        slp = ulp;
                        slp++;
                        while (*slp != ';')
                        {
                            if ((*slp == '#') || (*slp == '-') ||
                                (*slp == 'x') || (*slp == 'X') ||
                                isxdigit(*slp))
                                slp++;
                            else
                            {
                                invl = 1;
                                break;
                            }
                        }
                        if (!invl)
                        {
                            int unilen = sc - ulp;
                            int slen = sc - ulp;

                            slp = ulp;
                            slp++;
                            unilen--;
                            if (*slp == '#')
                            {
                                unilen--;
                                slp++;
                            }
                            if (unilen < 31)
                            {
                                memset(unicode, 0, 32);
                                strncpy((char *)unicode, (char *)slp, unilen);
                                /* NOTE(review): atoi() reads decimal only,
                                   so a hexadecimal reference yields 0 */
                                uni = atoi((char *)unicode);
                                fprintf(imagelog, "UNI1: %s (#%d) %s \n",
                                        unicode, (int)uni, lp);
                                unicode[0] = '\0';
                                sprintf((char *)unicode, "\\u%04X", uni);
                                /* u8_unescape() is not defined in this
                                   file; presumably it returns the number
                                   of bytes stored - TODO confirm */
                                unilen = u8_unescape(l, 32, unicode);

                                fprintf(imagelog,
                                        "UNI2: %s unilen %d slen %d\n",
                                        unicode, (int)unilen, (int)slen);
                                ulp += slen;
                                l += unilen;
                                ulp++;
                                unicnvt = 1;
                                continue;
                            }
                        }
                    }
                }
                *l++ = *ulp++;
            }
            *l = '\0';
            lp = &expand[0];
#endif
            // convert spaces to underline characters in image names
            ulp = &ulwk[0];
            memmove(ulp, lp, strlen((char *)lp) + 1);
            ulp[0] = toupper(ulp[0]);
            for (l = ulp; *l; l++)
            {
                if (*l == ' ')
                {
                    *l = '_';
                    cnvt = 1;
                }
            }

            /* emit every name (and every underscore variant) only once */
            if (learn((char *)lp, strlen((char *)lp)))
                return s;
            if (cnvt && learn((char *)ulp, strlen((char *)ulp)))
                return s;

            memset(md5_out, 0, 16);
            lp[0] = toupper(lp[0]);
#ifdef UNICODE_EXPANSION
            if (unicnvt || invl)
            {
                if (invl)
                    fprintf(imagelog, "INVL: %s -> %s\n", wk, lp);
                else
                    fprintf(imagelog, "%s -> %s\n", wk, lp);
                fflush(imagelog);
                if (invl)
                    return s;
            }
            else
                return s;
#else
            fprintf(imagelog, "%s\n", lp);
            fflush(imagelog);
#endif
            /* the directory is built from the first MD5 byte: "h/hh/" */
            MD5(lp, strlen((char *)lp), md5_out);
            dir1[0] = '\0';
            sprintf((char *)dir1, "%x/%02x/", (md5_out[0] >> 4), md5_out[0]);
            if (cnvt)
            {
                memset(md5_ulout, 0, 16);
                ulp[0] = toupper(ulp[0]);
                MD5(ulp, strlen((char *)ulp), md5_ulout);
                sprintf((char *)dir2, "%x/%02x/",
                        (md5_ulout[0] >> 4), md5_ulout[0]);
            }

            // put a backslash in front of bash control chars
            shell_escape_name(final1, lp);
            blp = &final1[0];
            shell_escape_name(final2, ulp);
            bulp = &final2[0];

            if (pmode)
                fp = fpl[(md5_out[0] >> 4) % 16];
            if (tree)
                emit_copy_script(fp, dir1, blp, cnvt, dir2, bulp);
            else
                emit_download_script(fp, dir1, blp, cnvt, dir2, bulp);
            return s;
        }
        ch = *s;
        s++;
    }
    return s;
}
/*
 * write_master_preamble - write the header of the top-level script: the
 * four path variables followed by creation of the thumb/temp/tmp
 * directories below $OUTPUT.
 */
static void write_master_preamble(FILE *out)
{
    fprintf(out, "#!/bin/sh\n\n");
    fprintf(out, "IMAGE=%s\n", ImagePath);
    fprintf(out, "OUTPUT=%s\n", OutputPath);
    fprintf(out, "IMAGEPATH=%s\n", iPath);
    fprintf(out, "COMMONSPATH=%s\n\n", cPath);
    fprintf(out, "/bin/mkdir -p $OUTPUT./thumb\n");
    fprintf(out, "/bin/chmod 777 $OUTPUT./thumb\n");
    fprintf(out, "/bin/mkdir -p $OUTPUT./temp\n");
    fprintf(out, "/bin/chmod 777 $OUTPUT./temp\n");
    fprintf(out, "/bin/mkdir -p $OUTPUT./tmp\n");
    fprintf(out, "/bin/chmod 777 $OUTPUT./tmp\n\n");
}

/*
 * capture_title - copy the text between start and the next "</title>"
 * into title. title is left untouched when no closing tag follows or the
 * title is empty.
 */
static void capture_title(char *title, const char *start)
{
    const char *stop = strstr(start, "</title>");

    if (stop && stop != start)
    {
        size_t n = (size_t)(stop - start);

        memcpy(title, start, n);
        title[n] = '\0';
    }
}

/*
 * main - read a MediaWiki XML dump on stdin and write download scripts.
 *
 * Options: -h help, -t tree (copy) mode, -r wikipedia path, -c commons
 * path, -i image path, -o output path, -p parallel mode. Without -p a
 * single script is written to stdout; with -p the master script
 * "image_sh" and the worker scripts "image00".."image15" are created.
 * The logs image.log, reject.log and fragment.log are always written.
 *
 * Every input line is scanned for "image..."/"map..." keywords followed by
 * '=' or ':'; the text behind such a keyword is handed to
 * strip_image_info().
 *
 * Returns 0 on success; exits with status 1 on usage, allocation or
 * file-creation errors.
 */
int main(int argc, char *argv[])
{
    int i, r, inpage = 0;
    unsigned char *s;
    char fname[32], *line, *title, *title_p;
    FILE *fl;

    ImagePath[0] = '\0';
    OutputPath[0] = '\0';
    // http://upload.wikimedia.org/wikipedia/en/
    // http://upload.wikimedia.org/wikipedia/commons/
    strcpy((char *)iPath, "http://upload.wikimedia.org/wikipedia/en/");
    strcpy((char *)cPath, "http://upload.wikimedia.org/wikipedia/commons/");

    /* argv[0] is the program name, so option parsing starts at 1 */
    for (i = 1; i < argc; i++)
    {
        const char *arg = argv[i];

        if (!strncmp(arg, "-h", 2))
        {
            printf("USAGE: wikix -htrciop < file.xml [ > script.out ]\n");
            printf(" -h this help screen\n");
            printf(" -t use xml dump to strip from tree\n");
            printf(" -r wikipedia path\n");
            printf(" -c commons path\n");
            printf(" -i image path\n");
            printf(" -o output path\n");
            printf(" -p parallel (16 process) mode\n");
            exit(1);
        }
        else if (!strncmp(arg, "-t", 2))
        {
            tree = 1;
        }
        /* options taking a value: a missing value is ignored instead of
           dereferencing the NULL that terminates argv */
        else if (!strncmp(arg, "-r", 2))
        {
            if (i + 1 < argc)
                snprintf((char *)iPath, sizeof(iPath), "%s", argv[++i]);
        }
        else if (!strncmp(arg, "-c", 2))
        {
            if (i + 1 < argc)
                snprintf((char *)cPath, sizeof(cPath), "%s", argv[++i]);
        }
        else if (!strncmp(arg, "-i", 2))
        {
            if (i + 1 < argc)
                snprintf((char *)ImagePath, sizeof(ImagePath), "%s",
                         argv[++i]);
        }
        else if (!strncmp(arg, "-o", 2))
        {
            if (i + 1 < argc)
                snprintf((char *)OutputPath, sizeof(OutputPath), "%s",
                         argv[++i]);
        }
        //parallel thread mode (16 processes)
        else if (!strncmp(arg, "-p", 2))
        {
            pmode = 1;
        }
    }

    memset(&fwk[0], 0xFF, 256);
    if (!init_hash_list())
    {
        printf("wikix: could not allocate workspace\n");
        exit(1);
    }
    line = malloc(0x10000);
    if (!line)
    {
        printf("gfdl-wikititle: could not allocate buffer workspace\n");
        exit(1);
    }
    line[0] = '\0';
    title = malloc(0x10000);
    if (!title)
    {
        printf("gfdl-wikititle: could not allocate namespace\n");
        exit(1);
    }
    title[0] = '\0';

    if (!pmode)
    {
        write_master_preamble(stdout);
    }
    else
    {
        fl = fopen("image_sh", "w");
        if (!fl)
        {
            printf("FILE error could not create image_sh\n");
            exit(1);
        }
        chmod("image_sh", 0755);
        write_master_preamble(fl);
        for (r = 0; r < 16; r++)
        {
            snprintf(fname, sizeof(fname), "image%02d", r);
            fpl[r] = fopen(fname, "w");
            if (!fpl[r])
            {
                printf("FILE error could not create [%s]\n", fname);
                exit(1);
            }
            chmod(fname, 0755);
            fprintf(fpl[r], "#!/bin/sh\n\n");
            fprintf(fpl[r], "\nIMAGE=%s\n", ImagePath);
            fprintf(fpl[r], "OUTPUT=%s\n", OutputPath);
            fprintf(fpl[r], "IMAGEPATH=%s\n", iPath);
            fprintf(fpl[r], "COMMONSPATH=%s\n\n", cPath);
            /* the master script starts every worker in the background */
            fprintf(fl, "./%s >& imagelog.%02d &\n", fname, r);
        }
        fclose(fl);
    }

    /* the scanner writes to all three logs unconditionally, so a log that
       cannot be created is fatal */
    imagelog = fopen("image.log", "wb");
    if (!imagelog)
    {
        printf("FILE error could not create image log\n");
        exit(1);
    }
    imagereject = fopen("reject.log", "wb");
    if (!imagereject)
    {
        printf("FILE error could not create reject log\n");
        exit(1);
    }
    fragmentlog = fopen("fragment.log", "wb");
    if (!fragmentlog)
    {
        printf("FILE error could not create image name fragment log\n");
        exit(1);
    }

    while ((s = (unsigned char *)fgets(line, 8192 * 4, stdin)) != NULL)
    {
        unsigned char ch = '\0';      /* character preceding *s */

        if (strstr((char *)s, "<page>"))
        {
            inpage++;
            *title = '\0';
            continue;
        }
        if (strstr((char *)s, "</page>"))
        {
            if (inpage)
                inpage--;
            *title = '\0';
            continue;
        }
        title_p = strstr((char *)s, "<title>");
        if (inpage && title_p)
            capture_title(title, title_p + 7);

        while (*s)
        {
            if (inpage && !strncasecmp((char *)s, "<title>", 7))
            {
                s += 7;
                capture_title(title, (char *)s);
            }
            if ((!strncasecmp((char *)s, "image", 5) ||
                 !strncasecmp((char *)s, "map", 3)) &&
                !isalnum(ch))
            {
                unsigned char *fragment = s, *end;

                if (!strncasecmp((char *)s, "image", 5))
                    s += 5;
                else
                    s += 3;
                if (*s)
                {
                    while (*s && isalnum(*s))
                        s++;
                    end = s;
                    while (*s && isspace(*s))
                        s++;
                    if (*s == '=' || *s == ':')
                    {
                        size_t flen = (size_t)(end - fragment);

                        /* never copy more than the scratch buffer holds */
                        if (flen >= sizeof(fwk))
                            flen = sizeof(fwk) - 1;
                        memset(fwk, 0, sizeof(fwk));
                        memmove(fwk, fragment, flen);
                        if (!imagename((char *)fwk, (int)flen))
                        {
                            if (*title)
                                fprintf(fragmentlog, "[%s] %s\n",
                                        title, &fwk[0]);
                            else
                                fprintf(fragmentlog, "%s\n", &fwk[0]);
                            fflush(fragmentlog);
                        }
                        s++;
                        s = strip_image_info(s, title);
                        ch = '\0';
                    }
                }
                continue;
            }
            ch = *s;
            s++;
        }
    }

    if (pmode)
    {
        for (r = 0; r < 16; r++)
        {
            /* close the streams that are open */
            if (fpl[r])
                fclose(fpl[r]);
            fpl[r] = NULL;
        }
    }
    fclose(fragmentlog);
    fclose(imagelog);
    fclose(imagereject);
    free(title);
    free(line);
    free_hash();
    return 0;
}


AltStyle によって変換されたページ (->オリジナル) /