Uporabnik:Karolina~slwiki/Wikipedija:Osebni podatki/transform.pl: Razlika med redakcijama
Izbrisana vsebina Dodana vsebina
Nova stran: #!/usr/bin/perl -w ########################## # Transform Persondata ########################## # Parse date entry and expand each date field to 8 fields # Likewise the articles a... |
(ni razlike)
|
Redakcija: 21:23, 2. oktober 2009
- !/usr/bin/perl -w
########################## # Transform Persondata ########################## # Parse date entry and expand each date field to 8 fields # Likewise the articles about places and first/last names ########################## my %months=( "January" => 1, "February" => 2, "March" => 3, "April" => 4, "May" => 5, "June" => 6, "July" => 7, "August" => 8, "September" => 9, "October" => 10, "November" => 11, "December" => 12 ); my %monthname= (); foreach $month (keys %months) { # inverse $monthname{$months{$month}} = $month; } my $MONTH = "(" . join("|",keys %months) . ")"; my $YEAR = "(\\d{1,4})"; my $PREFIX = "([^0-9]+)"; # Clean up date entry for standardised persondata date format sub clean_date { my $d = shift; # Put spaces between wikilinks $d =~ s/([\]])([\[])/$1 $2/g; # Deal with piped links $d =~ s/[\[]{2}([^\]]*)[|](.*)[\]]{2}/$2/g; # Remove square brackets $d =~ s/[\[\]]//g; # Deal with templates $d =~ s/{{([Bb]irth|[Dd]eath) date( and age)?\s?\|\s?(\d{1,4})\|\s?(\d{1,2})\|\s?(\d{1,2})(.*)/$3-$4-$5/g; # DD.MM.YYYY => DD MMM YYYY if ($d =~ /^0?(\d{1,2})\.(\d{1,2})\.(\d{1,4})$/) { $month = int $2; $d = "$1 ".$monthname{$month}." $3"; } # YYYY-MM-DD => DD MMM YYYY if ($d =~ /^(\d{1,4})-(\d{1,2})-0?(\d{1,2})$/) { $month = int $2; $d = "$3 ".$monthname{$month}." $1"; } # AD/CE is implicit $d =~ s/A\. ?D\.//; # TODO: doesn't completely work?? e.g. Libanios $d =~ s/A ?D//; $d =~ s/\bC\. ?E\.//; $d =~ s/\bC ?E//; # Remove trailing punctuation $d =~ s/(,|\||;|=)$//; # Remove HTML comments $d =~ s///; # Add forgotten spaces $d =~ s/([a-z])(\d)/$1 $2/g; $d =~ s/(\d)([a-z])/$1 $2/g; $d =~ s/\.([^ ])/. $1/g; #Remove spaces from ordinals $d =~ s/(\d) (th|st|nd|rd|s\b)/$1$2/g; # remove double spaces $d =~ s/\s+/ /g; # remove spaces at beginning and end $d =~ s/^\s+//; $d =~ s/\s+$//; # Fields with only question marks => "unknown" $d =~ s/^\?$/unknown/; # Uniform capitalisation $d =~ s/Unknown/unknown/g; $d =~ s/After/after/g; # question marks should always have brackets and a space before them $d =~ s/\(\?\)/\?/g; $d =~ s/([^ ])\?+/$1 \?/g; $d =~ s/\?/(\?)/g; # Instead of a question mark at the end, 'probably' at the beginning $d =~ s/^(.+) \(\?\)$/probably $1/; $d =~ s/^\(\?\)$/probably/; # Slash without space $d =~ s/ ?\/ ?/\//g; # Remove bolding/italics $d =~ s/\'{2,5}//g; # Shortened month names $d =~ s/Jan[\.\s]+/January /; $d =~ s/Feb[\.\s]+/February /; $d =~ s/Mar[\.\s]+/March /; $d =~ s/Apr[\.\s]+/April /; $d =~ s/Jun[\.\s]+/June /; $d =~ s/Jul[\.\s]+/July /; $d =~ s/Aug[\.\s]+/August /; $d =~ s/Sep[\.\s]+/September /; $d =~ s/Sept[\.\s]+/September /; $d =~ s/Oct[\.\s]+/October /; $d =~ s/Nov[\.\s]+/November /; $d =~ s/Dec[\.\s]+/December /; # Write out "Century" $d =~ s/C\./century/g; # "End of the 5th century" => "End 5th century" (simpler) $d =~ s/ of the//; $d =~ s/(approx\.?|[Cc]irca\.?\s?|\bca?[\.\s]+|about|around|~)/circa /; $d =~ s/([Pp]ossibly|[Pp]robably)/probably/g; $d =~ s/([Ss]till )?[Ll]iving( [Pp]erson)?//; $d =~ s/[Nn]\/?[Aa]//; $d =~ s/[Nn]ot [Aa]pplicable//; # Remove double spaces $d =~ s/\s+/ /g; #----------Error fixing--------- # Other typing errors #$d =~ s/^Um/um/; #$d =~ s/chr/Chr/; return $d; } sub ilog10 { $x=shift; return int log($x) / log(10); } my @fields = ( "day", "month", "year", "century", "decade", "year1", "year2", "note" ); sub parse_date { $date = shift; local %d = (); if (! $date ) { $d{"note"} = ""; # Normal date entry } elsif ($date =~ /^$PREFIX?(\d+ )?$MONTH,? $YEAR( BCE?\.?)?$/) { $d{"note"} = $1 if defined $1; $d{"day"} = substr($2,0,-1) if defined $2; if (defined $3) { $d{"month"} = $months{$3}; } $d{"year"} = int $4; $d{"year"} = -$d{"year"} if defined $5; # B.C. } elsif ($date =~ /^$PREFIX?$MONTH (\d+)?,?\s?$YEAR( BCE?\.?)?$/) { $d{"note"} = $1 if defined $1; $d{"day"} = substr($3,0) if defined $3; if (defined $2) { $d{"month"} = $months{$2}; } $d{"year"} = int $4; $d{"year"} = -$d{"year"} if defined $5; # B.C. } elsif ($date =~ /^$PREFIX?$YEAR( BCE?\.?)?$/) { $d{"note"} = trim($1) if defined $1; $d{"year"} = int $2; $d{"year"} = -$d{"year"} if defined $3; # B.C. # Century #} elsif ($date =~ /^(around |probably |[Bb]eginning |[Mm]iddle |[Ee]nd )?(\d{1,2})\. [Cc]entury( BCE?\.?)?$/) { } elsif ($date =~ /^$PREFIX?(\d{1,2})(\.|th|st|nd|rd) [Cc]entury( BCE?\.?)?$/) { $d{"note"} = trim($1) if defined $1; $d{"century"} = $2; $d{"century"} = -$d{"century"} if defined $4; # Decade } elsif ($date =~ /^(\d{1,4}) ?'?s( BCE?\.?)?$/) { $d{"decade"} = (int $1 / 10) * 10; $d{"decade"} = -$d{"decade"} if defined $3; $d{"century"} = int ($d{"decade"} / 10) + 1; } elsif ($date =~ /^$PREFIX?$YEAR or $YEAR$/) { # TODO: or != until, TODO: B.C. $d{"note"} = trim($1) if defined $1; $d{"year1"} = int $2; $d{"year2"} = int $3; } elsif ($date =~ /^$PREFIX?$YEAR\/(\d{1,4})$/) { # TODO: B.C. $d{"note"} = trim($1) if defined $1; $d{"year1"} = int $2; $d{"year2"} = int $3; # Example: 1632/33 => 1632/1633 $c = ilog10($d{"year1"}) - ilog10($d{"year2"}); if ( $c < 0 ) { # Errr $d{"year1"} = ""; $d{"year2"} = ""; $d{"note"} = $date; } else { $d{"year2"} = substr($d{"year1"},0,$c) . $d{"year2"}; } # Interval of several years # TODO: not tested! } elsif ( $date =~ /^$PREFIX?between $YEAR( and )$YEAR( BCE?\.?)?$/ or $date =~ /^$PREFIX?$YEAR( to |-)$YEAR$( BCE?\.?)?/ ) { $d{"note"} = trim($1) if defined $1; $d{"year1"} = int $2; $d{"year2"} = int $4; if (defined $5) { $tmp = $d{"year1"}; $d{"year1"} = -$d{"year2"}; $d{"year2"} = -$tmp; } #} elsif ($date =~ /^$PREFIX?$YEAR( to |-)$YEAR$/) { # $d{"note"} = trim($1) if defined $1; # $d{"year1"} = int $2; # $d{"year2"} = int $4; } else { $d{"note"} = $date; } if (defined $d{"year1"} and defined $d{"year2"}) { # if years need to be switched #if ($d{"year1"} > $d{"year2"}) { # ($d{"year1"}, $d{"year2"}) = ($d{"year2"}, $d{"year1"}); #} # if century can be defined if ( (int $d{"year1"} / 100) eq (int $d{"year2"} / 100)) { $d{"century"} = (int $d{"year1"} / 100) + 1; } # if decade can be defined if ( $d{"century"} and (int $d{"year1"} / 10) eq (int $d{"year2"} / 10)) { $d{"decade"} = (int $d{"year1"} / 10) * 10; } } # calculate decade and century if (defined $d{"year"}) { $d{"decade"} = (int $d{"year"} / 10) * 10; } # calculate century if (defined $d{"decade"}) { if( $d{"decade"} != 0 ){ $d{"century"} = (int $d{"decade"} / 100) + (int $d{"decade"} / abs $d{"decade"}); } else{ if (defined $d{"year"}) { $d{"century"} = (int $d{"decade"} / 100) + (int $d{"year"} / abs $d{"year"}); } else{ } } } # Initialise undefined fields foreach $f (@fields) { $d{$f} = if not defined $d{$f}; } return %d; } sub parse_location { $p = trim(shift); # a or b <- so what is it?! # ... # ..., ... # ... , ... # .../ ... <- prefereably not like this if ($p =~ /^\[\[([^\]]+)\]\].*?$/) { $a = $1; $a =~ s/\|.*$//; #print "$p|$a\n"; return $a; } else { #print "!$p\n"; return ""; } } #For PND number (assigned to German-speaking authors). #Small number of articles on en wiki have this sub parse_pnd { $p = trim(shift); $pnd_nr = ""; $pnd_date = ""; if ($p =~ /^([0-9])([0-9])([0-9])([0-9])([0-9])([0-9])([0-9])([0-9])([0-9X])/) { $check = $9; $check = 10 if($check eq "X"); # pruefziffer if ( ((2*$1+3*$2+4*$3+5*$4+6*$5+7*$6+8*$7+9*$8) % 11) eq $check ) { # TODO: Number range 10000000 bis 14999999 if ($1 == "1" && $2>=0 && $2<=4) { $pnd_nr = "$1$2$3$4$5$6$7$8$9"; } } } if ($p =~ /(\d){1,2}\.(\d{1,2})\.(\d\d\d\d)/) { $pnd_date = "$3-$2-$1"; } return ($pnd_nr, $pnd_date); } sub unbracket { $p = shift; # Insert missing spaces between wikilinks $p =~ s/([\]])([\[])/$1 $2/g; # remove square brackets $p =~ s/[\[\]]//g; return $p; } sub trim { $p = shift; if ($p) { # remove empty spaces at beginning and end $p =~ s/^\s+//; $p =~ s/\s+$//; } return $p; } ########################## my @pd = (); while(<>) { $line = $_; @pd = split("\t",$line); chop($pd[-1]); # remove end-of-line $pd_transformed[0] = trim($pd[0]); # pd_id $pd_transformed[1] = trim($pd[1]); # pd_article $pd_transformed[2] = trim($pd[2]); # pd_name $pd_transformed[3] = trim($pd[3]); # pd_alternative $pd_transformed[4] = trim($pd[4]); # pd_description $pd_transformed[5] = unbracket(trim($pd[5])); # pd_born $pd_transformed[6] = trim($pd[6]); # pd_born_in $pd_transformed[7] = unbracket(trim($pd[7])); # pd_died $pd_transformed[8] = trim($pd[8]); # pd_died_in $pd_transformed[9] = trim($pd[9]); # pd_pnd # extract checked pnd-nr and additional date ($pd_transformed[10], $pd_transformed[11]) = parse_pnd($pd[9]); # pnr_nr, pnd_date if ( trim($pd[2]) =~ /^([^,]+),([^,]+)$/ ) { $pd_transformed[12] = trim($2); # n_given $pd_transformed[13] = trim($1); # n_surname $pd_transformed[14] = ; # n_suffix } elsif ( trim($pd[2]) =~ /^([^,]+),([^,]+),([^,]+)$/ ) { $pd_transformed[12] = trim($2); # n_given $pd_transformed[13] = trim($1); # n_surname $pd_transformed[14] = trim($3); # n_suffix } else { $pd_transformed[12] = ; # n_given $pd_transformed[13] = ; # n_surname $pd_transformed[14] = ; # n_suffix } $pd_transformed[15] = parse_location($pd[6]); # b_place $pd_transformed[16] = parse_location($pd[8]); # d_place %born = parse_date(clean_date($pd[5])); $pd_transformed[17] = $born{"day"}; # b_day $pd_transformed[18] = $born{"month"}; # b_month $pd_transformed[19] = $born{"year"}; # b_year $pd_transformed[20] = $born{"decade"}; # b_decade $pd_transformed[21] = $born{"century"}; # b_century $pd_transformed[22] = $born{"year1"}; # b_year1 $pd_transformed[23] = $born{"year2"}; # b_year2 $pd_transformed[24] = $born{"note"}; # b_note %died = parse_date(clean_date($pd[7])); $pd_transformed[25] = $died{"day"}; # d_day $pd_transformed[26] = $died{"month"}; # d_month $pd_transformed[27] = $died{"year"}; # d_year $pd_transformed[28] = $died{"decade"}; # d_decade $pd_transformed[29] = $died{"century"}; # d_century $pd_transformed[30] = $died{"year1"}; # d_year1 $pd_transformed[31] = $died{"year2"}; # d_year2 $pd_transformed[32] = $died{"note"}; # d_note print join("\t",@pd_transformed) . "\n"; }