Split-Wikipedia.ps1
<#PSScriptInfo
.VERSION 1.6
.GUID 6c8ec05e-4d42-465b-9a30-2bbdcec289d3
.AUTHOR Lee Holmes
#>

<#

.DESCRIPTION
Splits a Wikipedia XML database dump into text-only articles.
Articles are placed in an "Articles" directory, then again split into
subdirectories with 5,000 articles each.

.EXAMPLE
PS > Invoke-WebRequest https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 -Outfile enwiki-latest-pages-articles.xml.bz2
PS > bzip2 -d enwiki-latest-pages-articles.xml.bz2
PS > Split-Wikipedia enwiki-latest-pages-articles.xml

.NOTES
Processing of Wikipedia's 60GB XML will take about 7 hours.

#>

param(
    [CmdletBinding()]
    $Path
)

## Converts arbitrary text (such as an article title) into a file name that
## does not yet exist in a directory.
##
##   -BasePath   Directory in which the name must be unique (default ".").
##   -Text       Text to derive the file name from.
##   -Extension  Extension to append, including the dot (default ".txt").
##
## Emits the file name only (not a full path). Nothing is created on disk.
function GetSafeFilename
{
    param(
        $BasePath = ".",
        $Text,
        $Extension = ".txt"
    )

    ## Remove invalid filesystem characters
    $invalidChars = [IO.Path]::GetInvalidFileNameChars()
    $invalidCharsRegex = "[" + (-join ($invalidChars | % { [Regex]::Escape($_) })) + "]"
    $baseFilename = $Text -replace $invalidCharsRegex,'_'

    ## Avoid reserved device names. The comparison ignores surrounding
    ## whitespace, since the final name is trimmed before it is emitted.
    $reservedDeviceNames = -split "CON PRN AUX NUL COM1 COM2 COM3 COM4 COM5 COM6 COM7 COM8 COM9 LPT1 LPT2 LPT3 LPT4 LPT5 LPT6 LPT7 LPT8 LPT9"
    if($baseFilename.Trim() -in $reservedDeviceNames)
    {
        $baseFilename = "_" + $baseFilename
    }

    ## Avoid path length issues
    $baseFilename = $baseFilename.Substring(0, [Math]::Min(50, $baseFilename.Length))

    ## Avoid existing files. Each candidate is trimmed *before* it is tested
    ## so that the name we check is exactly the name we emit, and it is
    ## tested as a literal path so that characters such as '[' and ']' are
    ## not interpreted as wildcards.
    $counter = 1
    $fileName = ($baseFilename + $Extension).Trim()
    while(Test-Path -LiteralPath (Join-Path $BasePath $fileName))
    {
        $fileName = ($baseFilename + "_${counter}${Extension}").Trim()
        $counter++
    }

    # Emit the result
    $fileName
}

## Root output directory. -Force keeps a re-run from failing when the
## directory already exists.
$null = New-Item -Type Directory articles -Force

## State shared with the page-processing loop
$basePath = ""
$articleCounter = 1
$currentTitle = ''
$currentArticle = New-Object System.Text.StringBuilder
$capturing = $false
$capturingTitle = $false

## Taken from enwiki-20160601, which had an average article size of 3456.7 bytes.
## Rough number of articles in the dump (file size / average article size),
## used only for progress reporting.
$estimatedArticleCount = (Get-Item $Path).Length / 3456.79917342699

## ProviderPath gives a plain filesystem path even when the resolved path is
## provider-qualified (for example, a UNC path).
$xmlReader = [System.Xml.XmlReader]::Create( (Resolve-Path $Path).ProviderPath )

try
{
    while($xmlReader.Read())
    {
        switch ($xmlReader.NodeType)
        {
            'Element'
            {
                ## Self-closing elements (<text ... />) produce no EndElement
                ## node, so they must not switch capturing on; otherwise the
                ## flag stays set and unrelated text nodes get captured.
                if(($xmlReader.Name -eq 'Title') -and (-not $xmlReader.IsEmptyElement))
                {
                    $capturingTitle = $true
                }
                elseif(($xmlReader.Name -eq 'Text') -and (-not $xmlReader.IsEmptyElement))
                {
                    $capturing = $true
                }
            }

            'Text'
            {
                if($capturingTitle)
                {
                    $currentTitle = $xmlReader.Value
                    $capturingTitle = $false
                }
                elseif($capturing)
                {
                    $null = $currentArticle.Append($xmlReader.Value)
                }
            }

            'EndElement'
            {
                if($xmlReader.Name -eq 'Page')
                {
                    if(($articleCounter % 1000) -eq 0)
                    {
                        ## The article count is only an estimate, so clamp the
                        ## percentage: Write-Progress rejects values above 100.
                        $percentComplete = 0
                        if($estimatedArticleCount -gt 0)
                        {
                            $percentComplete = [int] [Math]::Min(100, $articleCounter * 100 / $estimatedArticleCount)
                        }

                        Write-Progress "Processing article ${articleCounter}: $currentTitle" -PercentComplete $percentComplete
                    }

                    ## Start a new subdirectory every 5,000 pages
                    if(($articleCounter % 5000) -eq 0)
                    {
                        $basePath = $null
                    }

                    $output = $currentArticle.ToString()

                    ## Remove tables, and {{cite ... }} and subheadings.
                    ## The patterns only match innermost blocks, so repeat
                    ## until a pass removes nothing.
                    do
                    {
                        $lengthBefore = $output.Length

                        ## Remove tables
                        $output = $output -replace "(?s){\|[^{}]+?\|}",""

                        ## Remove {{cite ... }} and subheadings
                        $output = $output -replace "(?s){{[^{}]+?}}",""
                    } while($output.Length -ne $lengthBefore)

                    ## Remove <ref some article />, then <ref some article></ref>.
                    ## Self-closing references go first so that the paired
                    ## pattern cannot start at a self-closing tag and swallow
                    ## the prose up to the next </ref>.
                    $output = $output -replace "(?s)<ref[^>]*?/>",""
                    $output = $output -replace "(?s)<ref.*?</ref>",""

                    ## Remove <!-- Some comment -->
                    $output = $output -replace "(?s)<!--.*?-->",""

                    ## Replace [[Article Reference|Description]] with Description
                    $output = $output -replace '(?s)\[\[([^\[\]]+)\|([^\[\]]+)\]\]','$2'

                    ## Replace [[Article Reference]] with Article Reference
                    $output = $output -replace '(?s)\[\[([^\[\|\]]+)\]\]','$1'

                    ## Remove [[File ... ]]
                    $output = $output -replace '(?s)\[\[File.*?\]\]',''

                    ## Remove everything after "References"
                    $output = $output -replace "(?s)==References.*",""

                    ## Normalize line endings (without doubling an existing
                    ## carriage return), and remove extraneous extra newlines
                    $output = $output -replace "\r?\n","`r`n"
                    $output = $output -replace "(`r`n){3,}","`r`n"

                    ## Clean up sequences of single quotes like '''Quoted'''
                    $output = $output -replace "'{2,}",'"'

                    ## Final cleanup
                    $output = $output.Trim()

                    if(
                        ## Skip articles that just redirect to other articles
                        ($output -notmatch "^#REDIRECT") -and

                        ## Skip very small articles
                        ($output.Length -gt 500) -and

                        ## Skip file metadata articles
                        ($currentTitle -notmatch "^FILE:") -and

                        ## Skip Wikipedia metadata articles
                        ($currentTitle -notmatch "^Wikipedia:") -and

                        ## Skip "category", "template", or "draft" articles
                        ($currentTitle -notmatch "^CATEGORY:|^TEMPLATE:|^DRAFT") -and

                        ## Skip "articles for deletion"
                        ($currentTitle -notmatch "Articles for deletion") -and

                        ## Skip "Spam link reports"
                        ($currentTitle -notmatch "Spam/LinkReports")
                    )
                    {
                        ## The first article written after a reset names the
                        ## subdirectory that holds the following articles.
                        if(-not $basePath)
                        {
                            $basePath = GetSafeFilename -BasePath "articles" -Text $currentTitle -Extension ""
                            $null = New-Item -Type Directory -Path (Join-Path articles $basePath)
                        }

                        $articleDirectory = Join-Path articles $basePath
                        $outputFile = GetSafeFilename -BasePath $articleDirectory $currentTitle

                        ## Join-Path keeps the path separator platform-neutral.
                        ## UTF8 is explicit because article text is full of
                        ## non-ASCII characters that a legacy ANSI default
                        ## encoding cannot represent.
                        Set-Content -LiteralPath (Join-Path $articleDirectory $outputFile) -Value $output -Encoding UTF8
                    }

                    $null = $currentArticle.Clear()
                    $articleCounter++
                }
                elseif($xmlReader.Name -eq 'Text')
                {
                    $capturing = $false
                }
            }
        }
    }
}
finally
{
    ## Release the handle on the dump file even if processing fails
    $xmlReader.Dispose()
}