// (view source code of wpd2txt.cs as plain text)
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
namespace RobvanderWoude
{
internal class WPD2Txt
{
// Program version, shown in the banner of the help text
static readonly string progver = "1.00";

/// <summary>
/// Program entry point: parses the command line, reads the specified .wpd
/// file, strips (most of) its header, extracts the plain text and writes it
/// to the console, optionally using a caller-specified output encoding.
/// </summary>
/// <param name="args">
/// One or two arguments: the path of the .wpd file, optionally followed by
/// an encoding name or codepage; or a single switch (/? or /E).
/// </param>
/// <returns>
/// 0 after the text has been written, or the return value of
/// ShowHelp( ) / ListEncodings( ) when one of those is invoked.
/// </returns>
static int Main( string[] args )
{
    string wpfile = string.Empty;
    Encoding encoding = null;

    #region Parse Command Line

    if ( args.Length == 0 || args.Length > 2 )
    {
        return ShowHelp( );
    }

    foreach ( string arg in args )
    {
        // StartsWith instead of arg[0]: an empty argument ("") has no first
        // character and would otherwise raise an IndexOutOfRangeException
        if ( arg.StartsWith( "/", StringComparison.Ordinal ) )
        {
            if ( arg == "/?" )
            {
                return ShowHelp( );
            }
            else if ( string.Equals( arg, "/E", StringComparison.OrdinalIgnoreCase ) )
            {
                return ListEncodings( );
            }
            else
            {
                return ShowHelp( "Invalid command line switch {0}", arg );
            }
        }
        else if ( string.IsNullOrWhiteSpace( wpfile ) )
        {
            // first non-switch argument is the WordPerfect file
            wpfile = arg;
            if ( !File.Exists( wpfile ) )
            {
                return ShowHelp( "File \"{0}\" not found", wpfile );
            }
            // ordinal comparison: the result must not depend on the current culture
            if ( !string.Equals( Path.GetExtension( wpfile ), ".wpd", StringComparison.OrdinalIgnoreCase ) )
            {
                return ShowHelp( "This program can extract text from .WPD files only" );
            }
        }
        else if ( encoding == null )
        {
            // second non-switch argument is the output encoding
            encoding = GetEncoding( arg );
            if ( encoding == null )
            {
                return ShowHelp( "Invalid encoding \"{0}\"", arg );
            }
        }
        else
        {
            return ShowHelp( "Too many command line arguments" );
        }
    }

    if ( string.IsNullOrWhiteSpace( wpfile ) )
    {
        return ShowHelp( );
    }

    #endregion Parse Command Line

    #region Extract Text

    string wpcontent;
    try
    {
        wpcontent = File.ReadAllText( wpfile, Encoding.UTF8 );
    }
    catch ( IOException e )
    {
        // report read failures through the regular error path instead of crashing;
        // the details are passed as format arguments so braces in them are harmless
        return ShowHelp( "Unable to read file \"{0}\": {1}", wpfile, e.Message );
    }
    catch ( UnauthorizedAccessException e )
    {
        return ShowHelp( "Unable to read file \"{0}\": {1}", wpfile, e.Message );
    }

    // Remove (most of) the WPD file header - WARNING: regex pattern depends on the Encoding used to read the file!
    Regex regex = new Regex( "^[\\w\\W]*\\000{8,}([^\\w]+[B-HJ-NP-TV-Z\\d])*[^\\w-]+", RegexOptions.IgnoreCase );
    wpcontent = regex.Replace( wpcontent, "" );

    string plaintext = ExtractText( wpcontent );
    plaintext = ConvertStrayCarriageReturns( plaintext );

    #endregion Extract Text

    #region Display Text

    if ( encoding == null )
    {
        // send text to console using default output encoding
        Console.WriteLine( plaintext );
    }
    else
    {
        // temporarily change output encoding and send text to console;
        // try/finally makes sure the original encoding is restored even if writing fails
        Encoding oldencoding = Console.OutputEncoding;
        try
        {
            Console.OutputEncoding = encoding;
            Console.WriteLine( plaintext );
        }
        finally
        {
            Console.OutputEncoding = oldencoding;
        }
    }

    #endregion Display Text

    return 0;
}
/// <summary>
/// Replaces every "stray" carriage return, i.e. a carriage return (\r) that
/// is NOT immediately followed by a linefeed (\n), by Environment.NewLine.
/// Carriage returns that are already part of a \r\n pair are left untouched.
/// </summary>
/// <param name="text">The text to be normalized.</param>
/// <returns>The text with all stray carriage returns replaced.</returns>
static string ConvertStrayCarriageReturns( string text )
{
    // "\r(?!\n)": a carriage return with a negative lookahead for a linefeed
    return Regex.Replace( text, "\r(?!\n)", Environment.NewLine );
}
/// <summary>
/// Filters the raw (header-stripped) content of a WordPerfect file, character
/// by character, into plain text.
/// WPD file format info based on http://justsolve.archiveteam.org/wiki/WordPerfect,
/// modified for spaces, linefeeds and e acute; more modifications are required
/// for accented characters.
/// </summary>
/// <param name="rawtext">The raw text to be filtered; null is treated as empty.</param>
/// <returns>The extracted plain text (an empty string for null or empty input).</returns>
static string ExtractText( string rawtext )
{
    if ( string.IsNullOrEmpty( rawtext ) )
    {
        return string.Empty;
    }

    // StringBuilder instead of string concatenation: appending to a string in a
    // per-character loop copies the whole result on every iteration
    StringBuilder extractedtext = new StringBuilder( rawtext.Length );
    bool skip = false;
    int resume = -1;

    foreach ( char c in rawtext )
    {
        int i = (int) c;

        if ( skip )
        {
            // while skipping, discard everything up to and including the next
            // occurrence of the code that started the skip
            if ( i == resume )
            {
                skip = false;
                resume = -1;
            }
            continue;
        }

        // NOTE: the order of the tests matters, e.g. 208 must be handled as a
        // line break before the 192..236 range test gets to see it
        if ( i == 63 || i == 128 || i == 160 || i == 65533 )
        {
            extractedtext.Append( ' ' );
        }
        else if ( i >= 169 && i != 172 && i <= 174 )
        {
            extractedtext.Append( '-' );
        }
        else if ( i == 10 || i == 13 || i == 208 )
        {
            extractedtext.Append( Environment.NewLine );
        }
        else if ( i >= 192 && i <= 236 )
        {
            // start skipping until the same code is encountered again
            // (presumably a WordPerfect function code with a matching terminator)
            skip = true;
            resume = i;
        }
        else if ( i == 15 )
        {
            // e acute
            extractedtext.Append( (char) 233 );
        }
        else if ( i <= 31 || ( i >= 129 && i <= 159 ) || ( i >= 161 && i <= 168 ) || i == 172 || ( i >= 175 && i <= 191 ) || ( i >= 237 && i <= 255 ) )
        {
            // control characters, ignore
        }
        else
        {
            extractedtext.Append( c );
        }
    }

    return extractedtext.ToString( );
}
/// <summary>
/// Looks up an available encoding by name or by codepage number.
/// Three passes are made over the encodings reported by Encoding.GetEncodings( ):
/// first a case-insensitive name match, then a case-insensitive name match
/// ignoring dashes, and finally a match on the codepage number.
/// </summary>
/// <param name="myencoding">Encoding name (e.g. "UTF-8" or "utf8") or codepage (e.g. "65001").</param>
/// <returns>The matching Encoding, or null if myencoding is null, empty or does not match.</returns>
static Encoding GetEncoding( string myencoding )
{
    if ( string.IsNullOrEmpty( myencoding ) )
    {
        return null;
    }

    // Get a list of available encodings
    EncodingInfo[] encodings = Encoding.GetEncodings( );

    // Try correctly spelled encodings first.
    // OrdinalIgnoreCase instead of comparing ToLower( ) results: culture-sensitive
    // lowercasing may map "I" to a dotless i (e.g. Turkish), which would make
    // "US-ASCII" or "ISO-8859-1" fail to match their lower case names
    foreach ( EncodingInfo encoding in encodings )
    {
        if ( string.Equals( encoding.Name, myencoding, StringComparison.OrdinalIgnoreCase ) )
        {
            return Encoding.GetEncoding( encoding.CodePage );
        }
    }

    // No direct match found, try again, ignoring dashes
    string nodashes = myencoding.Replace( "-", "" );
    foreach ( EncodingInfo encoding in encodings )
    {
        if ( string.Equals( encoding.Name.Replace( "-", "" ), nodashes, StringComparison.OrdinalIgnoreCase ) )
        {
            return Encoding.GetEncoding( encoding.CodePage );
        }
    }

    // Still no match, try codepages (formatted culture-independently)
    foreach ( EncodingInfo encoding in encodings )
    {
        if ( encoding.CodePage.ToString( System.Globalization.CultureInfo.InvariantCulture ) == myencoding )
        {
            return Encoding.GetEncoding( encoding.CodePage );
        }
    }

    // Still no match, giving up
    return null;
}
/// <summary>
/// Writes a table of all encodings reported by Encoding.GetEncodings( ),
/// sorted by name, with their codepages to standard output.
/// The console is cleared first, unless standard output is redirected.
/// </summary>
/// <returns>Always 0.</returns>
static int ListEncodings( )
{
    // Console.Clear( ) throws an IOException if the output is redirected,
    // so skip it in that case; clearing the screen is best effort only
    if ( !Console.IsOutputRedirected )
    {
        try
        {
            Console.Clear( );
        }
        catch ( IOException )
        {
            // no console window to clear, ignore
        }
    }

    // Sort a copy of the list by name (current culture, like List<string>.Sort( ));
    // the codepage is the tie-breaker because names are not guaranteed to be unique
    // (NOTE(review): .NET Framework is believed to report "iso-2022-jp" for both
    // codepage 50220 and 50222 - confirm against the Encoding documentation)
    List<EncodingInfo> allencodings = new List<EncodingInfo>( Encoding.GetEncodings( ) );
    allencodings.Sort( ( a, b ) =>
    {
        int byname = string.Compare( a.Name, b.Name );
        return byname != 0 ? byname : a.CodePage.CompareTo( b.CodePage );
    } );

    // The first column is as wide as the longest name, with a minimum of 8 (the header text)
    int columnwidth = 8;
    foreach ( EncodingInfo enc in allencodings )
    {
        columnwidth = Math.Max( columnwidth, enc.Name.Length );
    }

    string rowformat = "{0,-" + columnwidth + "} {1}";
    Console.WriteLine( rowformat, "Encoding", "CodePage" );
    Console.WriteLine( rowformat, "========", "========" );
    foreach ( EncodingInfo enc in allencodings )
    {
        // use the codepage of the entry itself instead of looking it up again by
        // name, which is slower and returns the first match only for duplicate names
        Console.WriteLine( rowformat, enc.Name, enc.CodePage );
    }

    return 0;
}
/// <summary>
/// Writes an optional error message, followed by the program's help text,
/// to standard error; key words are highlighted in white.
/// </summary>
/// <param name="errmsg">
/// Optional: a composite format string, followed by the values for its
/// placeholders. Without arguments only the help text is shown.
/// </param>
/// <returns>Always 1, to be used as the program's return code.</returns>
static int ShowHelp( params string[] errmsg )
{
    TextWriter stderr = Console.Error;

    // Write a text fragment in white, then restore the default color
    Action<string> highlight = fragment =>
    {
        Console.ForegroundColor = ConsoleColor.White;
        stderr.Write( fragment );
        Console.ResetColor( );
    };

    // Same as highlight, but terminates the line before restoring the default color
    Action<string> highlightline = fragment =>
    {
        Console.ForegroundColor = ConsoleColor.White;
        stderr.WriteLine( fragment );
        Console.ResetColor( );
    };

    #region Error Message

    if ( errmsg.Length > 0 )
    {
        // everything after the first element serves as values for the format string
        string[] formatvalues = new string[errmsg.Length - 1];
        Array.Copy( errmsg, 1, formatvalues, 0, formatvalues.Length );

        stderr.WriteLine( );
        Console.ForegroundColor = ConsoleColor.Red;
        stderr.Write( "ERROR:\t" );
        Console.ForegroundColor = ConsoleColor.White;
        stderr.WriteLine( errmsg[0], formatvalues );
        Console.ResetColor( );
    }

    #endregion Error Message

    #region Display Help Text

    // Banner
    stderr.WriteLine( );
    stderr.WriteLine( "WPD2Txt.exe, Version {0}", progver );
    stderr.WriteLine( "Return plain text content of a WordPerfect file without requiring WordPerfect" );
    stderr.WriteLine( );

    // Usage
    stderr.Write( "Usage: " );
    highlightline( "WPD2Txt.exe wpfile [ encoding ]" );
    stderr.WriteLine( );
    stderr.Write( "or: " );
    highlightline( "WPD2Txt.exe /E" );
    stderr.WriteLine( );

    // Command line arguments
    stderr.Write( "Where: " );
    highlight( "wpfile" );
    stderr.WriteLine( " is the path of the WordPerfect file to be read" );
    stderr.WriteLine( " (no wildcards, only .wpd extension allowed)" );
    highlight( " encoding" );
    stderr.Write( " is the output encoding, e.g. " );
    highlight( "UTF-8" );
    stderr.WriteLine( " to preserve" );
    stderr.Write( " Unicode characters, or " );
    highlight( "IBM437" );
    stderr.WriteLine( " to convert Unicode" );
    stderr.WriteLine( " doublequotes to ASCII" );
    highlight( " /E" );
    stderr.WriteLine( " list all available encodings" );
    stderr.WriteLine( );

    // Notes, written as literal text (no format placeholders)
    string[] notes =
    {
        "Notes: This program is far from perfect, extracted text still contains",
        " a lot of \"garbage\" and most accented characters will be lost; if",
        " you have WordPerfect available, better use that to extract text.",
        " If the specified encoding does not match any available encoding",
        " name, the program will try again, ignoring dashes; if that does",
        " not provide a match, the program will try matching the specified",
        " encoding with the available encodings' codepages.",
        " This program requires .NET 4.5.",
        " Return code (\"errorlevel\") 1 in case of errors, 0 on success."
    };
    foreach ( string note in notes )
    {
        stderr.WriteLine( note );
    }
    stderr.WriteLine( );

    // Credits
    stderr.WriteLine( "Written by Rob van der Woude" );
    stderr.WriteLine( "https://www.robvanderwoude.com" );

    #endregion Display Help Text

    return 1;
}
}
}
// page last modified: 2024-04-16; loaded in 0.0096 seconds