using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;


namespace RobvanderWoude
{
    /// <summary>
    /// Command line tool that prints an approximation of the plain text
    /// content of a WordPerfect (.wpd) file without requiring WordPerfect.
    /// </summary>
    internal class WPD2Txt
    {
        static readonly string progver = "1.00";

        // Matches (most of) the WPD file header.
        // WARNING: this pattern depends on the encoding used to read the file (UTF-8, see Main).
        static readonly Regex headerpattern = new Regex( "^[\\w\\W]*\\000{8,}([^\\w]+[B-HJ-NP-TV-Z\\d])*[^\\w-]+", RegexOptions.IgnoreCase );

        // Matches stray carriage returns (\r), i.e. the ones NOT followed by linefeeds (\n).
        static readonly Regex straycrpattern = new Regex( "\r(?!\n)" );


        /// <summary>
        /// Program entry point: parses the command line, reads the file,
        /// extracts its text and writes the text to standard output.
        /// </summary>
        /// <param name="args">Either "wpfile [ encoding ]", "/E" or "/?".</param>
        /// <returns>0 on success, 1 if help or an error message was shown.</returns>
        static int Main( string[] args )
        {
            string wpfile = string.Empty;
            Encoding encoding = null;

            #region Parse Command Line

            if ( args.Length == 0 || args.Length > 2 )
            {
                return ShowHelp( );
            }

            foreach ( string arg in args )
            {
                // StartsWith instead of arg[0]: an empty argument ("") must not crash the program
                if ( arg.StartsWith( "/", StringComparison.Ordinal ) )
                {
                    if ( arg == "/?" )
                    {
                        return ShowHelp( );
                    }
                    else if ( string.Equals( arg, "/E", StringComparison.OrdinalIgnoreCase ) )
                    {
                        return ListEncodings( );
                    }
                    else
                    {
                        return ShowHelp( "Invalid command line switch {0}", arg );
                    }
                }
                else
                {
                    if ( string.IsNullOrWhiteSpace( wpfile ) )
                    {
                        // First non-switch argument is the file to read
                        wpfile = arg;
                        if ( !File.Exists( wpfile ) )
                        {
                            return ShowHelp( "File \"{0}\" not found", wpfile );
                        }
                        if ( !string.Equals( Path.GetExtension( wpfile ), ".wpd", StringComparison.OrdinalIgnoreCase ) )
                        {
                            return ShowHelp( "This program can extract text from .WPD files only" );
                        }
                    }
                    else if ( encoding == null )
                    {
                        // Second non-switch argument is the output encoding
                        encoding = GetEncoding( arg );
                        if ( encoding == null )
                        {
                            return ShowHelp( "Invalid encoding \"{0}\"", arg );
                        }
                    }
                    else
                    {
                        return ShowHelp( "Too many command line arguments" );
                    }
                }
            }

            if ( string.IsNullOrWhiteSpace( wpfile ) )
            {
                return ShowHelp( );
            }

            #endregion Parse Command Line


            #region Extract Text

            string wpcontent;
            try
            {
                wpcontent = File.ReadAllText( wpfile, Encoding.UTF8 );
            }
            catch ( IOException e )
            {
                // e.g. file locked by another process; report it instead of crashing
                return ShowHelp( "Error reading file \"{0}\": {1}", wpfile, e.Message );
            }
            catch ( UnauthorizedAccessException e )
            {
                return ShowHelp( "Error reading file \"{0}\": {1}", wpfile, e.Message );
            }

            // Remove (most of) the WPD file header
            wpcontent = headerpattern.Replace( wpcontent, "" );

            string plaintext = ExtractText( wpcontent );
            plaintext = ConvertStrayCarriageReturns( plaintext );

            #endregion Extract Text


            #region Display Text

            if ( encoding == null )
            {
                // send text to console using default output encoding
                Console.WriteLine( plaintext );
            }
            else
            {
                // temporarily change output encoding and send text to console;
                // the finally block restores the original encoding even if writing fails
                Encoding oldencoding = Console.OutputEncoding;
                Console.OutputEncoding = encoding;
                try
                {
                    Console.WriteLine( plaintext );
                }
                finally
                {
                    Console.OutputEncoding = oldencoding;
                }
            }

            #endregion Display Text


            return 0;
        }


        /// <summary>
        /// Converts stray carriage returns (\r not followed by \n) to the
        /// platform's newline sequence; existing \r\n pairs are left untouched.
        /// </summary>
        /// <param name="text">Text to convert.</param>
        /// <returns>The converted text.</returns>
        static string ConvertStrayCarriageReturns( string text )
        {
            return straycrpattern.Replace( text, Environment.NewLine );
        }


        /// <summary>
        /// Filters the decoded file content character by character, keeping
        /// readable text and dropping or translating WordPerfect codes.
        /// </summary>
        /// <remarks>
        /// WPD file format info based on http://justsolve.archiveteam.org/wiki/WordPerfect
        /// Modified for spaces, linefeeds and e acute by the original author.
        /// More modifications are required for accented characters.
        /// </remarks>
        /// <param name="rawtext">File content with (most of) the header removed.</param>
        /// <returns>The extracted text.</returns>
        static string ExtractText( string rawtext )
        {
            // StringBuilder avoids the quadratic cost of repeated string concatenation
            StringBuilder extractedtext = new StringBuilder( rawtext.Length );
            bool skip = false;
            int resume = -1;

            foreach ( char c in rawtext )
            {
                int i = (int)c;
                if ( !skip )
                {
                    if ( i == 63 || i == 128 || i == 160 || i == 65533 )
                    {
                        // '?', 128, 160 and U+FFFD (the Unicode replacement character) become spaces
                        extractedtext.Append( ' ' );
                    }
                    else if ( i >= 169 && i != 172 && i <= 174 )
                    {
                        // 169..174 except 172 become hyphens
                        extractedtext.Append( '-' );
                    }
                    else if ( i == 10 || i == 13 || i == 208 )
                    {
                        // each of these codes becomes a line break of its own
                        extractedtext.Append( Environment.NewLine );
                    }
                    else if ( i >= 192 && i <= 236 )
                    {
                        // start of a code sequence: skip everything up to and
                        // including the next occurrence of this same code
                        skip = true;
                        resume = i;
                    }
                    else if ( i == 15 )
                    {
                        // e acute
                        extractedtext.Append( (char)233 );
                    }
                    else if ( i <= 31 || ( i >= 129 && i <= 159 ) || ( i >= 161 && i <= 168 ) || i == 172 || ( i >= 175 && i <= 191 ) || ( i >= 237 && i <= 255 ) )
                    {
                        // control characters, ignore
                    }
                    else
                    {
                        extractedtext.Append( c );
                    }
                }
                else if ( i == resume )
                {
                    // closing code of the skipped sequence found, resume copying
                    skip = false;
                    resume = -1;
                }
            }

            return extractedtext.ToString( );
        }


        /// <summary>
        /// Looks up an encoding by name (case-insensitive), then by name
        /// ignoring dashes, then by codepage number.
        /// </summary>
        /// <param name="myencoding">Encoding name or codepage as typed by the user.</param>
        /// <returns>The matching encoding, or null if there is no match.</returns>
        static Encoding GetEncoding( string myencoding )
        {
            if ( string.IsNullOrEmpty( myencoding ) )
            {
                return null;
            }

            // Get a list of available encodings
            EncodingInfo[] encodings = Encoding.GetEncodings( );

            // Try correctly spelled encodings first; ordinal comparison keeps the
            // match independent of the current culture (e.g. Turkish dotless i)
            foreach ( EncodingInfo encoding in encodings )
            {
                if ( string.Equals( encoding.Name, myencoding, StringComparison.OrdinalIgnoreCase ) )
                {
                    return Encoding.GetEncoding( encoding.CodePage );
                }
            }

            // No direct match found, try again, ignoring dashes
            string nodashes = myencoding.Replace( "-", "" );
            foreach ( EncodingInfo encoding in encodings )
            {
                if ( string.Equals( encoding.Name.Replace( "-", "" ), nodashes, StringComparison.OrdinalIgnoreCase ) )
                {
                    return Encoding.GetEncoding( encoding.CodePage );
                }
            }

            // Still no match, try codepages
            foreach ( EncodingInfo encoding in encodings )
            {
                if ( encoding.CodePage.ToString( ) == myencoding )
                {
                    return Encoding.GetEncoding( encoding.CodePage );
                }
            }

            // Still no match, giving up
            return null;
        }


        /// <summary>
        /// Writes a table of all available encoding names and their
        /// codepages, sorted by name, to standard output.
        /// </summary>
        /// <returns>Always 0.</returns>
        static int ListEncodings( )
        {
            try
            {
                Console.Clear( );
            }
            catch ( IOException )
            {
                // Console.Clear( ) throws an IO exception if the output is redirected
            }

            int columnwidth = 8;
            List<EncodingInfo> allencodings = new List<EncodingInfo>( Encoding.GetEncodings( ) );
            // Same ordering as the default string sort (current culture)
            allencodings.Sort( ( a, b ) => string.Compare( a.Name, b.Name, StringComparison.CurrentCulture ) );

            foreach ( EncodingInfo enc in allencodings )
            {
                columnwidth = Math.Max( columnwidth, enc.Name.Length );
            }

            string rowformat = "{0,-" + columnwidth + "} {1}";
            Console.WriteLine( rowformat, "Encoding", "CodePage" );
            Console.WriteLine( rowformat, "========", "========" );
            foreach ( EncodingInfo enc in allencodings )
            {
                Console.WriteLine( rowformat, enc.Name, enc.CodePage );
            }

            return 0;
        }


        /// <summary>
        /// Writes text to standard error in white, then resets the console colors.
        /// </summary>
        /// <param name="text">Text to write.</param>
        /// <param name="endline">True to terminate the line after the text.</param>
        static void WriteHighlighted( string text, bool endline = false )
        {
            Console.ForegroundColor = ConsoleColor.White;
            if ( endline )
            {
                Console.Error.WriteLine( text );
            }
            else
            {
                Console.Error.Write( text );
            }
            Console.ResetColor( );
        }


        /// <summary>
        /// Writes an optional error message followed by the help text to standard error.
        /// </summary>
        /// <param name="errmsg">
        /// Optional: a composite format string followed by its format arguments.
        /// </param>
        /// <returns>Always 1, so callers can return it as the program's exit code.</returns>
        static int ShowHelp( params string[] errmsg )
        {
            #region Error Message

            if ( errmsg != null && errmsg.Length > 0 )
            {
                // Everything after the first element is a format argument
                object[] errargs = new object[errmsg.Length - 1];
                Array.Copy( errmsg, 1, errargs, 0, errargs.Length );

                Console.Error.WriteLine( );
                Console.ForegroundColor = ConsoleColor.Red;
                Console.Error.Write( "ERROR:\t" );
                Console.ForegroundColor = ConsoleColor.White;
                Console.Error.WriteLine( errmsg[0], errargs );
                Console.ResetColor( );
            }

            #endregion Error Message


            #region Help Text

            /*
            WPD2Txt.exe,  Version 1.00
            Return plain text content of a WordPerfect file without requiring WordPerfect

            Usage:    WPD2Txt.exe  wpfile  [ encoding ]

            or:       WPD2Txt.exe  /E

            Where:    wpfile       is the path of the WordPerfect file to be read
                                   (no wildcards, only .wpd extension allowed)
                      encoding     is the output encoding, e.g. UTF-8 to preserve
                                   Unicode characters, or IBM437 to convert Unicode
                                   doublequotes to ASCII
                      /E           list all available encodings

            Notes:    This program is far from perfect, extracted text still contains
                      a lot of "garbage" and most accented characters will be lost; if
                      you have WordPerfect available, better use that to extract text.
                      If the specified encoding does not match any available encoding
                      name, the program will try again, ignoring dashes; if that does
                      not provide a match, the program will try matching the specified
                      encoding with the available encodings' codepages.
                      This program requires .NET 4.5.
                      Return code ("errorlevel") 1 in case of errors, 0 on success.

            Written by Rob van der Woude
            https://www.robvanderwoude.com
            */

            #endregion Help Text


            #region Display Help Text

            Console.Error.WriteLine( );
            Console.Error.WriteLine( "WPD2Txt.exe, Version {0}", progver );
            Console.Error.WriteLine( "Return plain text content of a WordPerfect file without requiring WordPerfect" );
            Console.Error.WriteLine( );

            Console.Error.Write( "Usage: " );
            WriteHighlighted( "WPD2Txt.exe wpfile [ encoding ]", true );
            Console.Error.WriteLine( );

            Console.Error.Write( "or: " );
            WriteHighlighted( "WPD2Txt.exe /E", true );
            Console.Error.WriteLine( );

            Console.Error.Write( "Where: " );
            WriteHighlighted( "wpfile" );
            Console.Error.WriteLine( " is the path of the WordPerfect file to be read" );
            Console.Error.WriteLine( " (no wildcards, only .wpd extension allowed)" );

            WriteHighlighted( " encoding" );
            Console.Error.Write( " is the output encoding, e.g. " );
            WriteHighlighted( "UTF-8" );
            Console.Error.WriteLine( " to preserve" );
            Console.Error.Write( " Unicode characters, or " );
            WriteHighlighted( "IBM437" );
            Console.Error.WriteLine( " to convert Unicode" );
            Console.Error.WriteLine( " doublequotes to ASCII" );

            WriteHighlighted( " /E" );
            Console.Error.WriteLine( " list all available encodings" );
            Console.Error.WriteLine( );

            Console.Error.WriteLine( "Notes: This program is far from perfect, extracted text still contains" );
            Console.Error.WriteLine( " a lot of \"garbage\" and most accented characters will be lost; if" );
            Console.Error.WriteLine( " you have WordPerfect available, better use that to extract text." );
            Console.Error.WriteLine( " If the specified encoding does not match any available encoding" );
            Console.Error.WriteLine( " name, the program will try again, ignoring dashes; if that does" );
            Console.Error.WriteLine( " not provide a match, the program will try matching the specified" );
            Console.Error.WriteLine( " encoding with the available encodings' codepages." );
            Console.Error.WriteLine( " This program requires .NET 4.5." );
            Console.Error.WriteLine( " Return code (\"errorlevel\") 1 in case of errors, 0 on success." );
            Console.Error.WriteLine( );
            Console.Error.WriteLine( "Written by Rob van der Woude" );
            Console.Error.WriteLine( "https://www.robvanderwoude.com" );

            #endregion Display Help Text


            return 1;
        }
    }
}