Rob van der Woude's Scripting Pages
Powered by GeSHi

Source code for wpd2txt.cs

(view source code of wpd2txt.cs as plain text)

  1. using System;
  2. using System.Collections.Generic;
  3. using System.IO;
  4. using System.Text;
  5. using System.Text.RegularExpressions;
  6.  
  7.  
  8. namespace RobvanderWoude
  9. {
  10. 	internal class WPD2Txt
  11. 	{
  12. 		static readonly string progver = "1.00";
  13.  
  14.  
  15. 		static int Main( string[] args )
  16. 		{
  17. 			string wpfile =string.Empty;
  18. 			Encoding encoding = null;
  19.  
  20.  
  21. 			#region Parse Command Line
  22.  
  23. 			if ( args.Length == 0 || args.Length > 2 )
  24. 			{
  25. 				return ShowHelp( );
  26. 			}
  27.  
  28. 			foreach ( string arg in args )
  29. 			{
  30. 				if ( arg[0] == '/' )
  31. 				{
  32. 					if ( arg == "/?" )
  33. 					{
  34. 						return ShowHelp( );
  35. 					}
  36. 					else if ( arg.ToUpper( ) == "/E" )
  37. 					{
  38. 						return ListEncodings( );
  39. 					}
  40. 					else
  41. 					{
  42. 						return ShowHelp( "Invalid command line switch {0}", arg );
  43. 					}
  44. 				}
  45. 				else
  46. 				{
  47. 					if ( string.IsNullOrWhiteSpace( wpfile ) )
  48. 					{
  49. 						wpfile = arg;
  50. 						if ( !File.Exists( wpfile ) )
  51. 						{
  52. 							return ShowHelp( "File \"{0}\" not found", wpfile );
  53. 						}
  54. 						if ( Path.GetExtension( wpfile ).ToLower( ) != ".wpd" )
  55. 						{
  56. 							return ShowHelp( "This program can extract text from .WPD files only" );
  57. 						}
  58. 					}
  59. 					else if ( encoding == null )
  60. 					{
  61. 						encoding = GetEncoding( arg );
  62. 						if ( encoding == null )
  63. 						{
  64. 							return ShowHelp( "Invalid encoding \"{0}\"", args[1] );
  65. 						}
  66. 					}
  67. 					else
  68. 					{
  69. 						return ShowHelp( "Too many command line arguments" );
  70. 					}
  71. 				}
  72. 			}
  73.  
  74. 			if ( string.IsNullOrWhiteSpace( wpfile ) )
  75. 			{
  76. 				return ShowHelp( );
  77. 			}
  78.  
  79. 			#endregion Parse Command Line
  80.  
  81.  
  82. 			#region Extract Text
  83.  
  84. 			string wpcontent = File.ReadAllText( wpfile, Encoding.UTF8 );
  85.  
  86. 			// Remove (most of) the WPD file header - WARNING: regex pattern depends on Encoding used for StreamReader!
  87. 			Regex regex = new Regex( "^[\\w\\W]*\\000{8,}([^\\w]+[B-HJ-NP-TV-Z\\d])*[^\\w-]+", RegexOptions.IgnoreCase );
  88. 			wpcontent = regex.Replace( wpcontent, "" );
  89.  
  90. 			string plaintext = ExtractText( wpcontent );
  91.  
  92. 			plaintext = ConvertStrayCarriageReturns( plaintext );
  93.  
  94. 			#endregion Extract Text
  95.  
  96.  
  97. 			#region Display Text
  98.  
  99. 			if ( encoding == null )
  100. 			{
  101. 				// send text to console using default output encoding
  102. 				Console.WriteLine( plaintext );
  103. 			}
  104. 			else
  105. 			{
  106. 				// temporarily change output encoding and send text to console
  107. 				Encoding oldencoding = Console.OutputEncoding;
  108. 				Console.OutputEncoding = encoding;
  109. 				Console.WriteLine( plaintext );
  110. 				Console.OutputEncoding = oldencoding;
  111. 			}
  112.  
  113. 			#endregion Display Text
  114.  
  115.  
  116. 			return 0;
  117. 		}
  118.  
  119.  
  120. 		static string ConvertStrayCarriageReturns( string text )
  121. 		{
  122. 			// convert stray carriage returns to carriage return/linefeed pairs
  123. 			// search for stray carriage returns (\r), i.e. the ones NOT followed by linefeeds (\n)
  124. 			Regex regex = new Regex( "\r(?!\n)" );
  125. 			// replace each matching stray carriage return by a carriage return/linefeed pair
  126. 			text = regex.Replace( text, Environment.NewLine );
  127. 			return text;
  128. 		}
  129.  
  130.  
  131. 		static string ExtractText( string rawtext )
  132. 		{
  133. 			// WPD file format info based on http://justsolve.archiveteam.org/wiki/WordPerfect
  134. 			// Modified for spaces, linefeeds and e acute by yours truly
  135. 			// More modifications are required for accented characters
  136. 			string extractedtext = string.Empty;
  137. 			bool skip = false;
  138. 			int resume = -1;
  139. 			foreach ( char c in rawtext )
  140. 			{
  141. 				int i = (int)c;
  142. 				if ( !skip )
  143. 				{
  144. 					if ( i == 63 || i == 128 || i == 160 || i == 65533 )
  145. 					{
  146. 						extractedtext += ' ';
  147. 					}
  148. 					else if ( i >= 169 && i != 172 && i <= 174 )
  149. 					{
  150. 						extractedtext += '-';
  151. 					}
  152. 					else if ( i == 10 || i == 13 || i == 208 )
  153. 					{
  154. 						extractedtext += Environment.NewLine;
  155. 					}
  156. 					else if ( i >= 192 && i <= 236 )
  157. 					{
  158. 						skip = true;
  159. 						resume = i;
  160. 					}
  161. 					else if ( i == 15 )
  162. 					{
  163. 						extractedtext += (char)233;
  164. 					}
  165. 					else if ( i <= 31 || ( i >= 129 && i <= 159 ) || ( i >= 161 && i <= 168 ) || i == 172 || ( i >= 175 && i <= 191 ) || ( i >= 237 && i <= 255 ) )
  166. 					{
  167. 						// control characters, ignore
  168. 					}
  169. 					else
  170. 					{
  171. 						extractedtext += c;
  172. 					}
  173. 				}
  174. 				else if ( skip && i == resume )
  175. 				{
  176. 					skip = false;
  177. 					resume = -1;
  178. 				}
  179. 			}
  180. 			return extractedtext;
  181. 		}
  182.  
  183.  
  184. 		static Encoding GetEncoding( string myencoding )
  185. 		{
  186. 			if ( string.IsNullOrEmpty( myencoding ) )
  187. 			{
  188. 				return null;
  189. 			}
  190. 			// Get a list of available encodings
  191. 			EncodingInfo[] encodings = Encoding.GetEncodings( );
  192. 			// Try correctly spelled encodings first
  193. 			foreach ( EncodingInfo encoding in encodings )
  194. 			{
  195. 				if ( encoding.Name.ToLower( ) == myencoding.ToLower( ) )
  196. 				{
  197. 					return Encoding.GetEncoding( encoding.CodePage );
  198. 				}
  199. 			}
  200. 			// No direct match found, try again, ignoring dashes
  201. 			foreach ( EncodingInfo encoding in encodings )
  202. 			{
  203. 				if ( encoding.Name.Replace( "-", "" ).ToLower( ) == myencoding.Replace( "-", "" ).ToLower( ) )
  204. 				{
  205. 					return Encoding.GetEncoding( encoding.CodePage );
  206. 				}
  207. 			}
  208. 			// Still no match, try codepages
  209. 			foreach ( EncodingInfo encoding in encodings )
  210. 			{
  211. 				if ( encoding.CodePage.ToString( ) == myencoding )
  212. 				{
  213. 					return Encoding.GetEncoding( encoding.CodePage );
  214. 				}
  215. 			}
  216. 			// Still no match, giving up
  217. 			return null;
  218. 		}
  219.  
  220.  
  221. 		static int ListEncodings( )
  222. 		{
  223. 			try
  224. 			{
  225. 				Console.Clear( );
  226. 			}
  227. 			catch
  228. 			{
  229. 				// Console.Clear( ) throws an IO exception if the output is redirected
  230. 			}
  231. 			int columnwidth = 8;
  232. 			EncodingInfo[] allencodings = Encoding.GetEncodings( );
  233. 			List<string> allencodingnames = new List<string>( );
  234. 			foreach ( EncodingInfo enc in allencodings )
  235. 			{
  236. 				allencodingnames.Add( enc.Name );
  237. 			}
  238. 			allencodingnames.Sort( );
  239. 			foreach ( string enc in allencodingnames )
  240. 			{
  241. 				columnwidth = Math.Max( columnwidth, enc.Length );
  242. 			}
  243. 			Console.WriteLine( "{0,-" + columnwidth + "}   {1}", "Encoding", "CodePage" );
  244. 			Console.WriteLine( "{0,-" + columnwidth + "}   {1}", "========", "========" );
  245. 			foreach ( string enc in allencodingnames )
  246. 			{
  247. 				Console.WriteLine( "{0,-" + columnwidth + "}   {1}", enc, GetEncoding( enc ).CodePage );
  248. 			}
  249. 			return 0;
  250. 		}
  251.  
  252.  
  253. 		static int ShowHelp( params string[] errmsg )
  254. 		{
  255. 			#region Error Message
  256.  
  257. 			if ( errmsg.Length > 0 )
  258. 			{
  259. 				List<string> errargs = new List<string>( errmsg );
  260. 				errargs.RemoveAt( 0 );
  261. 				Console.Error.WriteLine( );
  262. 				Console.ForegroundColor = ConsoleColor.Red;
  263. 				Console.Error.Write( "ERROR:\t" );
  264. 				Console.ForegroundColor = ConsoleColor.White;
  265. 				Console.Error.WriteLine( errmsg[0], errargs.ToArray( ) );
  266. 				Console.ResetColor( );
  267. 			}
  268.  
  269. 			#endregion Error Message
  270.  
  271.  
  272. 			#region Help Text
  273.  
  274. 			/*
  275. 			WPD2Txt.exe,  Version 1.00
  276. 			Return plain text content of a WordPerfect file without requiring WordPerfect
  277.  
  278. 			Usage:    WPD2Txt.exe  wpfile  [ encoding ]
  279.  
  280. 			or:       WPD2Txt.exe  /E
  281.  
  282. 			Where:    wpfile       is the path of the WordPerfect file to be read
  283. 			                       (no wildcards, only .wpd extension allowed)
  284. 			          encoding     is the output encoding, e.g. UTF-8 to preserve
  285. 			                       Unicode characters, or IBM437 to convert Unicode
  286. 			                       doublequotes to ASCII
  287. 			          /E           list all available encodings
  288.  
  289. 			Notes:    This program is far from perfect, extracted text still contains
  290. 			          a lot of "garbage" and most accented characters will be lost; if
  291. 			          you have WordPerfect available, better use that to extract text.
  292. 			          If the specified encoding does not match any available encoding
  293. 			          name, the program will try again, ignoring dashes; if that does
  294. 			          not provide a match, the program will try matching the specified
  295. 			          encoding with the available encodings' codepages.
  296. 			          This program requires .NET 4.5.
  297. 			          Return code ("errorlevel") 1 in case of errors, 0 on success.
  298.  
  299. 			Written by Rob van der Woude
  300. 			https://www.robvanderwoude.com
  301. 			*/
  302.  
  303. 			#endregion Help Text
  304.  
  305.  
  306. 			#region Display Help Text
  307.  
  308. 			Console.Error.WriteLine( );
  309.  
  310. 			Console.Error.WriteLine( "WPD2Txt.exe,  Version {0}", progver );
  311.  
  312. 			Console.Error.WriteLine( "Return plain text content of a WordPerfect file without requiring WordPerfect" );
  313.  
  314. 			Console.Error.WriteLine( );
  315.  
  316. 			Console.Error.Write( "Usage:    " );
  317. 			Console.ForegroundColor = ConsoleColor.White;
  318. 			Console.Error.WriteLine( "WPD2Txt.exe  wpfile  [ encoding ]" );
  319. 			Console.ResetColor( );
  320.  
  321. 			Console.Error.WriteLine( );
  322.  
  323. 			Console.Error.Write( "or:       " );
  324. 			Console.ForegroundColor = ConsoleColor.White;
  325. 			Console.Error.WriteLine( "WPD2Txt.exe  /E" );
  326. 			Console.ResetColor( );
  327.  
  328. 			Console.Error.WriteLine( );
  329.  
  330. 			Console.Error.Write( "Where:    " );
  331. 			Console.ForegroundColor = ConsoleColor.White;
  332. 			Console.Error.Write( "wpfile" );
  333. 			Console.ResetColor( );
  334. 			Console.Error.WriteLine( "       is the path of the WordPerfect file to be read" );
  335.  
  336. 			Console.Error.WriteLine( "                       (no wildcards, only .wpd extension allowed)" );
  337.  
  338. 			Console.ForegroundColor = ConsoleColor.White;
  339. 			Console.Error.Write( "          encoding" );
  340. 			Console.ResetColor( );
  341. 			Console.Error.Write( "     is the output encoding, e.g. " );
  342. 			Console.ForegroundColor = ConsoleColor.White;
  343. 			Console.Error.Write( "UTF-8" );
  344. 			Console.ResetColor( );
  345. 			Console.Error.WriteLine( " to preserve" );
  346.  
  347. 			Console.Error.Write( "                       Unicode characters, or " );
  348. 			Console.ForegroundColor = ConsoleColor.White;
  349. 			Console.Error.Write( "IBM437" );
  350. 			Console.ResetColor( );
  351. 			Console.Error.WriteLine( " to convert Unicode" );
  352.  
  353. 			Console.Error.WriteLine( "                       doublequotes to ASCII" );
  354.  
  355. 			Console.ForegroundColor = ConsoleColor.White;
  356. 			Console.Error.Write( "         /E" );
  357. 			Console.ResetColor( );
  358. 			Console.Error.WriteLine( "            list all available encodings" );
  359.  
  360. 			Console.Error.WriteLine( );
  361.  
  362. 			Console.Error.WriteLine( "Notes:    This program is far from perfect, extracted text still contains" );
  363.  
  364. 			Console.Error.WriteLine( "          a lot of \"garbage\" and most accented characters will be lost; if" );
  365.  
  366. 			Console.Error.WriteLine( "          you have WordPerfect available, better use that to extract text." );
  367.  
  368. 			Console.Error.WriteLine( "          If the specified encoding does not match any available encoding" );
  369.  
  370. 			Console.Error.WriteLine( "          name, the program will try again, ignoring dashes; if that does" );
  371.  
  372. 			Console.Error.WriteLine( "          not provide a match, the program will try matching the specified" );
  373.  
  374. 			Console.Error.WriteLine( "          encoding with the available encodings' codepages." );
  375.  
  376. 			Console.Error.WriteLine( "          This program requires .NET 4.5." );
  377.  
  378. 			Console.Error.WriteLine( "          Return code (\"errorlevel\") 1 in case of errors, 0 on success." );
  379.  
  380. 			Console.Error.WriteLine( );
  381.  
  382. 			Console.Error.WriteLine( "Written by Rob van der Woude" );
  383.  
  384. 			Console.Error.WriteLine( "https://www.robvanderwoude.com" );
  385.  
  386. 			#endregion Display Help Text
  387.  
  388.  
  389. 			return 1;
  390. 		}
  391. 	}
  392. }
  393.  

page last modified: 2024-04-16; loaded in 0.0096 seconds