Rob van der Woude's Scripting Pages
Powered by GeSHi

Source code for docx2txt.cs

(view source code of docx2txt.cs as plain text)

  1. using System;
  2. using System.Collections.Generic;
  3. using System.IO;
  4. using System.IO.Compression;
  5. using System.Text;
  6. using System.Text.RegularExpressions;
  7.  
  8.  
  9. namespace RobvanderWoude
  10. {
  11. 	internal class Docx2Txt
  12. 	{
  13. 		static string progver = "1.01";
  14.  
  15.  
		/// <summary>
		/// Program entry point: parses the command line, reads the main XML part of the
		/// specified .docx or .odt file, strips the markup and writes the remaining text
		/// to standard output.
		/// </summary>
		/// <param name="args">
		/// Command line arguments: a document path, optionally followed by an encoding
		/// name/codepage or /D; or /E to list encodings; or /? for help.
		/// </param>
		/// <returns>0 on success (or after /E), 1 after help or an error message was shown.</returns>
		static int Main( string[] args )
		{
			string document = string.Empty;
			string docext = string.Empty;
			Encoding encoding = null;
			bool usexmlencoding = false;

			#region Parse Command Line

			if ( args.Length == 0 || args.Length > 2 )
			{
				return ShowHelp( );
			}

			foreach ( string arg in args )
			{
				// StartsWith is safe for an empty argument (""), whereas arg[0] would throw
				if ( arg.StartsWith( "/", StringComparison.Ordinal ) )
				{
					if ( arg == "/?" )
					{
						return ShowHelp( );
					}
					else if ( arg.StartsWith( "/D", StringComparison.OrdinalIgnoreCase ) )
					{
						usexmlencoding = true;
					}
					else if ( arg.Equals( "/E", StringComparison.OrdinalIgnoreCase ) )
					{
						return ListEncodings( );
					}
					else
					{
						return ShowHelp( "Invalid command line switch {0}", arg );
					}
				}
				else
				{
					if ( string.IsNullOrWhiteSpace( document ) )
					{
						// first non-switch argument is the document
						document = arg;
						if ( !File.Exists( document ) )
						{
							return ShowHelp( "File \"{0}\" not found", document );
						}
						docext = Path.GetExtension( document ).ToLowerInvariant( );
						if ( docext != ".docx" && docext != ".odt" )
						{
							return ShowHelp( "This program can extract text from .DOCX and .ODT files only" );
						}
					}
					else if ( encoding == null )
					{
						// second non-switch argument is the output encoding
						encoding = GetEncoding( arg );
						if ( encoding == null )
						{
							return ShowHelp( "Invalid encoding \"{0}\"", arg );
						}
					}
					else
					{
						return ShowHelp( "Too many command line arguments" );
					}
				}
			}

			if ( string.IsNullOrWhiteSpace( document ) )
			{
				return ShowHelp( );
			}

			#endregion Parse Command Line


			#region Extract Text

			string content;
			if ( !TryReadContentXml( document, docext, out content ) )
			{
				return ShowHelp( "An error occurred while trying to read \"{0}\"", document );
			}

			#endregion Extract Text


			// The first 100 characters of the extracted XML usually contain its encoding;
			// this encoding will be used if the /D command line switch was used.
			// The search length is capped at the content length, because Regex.Match throws
			// if the requested range extends past the end of the input.
			Regex regex = new Regex( " encoding=\"([^\"]+)\"" );
			string xmlencoding = regex.Match( content, 0, Math.Min( 100, content.Length ) ).Groups[1].Value;

			#region Cleanup Text

			// insert newlines after headers, list items and paragraphs
			regex = new Regex( "</(text|w):(h|p)>" );
			string plaintext = regex.Replace( content, "\n\n" );
			regex = new Regex( "<w:br/>" );
			plaintext = regex.Replace( plaintext, "\n\n" );
			// remove all XML tags
			regex = new Regex( "<[^>]+>" );
			plaintext = regex.Replace( plaintext, "" );
			// decode character and entity references (&amp; &lt; &#8217; etc.) which would
			// otherwise show up literally in the output; this must happen AFTER the tags
			// have been removed, or a decoded "<" could be mistaken for markup
			plaintext = System.Net.WebUtility.HtmlDecode( plaintext );
			// convert stray carriage returns to the platform's newline sequence
			plaintext = ConvertStrayCarriageReturns( plaintext ).Trim( "\n\r\t ".ToCharArray( ) );

			#endregion Cleanup Text


			#region Display Text

			if ( usexmlencoding )
			{
				// null if the XML declaration has no (known) encoding: default encoding is used then
				encoding = GetEncoding( xmlencoding );
			}

			if ( encoding == null )
			{
				// send text to console using default output encoding
				Console.WriteLine( plaintext );
			}
			else
			{
				// temporarily change output encoding and send text to console
				Encoding oldencoding = Console.OutputEncoding;
				try
				{
					Console.OutputEncoding = encoding;
					Console.WriteLine( plaintext );
				}
				finally
				{
					// restore the console's encoding even if writing failed
					Console.OutputEncoding = oldencoding;
				}
			}

			#endregion Display Text


			return 0;
		}


		/// <summary>
		/// Opens the document as a ZIP archive and reads the XML part that holds the text:
		/// "content.xml" for .odt, "word/document.xml" for .docx.
		/// </summary>
		/// <param name="document">Path of the .docx or .odt file.</param>
		/// <param name="docext">Lower case extension of the document, including the dot.</param>
		/// <param name="content">Receives the trimmed XML text, or an empty string on failure.</param>
		/// <returns>True if the XML part was found and read; false otherwise.</returns>
		static bool TryReadContentXml( string document, string docext, out string content )
		{
			content = string.Empty;

			string contentpath;
			if ( docext == ".odt" ) // OpenOffice document
			{
				contentpath = "content.xml";
			}
			else if ( docext == ".docx" ) // MS Office document
			{
				contentpath = "word/document.xml";
			}
			else
			{
				return false;
			}
			string contentname = Path.GetFileName( contentpath );

			try
			{
				using ( ZipArchive archive = ZipFile.OpenRead( document ) )
				{
					ZipArchiveEntry match = null;
					foreach ( ZipArchiveEntry entry in archive.Entries )
					{
						// Prefer the entry at the expected path: matching on the file name only
						// could pick up e.g. "Object 1/content.xml" of an embedded object
						if ( entry.FullName.Replace( '\\', '/' ).Equals( contentpath, StringComparison.OrdinalIgnoreCase ) )
						{
							match = entry;
							break;
						}
						// Fallback for documents that store the part at an unusual location
						if ( match == null && entry.Name.Equals( contentname, StringComparison.OrdinalIgnoreCase ) )
						{
							match = entry;
						}
					}

					if ( match == null )
					{
						return false;
					}

					// Read straight from the archive, no temporary file required
					using ( Stream stream = match.Open( ) )
					using ( StreamReader reader = new StreamReader( stream ) )
					{
						content = reader.ReadToEnd( ).Trim( "\n\r\t ".ToCharArray( ) );
					}
					return true;
				}
			}
			catch ( InvalidDataException )
			{
				// not a (valid) ZIP archive
				return false;
			}
			catch ( IOException )
			{
				return false;
			}
			catch ( UnauthorizedAccessException )
			{
				return false;
			}
			catch ( NotSupportedException )
			{
				return false;
			}
		}
  186.  
  187.  
		/// <summary>
		/// Replaces every carriage return (\r) that is not immediately followed by a
		/// linefeed (\n) with <see cref="Environment.NewLine"/>; existing \r\n pairs and
		/// lone linefeeds are left untouched.
		/// </summary>
		/// <param name="text">The text to normalize.</param>
		/// <returns>The text with stray carriage returns replaced.</returns>
		static string ConvertStrayCarriageReturns( string text )
		{
			// negative lookahead: match \r only when the next character is not \n
			const string straycr = "\r(?!\n)";
			return Regex.Replace( text, straycr, Environment.NewLine );
		}
  197.  
  198.  
		/// <summary>
		/// Looks up an encoding among the encodings returned by
		/// <see cref="Encoding.GetEncodings"/>, trying three strategies in order:
		/// exact name (case-insensitive), name with all dashes ignored, and codepage number.
		/// </summary>
		/// <param name="myencoding">Encoding name (e.g. "utf-8", "UTF8") or codepage (e.g. "65001").</param>
		/// <returns>The matching encoding, or null if the argument is null, empty or matches nothing.</returns>
		static Encoding GetEncoding( string myencoding )
		{
			if ( string.IsNullOrEmpty( myencoding ) )
			{
				return null;
			}
			// Get a list of available encodings
			EncodingInfo[] encodings = Encoding.GetEncodings( );
			// Try correctly spelled encodings first.
			// Ordinal comparison is used on purpose: culture-sensitive ToLower( ) maps
			// "I" to a dotless "ı" in Turkish cultures, which makes "IBM437" or
			// "ISO-8859-1" fail to match their own names.
			foreach ( EncodingInfo encoding in encodings )
			{
				if ( string.Equals( encoding.Name, myencoding, StringComparison.OrdinalIgnoreCase ) )
				{
					return Encoding.GetEncoding( encoding.CodePage );
				}
			}
			// No direct match found, try again, ignoring dashes
			string nodashes = myencoding.Replace( "-", "" );
			foreach ( EncodingInfo encoding in encodings )
			{
				if ( string.Equals( encoding.Name.Replace( "-", "" ), nodashes, StringComparison.OrdinalIgnoreCase ) )
				{
					return Encoding.GetEncoding( encoding.CodePage );
				}
			}
			// Still no match, try codepages
			foreach ( EncodingInfo encoding in encodings )
			{
				if ( encoding.CodePage.ToString( ) == myencoding )
				{
					return Encoding.GetEncoding( encoding.CodePage );
				}
			}
			// Still no match, giving up
			return null;
		}
  234.  
  235.  
		/// <summary>
		/// Writes a two-column table (name, codepage) of all encodings returned by
		/// <see cref="Encoding.GetEncodings"/> to standard output, sorted by name.
		/// </summary>
		/// <returns>Always 0.</returns>
		static int ListEncodings( )
		{
			// Clearing the screen only makes sense for an interactive console
			if ( !Console.IsOutputRedirected )
			{
				try
				{
					Console.Clear( );
				}
				catch ( IOException )
				{
					// Console.Clear( ) throws an IO exception if there is no usable console;
					// the listing itself is still useful, so carry on
				}
			}

			EncodingInfo[] allencodings = Encoding.GetEncodings( );
			// Same (culture-sensitive) ordering as the default string sort
			Array.Sort( allencodings, ( x, y ) => string.Compare( x.Name, y.Name, StringComparison.CurrentCulture ) );

			// The name column is at least as wide as its header and fits the longest name
			int columnwidth = 8;
			foreach ( EncodingInfo enc in allencodings )
			{
				columnwidth = Math.Max( columnwidth, enc.Name.Length );
			}

			string rowformat = "{0,-" + columnwidth + "}   {1}";
			Console.WriteLine( rowformat, "Encoding", "CodePage" );
			Console.WriteLine( rowformat, "========", "========" );
			foreach ( EncodingInfo enc in allencodings )
			{
				// EncodingInfo already carries the codepage, no need to look up each
				// encoding by name again
				Console.WriteLine( rowformat, enc.Name, enc.CodePage );
			}
			return 0;
		}
  266.  
  267.  
		/// <summary>
		/// Writes an optional error message followed by the program's help text to
		/// standard error, highlighting command line elements in white.
		/// </summary>
		/// <param name="errmsg">
		/// Optional: a composite format string, followed by the values for its placeholders.
		/// </param>
		/// <returns>Always 1, so callers can return it directly as the program's exit code.</returns>
		static int ShowHelp( params string[] errmsg )
		{
			#region Error Message

			if ( errmsg.Length > 0 )
			{
				// everything after the first element is an argument for the format string
				string[] formatargs = new string[errmsg.Length - 1];
				Array.Copy( errmsg, 1, formatargs, 0, formatargs.Length );

				Console.Error.WriteLine( );
				Console.ForegroundColor = ConsoleColor.Red;
				Console.Error.Write( "ERROR:\t" );
				Console.ForegroundColor = ConsoleColor.White;
				Console.Error.WriteLine( errmsg[0], formatargs );
				Console.ResetColor( );
			}

			#endregion Error Message


			#region Display Help Text

			// Title
			Console.Error.WriteLine( );
			Console.Error.WriteLine( "Docx2Txt.exe,  Version {0}", progver );
			Console.Error.WriteLine( "Return the plain text content of a Word .DOCX or OpenOffice .ODT file" );
			Console.Error.WriteLine( "without requiring Word or OpenOffice" );
			Console.Error.WriteLine( );

			// Usage
			Console.Error.Write( "Usage:    " );
			WriteHighlighted( "Docx2Txt.exe  docfile  [ encoding | /D ]", true );
			Console.Error.WriteLine( );
			Console.Error.Write( "or:       " );
			WriteHighlighted( "Docx2Txt.exe  /E", true );
			Console.Error.WriteLine( );

			// Arguments
			Console.Error.Write( "Where:    " );
			WriteHighlighted( "docfile", false );
			Console.Error.WriteLine( "       is the path of the file to be read (no wildcards," );
			Console.Error.WriteLine( "                        only .docx and .odt extension allowed)" );

			WriteHighlighted( "          encoding", false );
			Console.Error.Write( "      is the output encoding, e.g. " );
			WriteHighlighted( "UTF-8", false );
			Console.Error.WriteLine( " to preserve" );
			Console.Error.Write( "                        Unicode characters, or " );
			WriteHighlighted( "IBM437", false );
			Console.Error.WriteLine( " to convert Unicode" );
			Console.Error.WriteLine( "                        doublequotes to ASCII" );

			WriteHighlighted( "          /D", false );
			Console.Error.WriteLine( "            use the encoding specified in the document file" );

			WriteHighlighted( "          /E", false );
			Console.Error.WriteLine( "            list all available encodings" );
			Console.Error.WriteLine( );

			// Notes and credits: plain lines without highlighting
			string[] closinglines =
			{
				"Notes:    If the specified encoding does not match any available encoding",
				"          name, the program will try again, ignoring dashes; if that does",
				"          not provide a match, the program will try matching the specified",
				"          encoding with the available encodings' codepages.",
				"          This program requires .NET 4.5.",
				"          Return code (\"errorlevel\") 1 in case of errors, 0 on success.",
				"",
				"Written by Rob van der Woude",
				"https://www.robvanderwoude.com"
			};
			foreach ( string line in closinglines )
			{
				Console.Error.WriteLine( line );
			}

			#endregion Display Help Text


			return 1;
		}


		/// <summary>
		/// Writes text to standard error in white, then resets the console colors.
		/// </summary>
		/// <param name="text">The text to write.</param>
		/// <param name="endline">True to terminate the line after the text.</param>
		static void WriteHighlighted( string text, bool endline )
		{
			Console.ForegroundColor = ConsoleColor.White;
			if ( endline )
			{
				Console.Error.WriteLine( text );
			}
			else
			{
				Console.Error.Write( text );
			}
			Console.ResetColor( );
		}
  406. 	}
  407. }
  408.  

page last uploaded: 2021-01-27