using System;
using System.Collections.Generic;
using System.IO;
using System.IO.Compression;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Windows.Forms;
using Word = Microsoft.Office.Interop.Word;


namespace RobvanderWoude
{
	/// <summary>
	/// Command line tool that extracts plain text from a word processor document and writes it
	/// to standard output. Word itself is used when it is installed; otherwise (or when Word
	/// yields no text) a best-effort built-in reader for .DOC, .DOCX, .ODT, .RTF or .WPD is used.
	/// </summary>
	internal class Word2Txt
	{
		static string progver = "1.05";

		// Text extracted by whichever reader ran last; the Read*File methods write it, Main prints it
		static string plaintext = string.Empty;

		// Encoding name found in the XML declaration of a .DOCX/.ODT content file; consumed by the /D switch
		static string xmlencoding = string.Empty;


		/// <summary>
		/// Program entry point: parses the command line, extracts the text and prints it.
		/// </summary>
		/// <param name="args">Document path plus an optional encoding name/codepage or /D; or /E; or /?</param>
		/// <returns>
		/// 0 if Word extracted the text, 1 if the built-in fallback reader extracted it,
		/// 2 on command line errors or if no text could be extracted.
		/// </returns>
		static int Main( string[] args )
		{
			int rc = 0;
			string document = string.Empty;
			bool success = false;
			bool usexmlencoding = false;
			Encoding encoding = null;

			#region Parse Command Line

			if ( args.Length == 0 || args.Length > 2 )
			{
				return ShowHelp( );
			}

			foreach ( string arg in args )
			{
				// StartsWith instead of arg[0]: an empty argument ("") must not throw,
				// it falls through to the "file not found" handling below
				if ( arg.StartsWith( "/", StringComparison.Ordinal ) )
				{
					if ( arg == "/?" )
					{
						return ShowHelp( );
					}
					else if ( arg.StartsWith( "/D", StringComparison.OrdinalIgnoreCase ) )
					{
						usexmlencoding = true;
					}
					else if ( string.Equals( arg, "/E", StringComparison.OrdinalIgnoreCase ) )
					{
						return ListEncodings( );
					}
					else
					{
						return ShowHelp( "Invalid command line switch {0}", arg );
					}
				}
				else if ( string.IsNullOrWhiteSpace( document ) )
				{
					// first unnamed argument is the document
					document = arg;
					if ( !File.Exists( document ) )
					{
						return ShowHelp( "File \"{0}\" not found", document );
					}
				}
				else if ( encoding == null )
				{
					// second unnamed argument is the output encoding
					encoding = GetEncoding( arg );
					if ( encoding == null )
					{
						return ShowHelp( "Invalid encoding \"{0}\"", arg );
					}
				}
				else
				{
					return ShowHelp( "Too many command line arguments" );
				}
			}

			if ( string.IsNullOrWhiteSpace( document ) )
			{
				return ShowHelp( );
			}

			#endregion Parse Command Line


			#region Extract Text

			// First try using Word if possible
			if ( IsWordInstalled( ) )
			{
				// If Word is installed, this program can handle ANY document format that is recognized by Word
				success = ReadWordFile( document );
			}

			// if Word isn't available or could not extract any text, try plan B
			if ( !success || string.IsNullOrWhiteSpace( plaintext ) )
			{
				rc = 1;
				switch ( Path.GetExtension( document ).ToLowerInvariant( ) )
				{
					case ".doc":
						success = ReadDocFile( document );
						break;
					case ".docx":
					case ".odt":
						success = ReadDocxOrOdtFile( document );
						break;
					case ".rtf":
						success = ReadRTFFile( document );
						break;
					case ".wpd":
						success = ReadWPDFile( document );
						break;
					default:
						return ShowHelp( "If Word is not installed or fails to extract text, this program can only handle .DOC, .DOCX, .ODT, .RTF and .WPD files" );
				}
			}

			#endregion Extract Text


			#region Cleanup Text and Display Result

			if ( success && !string.IsNullOrWhiteSpace( plaintext ) )
			{
				// convert stray carriage returns to carriage return/linefeed pairs
				plaintext = ConvertStrayCarriageReturns( plaintext ).Trim( "\n\r\t ".ToCharArray( ) );

				if ( usexmlencoding )
				{
					// xmlencoding is only filled by ReadDocxOrOdtFile; if it is empty the default encoding is used
					encoding = GetEncoding( xmlencoding );
				}

				if ( encoding == null )
				{
					// send text to console using default output encoding
					Console.WriteLine( plaintext );
				}
				else
				{
					// temporarily change output encoding and send text to console
					Encoding oldencoding = Console.OutputEncoding;
					try
					{
						Console.OutputEncoding = encoding;
						Console.WriteLine( plaintext );
					}
					finally
					{
						// restore the console even if writing failed
						Console.OutputEncoding = oldencoding;
					}
				}
			}
			else
			{
				rc = 2;
			}

			#endregion Cleanup Text and Display Result

			return rc;
		}


		/// <summary>
		/// Replaces every carriage return (\r) that is NOT followed by a linefeed (\n)
		/// by <see cref="Environment.NewLine"/>.
		/// </summary>
		/// <param name="text">Text to normalize</param>
		/// <returns>The text with stray carriage returns replaced</returns>
		static string ConvertStrayCarriageReturns( string text )
		{
			// negative lookahead: match \r only if the next character is not \n
			return Regex.Replace( text, "\r(?!\n)", Environment.NewLine );
		}


		/// <summary>
		/// Finds an available encoding by name, by name ignoring dashes, or by codepage number,
		/// in that order of preference.
		/// </summary>
		/// <param name="myencoding">Encoding name (e.g. "UTF-8", "utf8") or codepage number (e.g. "437")</param>
		/// <returns>The matching encoding, or null if the argument is empty or nothing matches</returns>
		static Encoding GetEncoding( string myencoding )
		{
			if ( string.IsNullOrEmpty( myencoding ) )
			{
				return null;
			}

			// Get a list of available encodings
			EncodingInfo[] encodings = Encoding.GetEncodings( );
			string nodashes = myencoding.Replace( "-", "" );

			// Try correctly spelled encodings first, then ignore dashes, finally try codepages
			EncodingInfo match =
				encodings.FirstOrDefault( e => string.Equals( e.Name, myencoding, StringComparison.OrdinalIgnoreCase ) )
				?? encodings.FirstOrDefault( e => string.Equals( e.Name.Replace( "-", "" ), nodashes, StringComparison.OrdinalIgnoreCase ) )
				?? encodings.FirstOrDefault( e => e.CodePage.ToString( ) == myencoding );

			// Still no match means giving up
			return match == null ? null : Encoding.GetEncoding( match.CodePage );
		}


		/// <summary>
		/// Checks whether the "Word.Application" ProgID can be resolved to a type.
		/// </summary>
		/// <returns>true if the ProgID resolves, false otherwise</returns>
		static bool IsWordInstalled( )
		{
			// Source: "How to Check Whether Word is Installed in the System or Not" by Tadit Dash
			// https://www.codeproject.com/Tips/689968/How-to-Check-Whether-Word-is-Installed-in-the-Syst
			return ( Type.GetTypeFromProgID( "Word.Application" ) != null );
		}


		/// <summary>
		/// Prints a two-column table (name, codepage) of all available encodings, sorted by name.
		/// </summary>
		/// <returns>Always 0</returns>
		static int ListEncodings( )
		{
			try
			{
				Console.Clear( );
			}
			catch ( IOException )
			{
				// Console.Clear( ) throws an IO exception if the output is redirected
			}

			List<EncodingInfo> allencodings = Encoding.GetEncodings( ).OrderBy( enc => enc.Name ).ToList( );

			// first column is as wide as the longest name, but never narrower than its header
			int columnwidth = 8;
			foreach ( EncodingInfo enc in allencodings )
			{
				columnwidth = Math.Max( columnwidth, enc.Name.Length );
			}
			string rowformat = "{0,-" + columnwidth + "} {1}";

			Console.WriteLine( rowformat, "Encoding", "CodePage" );
			Console.WriteLine( rowformat, "========", "========" );
			foreach ( EncodingInfo enc in allencodings )
			{
				// EncodingInfo already carries the codepage, no need to look the encoding up by name again
				Console.WriteLine( rowformat, enc.Name, enc.CodePage );
			}

			return 0;
		}


		/// <summary>
		/// Fallback reader for binary .DOC files: collects runs of at least 20 characters that
		/// contain none of the excluded bytes and keeps the runs that are pure ASCII.
		/// The result is stored in the plaintext field.
		/// </summary>
		/// <param name="docfile">Path of the .DOC file</param>
		/// <returns>false if the file could not be read, is empty or has no matching runs; true otherwise</returns>
		static bool ReadDocFile( string docfile )
		{
			string doccontent;
			try
			{
				using ( StreamReader sr = new StreamReader( docfile, false ) )
				{
					doccontent = sr.ReadToEnd( ).Trim( "\n\t ".ToCharArray( ) );
				}
			}
			catch ( IOException )
			{
				ShowHelp( "Access to file \"{0}\" denied", docfile );
				return false;
			}

			if ( doccontent.Length == 0 )
			{
				return false;
			}

			// ignore everything from the "[Content_Types]" marker onwards;
			// ordinal search so the lookup and the cut-off position always agree
			int marker = doccontent.IndexOf( "[Content_Types]", StringComparison.Ordinal );
			if ( marker >= 0 )
			{
				doccontent = doccontent.Substring( 0, marker );
			}

			// runs of 20 or more characters, excluding (octal) 000, 015, 367 and 377
			MatchCollection matches = Regex.Matches( doccontent, "[^\\000\\015\\367\\377]{20,}" );
			if ( matches.Count == 0 )
			{
				return false;
			}

			StringBuilder collected = new StringBuilder( );
			foreach ( Match match in matches )
			{
				string matchingtext = match.Value.Trim( "\n\t ".ToCharArray( ) );
				// byte count equals character count only for pure ASCII text; runs containing char 4 are skipped too
				bool asciionly = ( Encoding.UTF8.GetByteCount( matchingtext ) == matchingtext.Length );
				if ( asciionly && matchingtext.IndexOf( (char)4 ) < 0 )
				{
					collected.Append( matchingtext ).Append( "\n" );
				}
			}
			plaintext = collected.ToString( );

			return true;
		}


		/// <summary>
		/// Fallback reader for .DOCX and .ODT files: opens the document as a ZIP archive, reads the
		/// XML file holding the text, stores the XML declaration's encoding name in the xmlencoding
		/// field (for the /D switch) and the text with all tags removed in the plaintext field.
		/// </summary>
		/// <param name="docfile">Path of the .DOCX or .ODT file</param>
		/// <returns>true if the content XML was found and read, false otherwise</returns>
		static bool ReadDocxOrOdtFile( string docfile )
		{
			string contentfile;
			string ext = Path.GetExtension( docfile ).ToLowerInvariant( );
			if ( ext == ".odt" ) // OpenOffice document
			{
				contentfile = "content.xml";
			}
			else if ( ext == ".docx" ) // MS Office document
			{
				contentfile = "document.xml";
			}
			else
			{
				return false;
			}

			string content = string.Empty;
			bool success = false;

			try
			{
				// Open document as ZIP file and read the XML file containing the text content
				// straight from the archive, so no temporary file can be left behind
				using ( ZipArchive archive = ZipFile.OpenRead( docfile ) )
				{
					foreach ( ZipArchiveEntry entry in archive.Entries )
					{
						// no break: if several entries share the name, the last one wins
						if ( string.Equals( entry.Name, contentfile, StringComparison.OrdinalIgnoreCase ) )
						{
							using ( StreamReader sr = new StreamReader( entry.Open( ) ) )
							{
								content = sr.ReadToEnd( ).Trim( "\n\r\t ".ToCharArray( ) );
							}
							success = true;
						}
					}
				}
			}
			catch ( IOException )
			{
				ShowHelp( "Access to file \"{0}\" denied", docfile );
				return false;
			}
			catch ( InvalidDataException )
			{
				// not a valid ZIP archive, i.e. the document is corrupted
				return false;
			}

			if ( !success )
			{
				return false;
			}

			// The first 100 characters of the extracted XML usually contain its encoding;
			// this encoding will be used if the /D command line switch was used
			Regex regex = new Regex( " encoding=\"([^\"]+)\"" );
			xmlencoding = regex.Match( content, 0, Math.Min( 100, content.Length ) ).Groups[1].Value;

			// insert newlines after headers, list items and paragraphs
			// NOTE(review): both patterns were empty strings in the reviewed source (an empty pattern
			// matches between every character); they were reconstructed from the comment above and
			// the .DOCX/.ODT markup - confirm against the author's original
			plaintext = Regex.Replace( content, "</(text|w):(h|p)>", "\n\n" );
			plaintext = Regex.Replace( plaintext, "<(w:br|text:line-break)[^>]*/>", "\n\n" );

			// remove all XML tags
			plaintext = Regex.Replace( plaintext, "<[^>]+>", "" );

			return true;
		}


		/// <summary>
		/// Fallback reader for .RTF files: lets a hidden RichTextBox convert the RTF to plain text
		/// and stores the result in the plaintext field.
		/// </summary>
		/// <param name="rtffile">Path of the .RTF file</param>
		/// <returns>false if the file could not be read or is not valid RTF, true otherwise</returns>
		static bool ReadRTFFile( string rtffile )
		{
			// Use a hidden RichTextBox to convert RTF to plain text, by Wendy Zang
			// https://social.msdn.microsoft.com/Forums/vstudio/en-US/6e56af9b-d7d3-49f3-9ec4-80edde3fe54b/reading-modifying-rtf-files?forum=csharpgeneral#a64345e9-cfcb-43be-ab18-c08fae02cb2a
			try
			{
				using ( RichTextBox rtbox = new RichTextBox( ) )
				{
					rtbox.Rtf = File.ReadAllText( rtffile );
					plaintext = rtbox.Text;
				}
			}
			catch ( IOException )
			{
				return false;
			}
			catch ( ArgumentException )
			{
				// the Rtf setter rejects text that is not valid RTF
				return false;
			}
			return true;
		}


		/// <summary>
		/// Reads the document text through Word automation and stores it in the plaintext field.
		/// </summary>
		/// <param name="wordfile">Path of any document Word can open</param>
		/// <returns>true if Word opened the document and returned its text, false on any error</returns>
		static bool ReadWordFile( string wordfile )
		{
			Word.Application wordapp = null;
			object savechanges = Word.WdSaveOptions.wdDoNotSaveChanges;
			bool success = false;
			try
			{
				// created inside the try block so a failing COM activation leads to the fallback readers
				wordapp = new Word.Application( );
				wordapp.Visible = false;
				// Word resolves relative paths against its own working directory, so pass a full path
				Word.Document worddoc = wordapp.Documents.Open( Path.GetFullPath( wordfile ) );
				wordapp.Selection.WholeStory( );
				plaintext = worddoc.Content.Text;
				worddoc.Close( ref savechanges );
				success = true;
			}
			catch ( Exception )
			{
				success = false;
			}
			finally
			{
				if ( wordapp != null )
				{
					try
					{
						wordapp.Quit( ref savechanges );
					}
					catch ( Exception )
					{
						// best effort: a failing Quit must not prevent the fallback readers from running
					}
				}
			}
			return success;
		}


		/// <summary>
		/// Fallback reader for WordPerfect .WPD files: strips (most of) the file header, then maps
		/// or drops control codes character by character. The result is stored in the plaintext field.
		/// </summary>
		/// <param name="wpfile">Path of the .WPD file</param>
		/// <returns>true if any non-whitespace text was extracted, false otherwise</returns>
		static bool ReadWPDFile( string wpfile )
		{
			string wpcontent;
			try
			{
				wpcontent = File.ReadAllText( wpfile, Encoding.UTF8 );
			}
			catch ( IOException )
			{
				ShowHelp( "Access to file \"{0}\" denied", wpfile );
				return false;
			}

			// Remove (most of) the WPD file header - WARNING: regex pattern depends on Encoding used for reading the file!
			Regex regex = new Regex( "^[\\w\\W]*\\000{8,}([^\\w]+[B-HJ-NP-TV-Z\\d])*[^\\w-]+", RegexOptions.IgnoreCase );
			wpcontent = regex.Replace( wpcontent, "" );

			// WPD file format info based on http://justsolve.archiveteam.org/wiki/WordPerfect
			// Modified for spaces, linefeeds and e acute by yours truly
			// More modifications are required for accented characters
			StringBuilder text = new StringBuilder( );
			bool skip = false;
			int resume = -1;
			foreach ( char c in wpcontent )
			{
				int i = (int)c;

				if ( skip )
				{
					// inside a skipped sequence: drop everything up to and including
					// the next occurrence of the code that started it
					if ( i == resume )
					{
						skip = false;
						resume = -1;
					}
					continue;
				}

				if ( i == 63 || i == 128 || i == 160 || i == 65533 )
				{
					text.Append( ' ' );
				}
				else if ( i >= 169 && i != 172 && i <= 174 )
				{
					text.Append( '-' );
				}
				else if ( i == 10 || i == 13 || i == 208 )
				{
					// tested before the 192..236 range below, so 208 is a line break, not a skip marker
					text.Append( Environment.NewLine );
				}
				else if ( i >= 192 && i <= 236 )
				{
					// start of a sequence that runs until the same code is seen again
					skip = true;
					resume = i;
				}
				else if ( i == 15 )
				{
					text.Append( (char)233 ); // e acute
				}
				else if ( i <= 31 || ( i >= 129 && i <= 159 ) || ( i >= 161 && i <= 168 ) || i == 172 || ( i >= 175 && i <= 191 ) || ( i >= 237 && i <= 255 ) )
				{
					// control characters, ignore
				}
				else
				{
					text.Append( c );
				}
			}
			plaintext = text.ToString( );

			return !string.IsNullOrWhiteSpace( plaintext );
		}


		/// <summary>
		/// Writes text to standard error in white, then restores the console colors.
		/// </summary>
		/// <param name="text">Text to write (no newline is appended)</param>
		static void WriteHighlighted( string text )
		{
			Console.ForegroundColor = ConsoleColor.White;
			Console.Error.Write( text );
			Console.ResetColor( );
		}


		/// <summary>
		/// Writes an optional error message followed by the help text to standard error.
		/// </summary>
		/// <param name="errmsg">
		/// Optional: a composite format string followed by its arguments, e.g.
		/// ShowHelp( "File \"{0}\" not found", path )
		/// </param>
		/// <returns>Always 2, so callers can return the result as the program's exit code</returns>
		static int ShowHelp( params string[] errmsg )
		{
			#region Error Message

			if ( errmsg.Length > 0 )
			{
				// first element is the format string, the remaining ones are its arguments
				object[] errargs = errmsg.Skip( 1 ).Cast<object>( ).ToArray( );
				Console.Error.WriteLine( );
				Console.ForegroundColor = ConsoleColor.Red;
				Console.Error.Write( "ERROR:\t" );
				Console.ForegroundColor = ConsoleColor.White;
				Console.Error.WriteLine( errmsg[0], errargs );
				Console.ResetColor( );
			}

			#endregion Error Message


			#region Display Help Text

			Console.Error.WriteLine( );
			Console.Error.WriteLine( "Word2Txt, Version {0}", progver );
			Console.Error.WriteLine( "Extract plain text from a Word document and send it to the screen" );
			Console.Error.WriteLine( );

			Console.Error.Write( "Usage: " );
			WriteHighlighted( "Word2Txt \"wordfile\" [ encoding | /D ]" );
			Console.Error.WriteLine( );
			Console.Error.WriteLine( );

			Console.Error.Write( "or: " );
			WriteHighlighted( "Word2Txt /E" );
			Console.Error.WriteLine( );
			Console.Error.WriteLine( );

			Console.Error.Write( "Where: " );
			WriteHighlighted( "wordfile" );
			Console.Error.WriteLine( " is the path of the Word document to be read" );
			Console.Error.WriteLine( " (no wildcards allowed)" );

			WriteHighlighted( " encoding" );
			Console.Error.WriteLine( " force use of alternative encoding for plain" );
			Console.Error.Write( " text, e.g. " );
			WriteHighlighted( "UTF-8" );
			Console.Error.WriteLine( " to preserve accented characters" );
			Console.Error.Write( " or " );
			WriteHighlighted( "IBM437" );
			Console.Error.WriteLine( " to convert unicode quotes to ASCII" );

			WriteHighlighted( " /D" );
			Console.Error.WriteLine( " use the encoding specified in the document file" );
			Console.Error.WriteLine( " (for .DOCX and .ODT only, if Word isn't available)" );

			WriteHighlighted( " /E" );
			Console.Error.WriteLine( " list all available encodings" );
			Console.Error.WriteLine( );

			Console.Error.WriteLine( "Notes: If a \"regular\" (MSI based) Microsoft Word (2007 or later)" );
			Console.Error.WriteLine( " installation is detected, this program will use Word to read the" );
			Console.Error.WriteLine( " text from the Word file, which may be ANY file format" );
			Console.Error.WriteLine( " recognized by Word." );
			Console.Error.WriteLine( " If Word was already active when this program is started, any other" );
			Console.Error.WriteLine( " opened document(s) will be left alone, and only the document opened" );
			Console.Error.WriteLine( " by this program will be closed." );
			Console.Error.WriteLine( " If Word is not available, or if it encounters unreadable content" );
			Console.Error.WriteLine( " (i.e. the file is corrupted), the text can still be extracted, but" );
			Console.Error.WriteLine( " only from .DOC, .DOCX, .ODT, .RTF and .WPD files." );
			Console.Error.WriteLine( " If the specified encoding does not match any available encoding name," );
			Console.Error.WriteLine( " the program will try again, ignoring dashes; if that does not provide" );
			Console.Error.WriteLine( " a match, the program will try matching the specified encoding with" );
			Console.Error.WriteLine( " the available encodings' codepages." );
			Console.Error.WriteLine( " This program requires .NET 4.5." );
			Console.Error.WriteLine( " Return code (\"errorlevel\") 0 means Word encountered no errors and" );
			Console.Error.WriteLine( " some text was extracted from the file; 1 means Word is not available" );
			Console.Error.WriteLine( " or the file was corrupted; 2 means either command line errors or the" );
			Console.Error.WriteLine( " program failed to extract any text." );
			Console.Error.WriteLine( );
			Console.Error.WriteLine( "Written by Rob van der Woude" );
			Console.Error.WriteLine( "https://www.robvanderwoude.com" );

			#endregion Display Help Text

			return 2;
		}
	}
}