// Source code of word2txt.cs
using System;
using System.Collections.Generic;
using System.IO;
using System.IO.Compression;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Windows.Forms;
using Word = Microsoft.Office.Interop.Word;
namespace RobvanderWoude
{
	/// <summary>
	/// Command line tool that extracts plain text from a word processor document
	/// and writes it to the console. Microsoft Word is used when it is installed;
	/// otherwise built-in fallback readers handle .DOC, .DOCX, .ODT, .RTF and .WPD files.
	/// </summary>
	internal class Word2Txt
	{
		// Program version, shown in the help text
		static readonly string progver = "1.05";

		// Text extracted by whichever Read*File method was used last
		static string plaintext = string.Empty;

		// Encoding name found in the XML declaration of a .DOCX/.ODT content file;
		// set by ReadDocxOrOdtFile and used by Main when the /D switch is specified
		static string xmlencoding = string.Empty;


		/// <summary>
		/// Program entry point: parses the command line, extracts the text and writes it to the console.
		/// </summary>
		/// <param name="args">Command line arguments: a document path, optionally followed by an encoding or /D; or /E; or /?</param>
		/// <returns>
		/// 0 if Word extracted the text; 1 if one of the fallback readers extracted the text;
		/// 2 on command line errors or if no text could be extracted.
		/// </returns>
		static int Main( string[] args )
		{
			int rc = 0;
			string document = string.Empty;
			bool success = false;
			bool usexmlencoding = false;
			Encoding encoding = null;


			#region Parse Command Line

			if ( args.Length == 0 || args.Length > 2 )
			{
				return ShowHelp( );
			}

			foreach ( string arg in args )
			{
				// StartsWith instead of arg[0], because the latter throws on an empty argument ("")
				if ( arg.StartsWith( "/", StringComparison.Ordinal ) )
				{
					if ( arg == "/?" )
					{
						return ShowHelp( );
					}
					else if ( arg.StartsWith( "/D", StringComparison.OrdinalIgnoreCase ) )
					{
						usexmlencoding = true;
					}
					else if ( arg.Equals( "/E", StringComparison.OrdinalIgnoreCase ) )
					{
						return ListEncodings( );
					}
					else
					{
						return ShowHelp( "Invalid command line switch {0}", arg );
					}
				}
				else
				{
					if ( string.IsNullOrWhiteSpace( document ) )
					{
						document = arg;
						if ( !File.Exists( document ) )
						{
							return ShowHelp( "File \"{0}\" not found", document );
						}
					}
					else if ( encoding == null )
					{
						encoding = GetEncoding( arg );
						if ( encoding == null )
						{
							return ShowHelp( "Invalid encoding \"{0}\"", arg );
						}
					}
					else
					{
						return ShowHelp( "Too many command line arguments" );
					}
				}
			}

			if ( string.IsNullOrWhiteSpace( document ) )
			{
				return ShowHelp( );
			}

			#endregion Parse Command Line


			#region Extract Text

			// First try using Word if possible
			if ( IsWordInstalled( ) )
			{
				// If Word is installed, this program can handle ANY document format that is recognized by Word
				success = ReadWordFile( document );
			}

			// if Word isn't available or could not extract any text, try plan B
			if ( !success || string.IsNullOrWhiteSpace( plaintext ) )
			{
				rc = 1;
				string ext = Path.GetExtension( document ).ToLowerInvariant( );
				if ( ext == ".doc" )
				{
					success = ReadDocFile( document );
				}
				else if ( ext == ".docx" || ext == ".odt" )
				{
					success = ReadDocxOrOdtFile( document );
				}
				else if ( ext == ".rtf" )
				{
					success = ReadRTFFile( document );
				}
				else if ( ext == ".wpd" )
				{
					success = ReadWPDFile( document );
				}
				else
				{
					return ShowHelp( "If Word is not installed or fails to extract text, this program can only handle .DOC, .DOCX, .ODT, .RTF and .WPD files" );
				}
			}

			#endregion Extract Text


			#region Cleanup Text and Display Result

			if ( success && !string.IsNullOrWhiteSpace( plaintext ) )
			{
				// convert stray carriage returns to carriage return/linefeed pairs
				plaintext = ConvertStrayCarriageReturns( plaintext ).Trim( '\n', '\r', '\t', ' ' );

				if ( usexmlencoding )
				{
					// xmlencoding is only filled by ReadDocxOrOdtFile; if it is empty
					// GetEncoding returns null and the default output encoding is used
					encoding = GetEncoding( xmlencoding );
				}

				if ( encoding == null )
				{
					// send text to console using default output encoding
					Console.WriteLine( plaintext );
				}
				else
				{
					// temporarily change output encoding and send text to console
					Encoding oldencoding = Console.OutputEncoding;
					try
					{
						Console.OutputEncoding = encoding;
						Console.WriteLine( plaintext );
					}
					finally
					{
						// always restore the console's original encoding, even if writing failed
						Console.OutputEncoding = oldencoding;
					}
				}
			}
			else
			{
				rc = 2;
			}

			#endregion Cleanup Text and Display Result


			return rc;
		}


		/// <summary>
		/// Replaces every carriage return that is NOT followed by a linefeed
		/// by the platform's newline sequence (Environment.NewLine).
		/// </summary>
		/// <param name="text">Text that may contain stray carriage returns</param>
		/// <returns>The text with all stray carriage returns replaced</returns>
		static string ConvertStrayCarriageReturns( string text )
		{
			// "\r(?!\n)" matches a carriage return only if it is not directly followed by a linefeed
			return Regex.Replace( text, "\r(?!\n)", Environment.NewLine );
		}


		/// <summary>
		/// Finds an available encoding by name, by name ignoring dashes, or by codepage number.
		/// </summary>
		/// <param name="myencoding">Encoding name (e.g. "UTF-8" or "utf8") or codepage number (e.g. "437")</param>
		/// <returns>The matching Encoding, or null if the argument is empty or no match was found</returns>
		static Encoding GetEncoding( string myencoding )
		{
			if ( string.IsNullOrEmpty( myencoding ) )
			{
				return null;
			}

			// Get a list of available encodings
			EncodingInfo[] encodings = Encoding.GetEncodings( );

			// Try correctly spelled encodings first
			foreach ( EncodingInfo encoding in encodings )
			{
				if ( string.Equals( encoding.Name, myencoding, StringComparison.OrdinalIgnoreCase ) )
				{
					return Encoding.GetEncoding( encoding.CodePage );
				}
			}

			// No direct match found, try again, ignoring dashes
			string nodashes = myencoding.Replace( "-", "" );
			foreach ( EncodingInfo encoding in encodings )
			{
				if ( string.Equals( encoding.Name.Replace( "-", "" ), nodashes, StringComparison.OrdinalIgnoreCase ) )
				{
					return Encoding.GetEncoding( encoding.CodePage );
				}
			}

			// Still no match, try codepages
			foreach ( EncodingInfo encoding in encodings )
			{
				if ( encoding.CodePage.ToString( ) == myencoding )
				{
					return Encoding.GetEncoding( encoding.CodePage );
				}
			}

			// Still no match, giving up
			return null;
		}


		/// <summary>
		/// Checks whether the "Word.Application" COM ProgID is registered on this computer.
		/// </summary>
		/// <returns>True if the ProgID can be resolved to a type</returns>
		static bool IsWordInstalled( )
		{
			// Source: "How to Check Whether Word is Installed in the System or Not" by Tadit Dash
			// https://www.codeproject.com/Tips/689968/How-to-Check-Whether-Word-is-Installed-in-the-Syst
			return ( Type.GetTypeFromProgID( "Word.Application" ) != null );
		}


		/// <summary>
		/// Writes a table of all available encoding names and their codepages, sorted by name, to the console.
		/// </summary>
		/// <returns>Always 0</returns>
		static int ListEncodings( )
		{
			try
			{
				Console.Clear( );
			}
			catch ( IOException )
			{
				// Console.Clear( ) throws an IO exception if the output is redirected
			}

			// Sort by name; the codepage is taken straight from EncodingInfo,
			// so there is no need to look up each encoding by name again
			EncodingInfo[] allencodings = Encoding.GetEncodings( ).OrderBy( enc => enc.Name ).ToArray( );

			// The first column is at least as wide as its header
			int columnwidth = 8;
			foreach ( EncodingInfo enc in allencodings )
			{
				columnwidth = Math.Max( columnwidth, enc.Name.Length );
			}

			string format = "{0,-" + columnwidth + "} {1}";
			Console.WriteLine( format, "Encoding", "CodePage" );
			Console.WriteLine( format, "========", "========" );
			foreach ( EncodingInfo enc in allencodings )
			{
				Console.WriteLine( format, enc.Name, enc.CodePage );
			}
			return 0;
		}


		/// <summary>
		/// Fallback reader for .DOC files: reads the file as text and collects
		/// runs of at least 20 characters that look like plain text into plaintext.
		/// </summary>
		/// <param name="docfile">Path of the .DOC file</param>
		/// <returns>False if the file could not be read, was empty or contained no candidate text runs; otherwise true</returns>
		static bool ReadDocFile( string docfile )
		{
			string doccontent;
			try
			{
				using ( StreamReader sr = new StreamReader( docfile, false ) )
				{
					doccontent = sr.ReadToEnd( ).Trim( '\n', '\t', ' ' );
				}
			}
			catch ( IOException )
			{
				ShowHelp( "Access to file \"{0}\" denied", docfile );
				return false;
			}
			catch ( UnauthorizedAccessException )
			{
				ShowHelp( "Access to file \"{0}\" denied", docfile );
				return false;
			}

			if ( doccontent.Length == 0 )
			{
				return false;
			}

			// Ignore everything from the "[Content_Types]" marker onwards
			int marker = doccontent.IndexOf( "[Content_Types]", StringComparison.Ordinal );
			if ( marker >= 0 )
			{
				doccontent = doccontent.Substring( 0, marker );
			}

			// Runs of 20 or more characters, none of which is \000, \015, \367 or \377 (octal)
			Regex regex = new Regex( "[^\\000\\015\\367\\377]{20,}" );
			MatchCollection matches = regex.Matches( doccontent );
			if ( matches.Count == 0 )
			{
				return false;
			}

			StringBuilder text = new StringBuilder( );
			foreach ( Match match in matches )
			{
				string matchingtext = match.Value.Trim( '\n', '\t', ' ' );
				// Keep only runs that consist of single-byte (ASCII) characters and do not contain character 4
				if ( Encoding.UTF8.GetByteCount( matchingtext ) == matchingtext.Length && !matchingtext.Contains( (char)4 ) )
				{
					text.Append( matchingtext ).Append( "\n" );
				}
			}
			plaintext = text.ToString( );
			return true;
		}


		/// <summary>
		/// Fallback reader for .DOCX and .ODT files: opens the document as a ZIP archive,
		/// reads the XML file holding the text content, and strips the XML tags.
		/// The result is stored in plaintext; the encoding named in the XML declaration is stored in xmlencoding.
		/// </summary>
		/// <param name="docfile">Path of the .DOCX or .ODT file</param>
		/// <returns>True if the content XML file was found and read; otherwise false</returns>
		static bool ReadDocxOrOdtFile( string docfile )
		{
			string contentfile;
			string ext = Path.GetExtension( docfile ).ToLowerInvariant( );
			if ( ext == ".odt" ) // OpenOffice document
			{
				contentfile = "content.xml";
			}
			else if ( ext == ".docx" ) // MS Office document
			{
				contentfile = "document.xml";
			}
			else
			{
				return false;
			}

			string content;
			try
			{
				// Open document as ZIP file and read the XML file containing the text content
				// directly from the archive, so no temporary file is left behind on errors
				using ( ZipArchive archive = ZipFile.OpenRead( docfile ) )
				{
					ZipArchiveEntry contententry = null;
					foreach ( ZipArchiveEntry entry in archive.Entries )
					{
						if ( entry.Name.Equals( contentfile, StringComparison.OrdinalIgnoreCase ) )
						{
							// no break: if several entries match, the last one is used
							contententry = entry;
						}
					}

					if ( contententry == null )
					{
						return false;
					}

					using ( StreamReader sr = new StreamReader( contententry.Open( ) ) )
					{
						content = sr.ReadToEnd( ).Trim( '\n', '\r', '\t', ' ' );
					}
				}
			}
			catch ( IOException )
			{
				ShowHelp( "Access to file \"{0}\" denied", docfile );
				return false;
			}
			catch ( UnauthorizedAccessException )
			{
				ShowHelp( "Access to file \"{0}\" denied", docfile );
				return false;
			}
			catch ( InvalidDataException )
			{
				// the file is not a valid ZIP archive
				return false;
			}

			// The first 100 characters of the extracted XML usually contain its encoding;
			// this encoding will be used if the /D command line switch was used
			Regex regex = new Regex( " encoding=\"([^\"]+)\"" );
			xmlencoding = regex.Match( content, 0, Math.Min( 100, content.Length ) ).Groups[1].Value;

			// insert newlines after headers, list items and paragraphs
			regex = new Regex( "</(text|w):(h|p)>" );
			plaintext = regex.Replace( content, "\n\n" );
			regex = new Regex( "<w:br/>" );
			plaintext = regex.Replace( plaintext, "\n\n" );

			// remove all XML tags
			regex = new Regex( "<[^>]+>" );
			plaintext = regex.Replace( plaintext, "" );

			// decode XML entities (&amp; &lt; &#233; etc.) AFTER the tags have been removed,
			// so decoded angle brackets cannot be mistaken for tags
			plaintext = System.Net.WebUtility.HtmlDecode( plaintext );

			return true;
		}


		/// <summary>
		/// Fallback reader for .RTF files: lets a RichTextBox control convert the RTF to plain text,
		/// which is stored in plaintext.
		/// </summary>
		/// <param name="rtffile">Path of the .RTF file</param>
		/// <returns>False if the file could not be read or is not valid RTF; otherwise true</returns>
		static bool ReadRTFFile( string rtffile )
		{
			// Use a hidden RichTextBox to convert RTF to plain text, by Wendy Zang
			// https://social.msdn.microsoft.com/Forums/vstudio/en-US/6e56af9b-d7d3-49f3-9ec4-80edde3fe54b/reading-modifying-rtf-files?forum=csharpgeneral#a64345e9-cfcb-43be-ab18-c08fae02cb2a
			try
			{
				using ( RichTextBox rtbox = new RichTextBox( ) )
				{
					rtbox.Rtf = File.ReadAllText( rtffile );
					plaintext = rtbox.Text;
				}
			}
			catch ( IOException )
			{
				return false;
			}
			catch ( UnauthorizedAccessException )
			{
				return false;
			}
			catch ( ArgumentException )
			{
				// RichTextBox.Rtf rejects text that is not valid RTF
				return false;
			}
			return true;
		}


		/// <summary>
		/// Reads the document using Microsoft Word (via COM interop) and stores its text in plaintext.
		/// </summary>
		/// <param name="wordfile">Path of the document; may be any format Word can open</param>
		/// <returns>True if Word opened the document and returned its text; false on any error</returns>
		static bool ReadWordFile( string wordfile )
		{
			Word.Application wordapp = null;
			object savechanges = Word.WdSaveOptions.wdDoNotSaveChanges;
			bool success = false;
			try
			{
				// Created inside the try block, so a failure to start Word
				// makes the caller fall back to the built-in readers
				wordapp = new Word.Application( );
				wordapp.Visible = false;
				// Word resolves relative paths against its own working directory, not ours
				string fullpath = Path.GetFullPath( wordfile );
				Word.Document worddoc = wordapp.Documents.Open( fullpath );
				wordapp.Selection.WholeStory( );
				plaintext = worddoc.Content.Text;
				worddoc.Close( ref savechanges );
				success = true;
			}
			catch ( Exception )
			{
				success = false;
			}
			finally
			{
				if ( wordapp != null )
				{
					try
					{
						wordapp.Quit( ref savechanges );
					}
					catch ( Exception )
					{
						// Word could not be closed cleanly; nothing more can be done here,
						// and the result of the extraction is not affected
					}
				}
			}
			return success;
		}


		/// <summary>
		/// Fallback reader for WordPerfect (.WPD) files: strips (most of) the file header
		/// and translates the remaining characters to plain text, which is stored in plaintext.
		/// </summary>
		/// <param name="wpfile">Path of the .WPD file</param>
		/// <returns>True if any non-whitespace text was extracted; otherwise false</returns>
		static bool ReadWPDFile( string wpfile )
		{
			string wpcontent;
			try
			{
				wpcontent = File.ReadAllText( wpfile, Encoding.UTF8 );
			}
			catch ( IOException )
			{
				return false;
			}
			catch ( UnauthorizedAccessException )
			{
				return false;
			}

			// Remove (most of) the WPD file header - WARNING: regex pattern depends on Encoding used to read the file!
			Regex regex = new Regex( "^[\\w\\W]*\\000{8,}([^\\w]+[B-HJ-NP-TV-Z\\d])*[^\\w-]+", RegexOptions.IgnoreCase );
			wpcontent = regex.Replace( wpcontent, "" );

			// WPD file format info based on http://justsolve.archiveteam.org/wiki/WordPerfect
			// Modified for spaces, linefeeds and e acute by yours truly
			// More modifications are required for accented characters
			StringBuilder text = new StringBuilder( );
			bool skip = false;
			int resume = -1;
			foreach ( char c in wpcontent )
			{
				int i = (int)c;

				if ( skip )
				{
					// inside a skipped sequence: it ends at the next occurrence of the code that started it
					if ( i == resume )
					{
						skip = false;
						resume = -1;
					}
					continue;
				}

				if ( i == 63 || i == 128 || i == 160 || i == 65533 )
				{
					text.Append( ' ' );
				}
				else if ( i >= 169 && i != 172 && i <= 174 )
				{
					text.Append( '-' );
				}
				else if ( i == 10 || i == 13 || i == 208 )
				{
					text.Append( Environment.NewLine );
				}
				else if ( i >= 192 && i <= 236 )
				{
					// start of a sequence to be skipped, up to and including the next occurrence of the same code
					skip = true;
					resume = i;
				}
				else if ( i == 15 )
				{
					// e acute
					text.Append( (char)233 );
				}
				else if ( i <= 31 || ( i >= 129 && i <= 159 ) || ( i >= 161 && i <= 168 ) || i == 172 || ( i >= 175 && i <= 191 ) || ( i >= 237 && i <= 255 ) )
				{
					// control characters, ignore
				}
				else
				{
					text.Append( c );
				}
			}

			plaintext = text.ToString( );
			return !string.IsNullOrWhiteSpace( plaintext );
		}


		/// <summary>
		/// Writes an optional error message followed by the help text to the standard error stream.
		/// </summary>
		/// <param name="errmsg">
		/// Optional: a composite format string, followed by the values for its placeholders
		/// </param>
		/// <returns>Always 2</returns>
		static int ShowHelp( params string[] errmsg )
		{
			#region Help Text

			/*
			Word2Txt,  Version 1.05
			Extract plain text from a Word document and send it to the screen

			Usage:   Word2Txt  "wordfile"  [ encoding | /D ]

			or:      Word2Txt  /E

			Where:   wordfile   is the path of the Word document to be read
			                    (no wildcards allowed)
			         encoding   force use of alternative encoding for plain
			                    text, e.g. UTF-8 to preserve accented characters
			                    or IBM437 to convert unicode quotes to ASCII
			         /D         use the encoding specified in the document file
			                    (for .DOCX and .ODT only, if Word isn't available)
			         /E         list all available encodings

			Notes:   If a "regular" (MSI based) Microsoft Word (2007 or later)
			         installation is detected, this program will use Word to read the
			         text from the Word file, which may be ANY file format recognized
			         by Word.
			         If Word was already active when this program is started, any other
			         opened document(s) will be left alone, and only the document opened
			         by this program will be closed.
			         If Word is not available, or if it encounters unreadable content
			         (i.e. the file is corrupted), the text can still be extracted, but
			         only from .DOC, .DOCX, .ODT, .RTF and .WPD files.
			         If the specified encoding does not match any available encoding name,
			         the program will try again, ignoring dashes; if that does not provide
			         a match, the program will try matching the specified encoding with
			         the available encodings' codepages.
			         This program requires .NET 4.5.
			         Return code ("errorlevel") 0 means Word encountered no errors and
			         some text was extracted from the file; 1 means Word is not available
			         or the file was corrupted; 2 means either command line errors or the
			         program failed to extract any text.

			Written by Rob van der Woude
			https://www.robvanderwoude.com
			*/

			#endregion Help Text


			#region Error Message

			if ( errmsg.Length > 0 )
			{
				// The first element is the format string, the remaining elements are its arguments
				object[] errargs = errmsg.Skip( 1 ).Cast<object>( ).ToArray( );
				Console.Error.WriteLine( );
				Console.ForegroundColor = ConsoleColor.Red;
				Console.Error.Write( "ERROR:\t" );
				Console.ForegroundColor = ConsoleColor.White;
				Console.Error.WriteLine( errmsg[0], errargs );
				Console.ResetColor( );
			}

			#endregion Error Message


			#region Display Help Text

			Console.Error.WriteLine( );
			Console.Error.WriteLine( "Word2Txt, Version {0}", progver );
			Console.Error.WriteLine( "Extract plain text from a Word document and send it to the screen" );
			Console.Error.WriteLine( );
			Console.Error.Write( "Usage: " );
			Console.ForegroundColor = ConsoleColor.White;
			Console.Error.WriteLine( "Word2Txt \"wordfile\" [ encoding | /D ]" );
			Console.ResetColor( );
			Console.Error.WriteLine( );
			Console.Error.Write( "or: " );
			Console.ForegroundColor = ConsoleColor.White;
			Console.Error.WriteLine( "Word2Txt /E" );
			Console.ResetColor( );
			Console.Error.WriteLine( );
			Console.Error.Write( "Where: " );
			Console.ForegroundColor = ConsoleColor.White;
			Console.Error.Write( "wordfile" );
			Console.ResetColor( );
			Console.Error.WriteLine( " is the path of the Word document to be read" );
			Console.Error.WriteLine( " (no wildcards allowed)" );
			Console.ForegroundColor = ConsoleColor.White;
			Console.Error.Write( " encoding" );
			Console.ResetColor( );
			Console.Error.WriteLine( " force use of alternative encoding for plain" );
			Console.Error.Write( " text, e.g. " );
			Console.ForegroundColor = ConsoleColor.White;
			Console.Error.Write( "UTF-8" );
			Console.ResetColor( );
			Console.Error.WriteLine( " to preserve accented characters" );
			Console.Error.Write( " or " );
			Console.ForegroundColor = ConsoleColor.White;
			Console.Error.Write( "IBM437" );
			Console.ResetColor( );
			Console.Error.WriteLine( " to convert unicode quotes to ASCII" );
			Console.ForegroundColor = ConsoleColor.White;
			Console.Error.Write( " /D" );
			Console.ResetColor( );
			Console.Error.WriteLine( " use the encoding specified in the document file" );
			Console.Error.WriteLine( " (for .DOCX and .ODT only, if Word isn't available)" );
			Console.ForegroundColor = ConsoleColor.White;
			Console.Error.Write( " /E" );
			Console.ResetColor( );
			Console.Error.WriteLine( " list all available encodings" );
			Console.Error.WriteLine( );
			Console.Error.WriteLine( "Notes: If a \"regular\" (MSI based) Microsoft Word (2007 or later)" );
			Console.Error.WriteLine( " installation is detected, this program will use Word to read the" );
			Console.Error.WriteLine( " text from the Word file, which may be ANY file format recognized" );
			Console.Error.WriteLine( " by Word." );
			Console.Error.WriteLine( " If Word was already active when this program is started, any other" );
			Console.Error.WriteLine( " opened document(s) will be left alone, and only the document opened" );
			Console.Error.WriteLine( " by this program will be closed." );
			Console.Error.WriteLine( " If Word is not available, or if it encounters unreadable content" );
			Console.Error.WriteLine( " (i.e. the file is corrupted), the text can still be extracted, but" );
			Console.Error.WriteLine( " only from .DOC, .DOCX, .ODT, .RTF and .WPD files." );
			Console.Error.WriteLine( " If the specified encoding does not match any available encoding name," );
			Console.Error.WriteLine( " the program will try again, ignoring dashes; if that does not provide" );
			Console.Error.WriteLine( " a match, the program will try matching the specified encoding with" );
			Console.Error.WriteLine( " the available encodings' codepages." );
			Console.Error.WriteLine( " This program requires .NET 4.5." );
			Console.Error.WriteLine( " Return code (\"errorlevel\") 0 means Word encountered no errors and" );
			Console.Error.WriteLine( " some text was extracted from the file; 1 means Word is not available" );
			Console.Error.WriteLine( " or the file was corrupted; 2 means either command line errors or the" );
			Console.Error.WriteLine( " program failed to extract any text." );
			Console.Error.WriteLine( );
			Console.Error.WriteLine( "Written by Rob van der Woude" );
			Console.Error.WriteLine( "https://www.robvanderwoude.com" );

			#endregion Display Help Text


			return 2;
		}
	}
}
// page last modified: 2025-10-11