// Source code of docx2txt.cs
using System;
using System.Collections.Generic;
using System.IO;
using System.IO.Compression;
using System.Text;
using System.Text.RegularExpressions;
namespace RobvanderWoude
{
    /// <summary>
    /// Command line tool that prints the plain text content of a Word .DOCX or
    /// OpenOffice .ODT file, by reading the XML part stored inside the document's
    /// ZIP container and stripping its markup.
    /// </summary>
    internal class Docx2Txt
    {
        static readonly string progver = "1.01";

        // Characters trimmed from both ends of the raw XML and of the final text
        static readonly char[] trimchars = "\n\r\t ".ToCharArray( );


        /// <summary>
        /// Program entry point: parses the command line, extracts the document
        /// text and writes it to the console.
        /// </summary>
        /// <param name="args">Command line arguments: docfile [ encoding | /D ], or /E, or /?</param>
        /// <returns>0 on success; 1 if help was shown or an error occurred.</returns>
        static int Main( string[] args )
        {
            string document = string.Empty;
            string docext = string.Empty;
            Encoding encoding = null;
            bool usexmlencoding = false;

            #region Parse Command Line

            if ( args.Length == 0 || args.Length > 2 )
            {
                return ShowHelp( );
            }

            foreach ( string arg in args )
            {
                // StartsWith instead of arg[0]: an empty argument ("") must not
                // raise an IndexOutOfRangeException
                if ( arg.StartsWith( "/", StringComparison.Ordinal ) )
                {
                    if ( arg == "/?" )
                    {
                        return ShowHelp( );
                    }
                    else if ( arg.StartsWith( "/D", StringComparison.OrdinalIgnoreCase ) )
                    {
                        usexmlencoding = true;
                    }
                    else if ( string.Equals( arg, "/E", StringComparison.OrdinalIgnoreCase ) )
                    {
                        return ListEncodings( );
                    }
                    else
                    {
                        return ShowHelp( "Invalid command line switch {0}", arg );
                    }
                }
                else
                {
                    if ( string.IsNullOrWhiteSpace( document ) )
                    {
                        // first non-switch argument is the document
                        document = arg;
                        if ( !File.Exists( document ) )
                        {
                            return ShowHelp( "File \"{0}\" not found", document );
                        }
                        docext = Path.GetExtension( document ).ToLowerInvariant( );
                        if ( docext != ".docx" && docext != ".odt" )
                        {
                            return ShowHelp( "This program can extract text from .DOCX and .ODT files only" );
                        }
                    }
                    else if ( encoding == null )
                    {
                        // second non-switch argument is the output encoding
                        encoding = GetEncoding( arg );
                        if ( encoding == null )
                        {
                            return ShowHelp( "Invalid encoding \"{0}\"", arg );
                        }
                    }
                    else
                    {
                        return ShowHelp( "Too many command line arguments" );
                    }
                }
            }

            if ( string.IsNullOrWhiteSpace( document ) )
            {
                return ShowHelp( );
            }

            #endregion Parse Command Line


            #region Extract Text

            string content = ReadContentXml( document, docext );
            if ( content == null )
            {
                return ShowHelp( "An error occurred while trying to read \"{0}\"", document );
            }

            #endregion Extract Text


            // The first 100 characters of the extracted XML usually contain its encoding;
            // this encoding will be used if the /D command line switch was used.
            // The search length is capped at the content length because Regex.Match
            // throws if the requested range extends past the end of the input.
            Regex regex = new Regex( " encoding=\"([^\"]+)\"" );
            string xmlencoding = regex.Match( content, 0, Math.Min( 100, content.Length ) ).Groups[1].Value;

            string plaintext = CleanupText( content );


            #region Display Text

            if ( usexmlencoding )
            {
                encoding = GetEncoding( xmlencoding );
            }

            if ( encoding == null )
            {
                // send text to console using default output encoding
                Console.WriteLine( plaintext );
            }
            else
            {
                // temporarily change output encoding and send text to console;
                // the finally block restores the encoding even if writing fails
                Encoding oldencoding = Console.OutputEncoding;
                try
                {
                    Console.OutputEncoding = encoding;
                    Console.WriteLine( plaintext );
                }
                finally
                {
                    Console.OutputEncoding = oldencoding;
                }
            }

            #endregion Display Text


            return 0;
        }


        /// <summary>
        /// Opens the document as a ZIP archive and returns the trimmed text of the
        /// XML part that holds the document body.
        /// </summary>
        /// <param name="document">Path of the .docx or .odt file.</param>
        /// <param name="docext">Lower case file extension, ".docx" or ".odt".</param>
        /// <returns>The XML text, or null if the part was not found or the file could not be read.</returns>
        static string ReadContentXml( string document, string docext )
        {
            string contentpath;
            if ( docext == ".odt" ) // OpenOffice document
            {
                contentpath = "content.xml";
            }
            else if ( docext == ".docx" ) // MS Office document
            {
                contentpath = "word/document.xml";
            }
            else
            {
                return null;
            }
            string contentname = Path.GetFileName( contentpath );

            try
            {
                using ( ZipArchive archive = ZipFile.OpenRead( document ) )
                {
                    // Prefer the entry at the exact expected path; fall back to the
                    // first entry with a matching file name, so that same-named parts
                    // in sub-folders (e.g. embedded objects) cannot replace the main one.
                    ZipArchiveEntry match = null;
                    foreach ( ZipArchiveEntry entry in archive.Entries )
                    {
                        if ( string.Equals( entry.FullName, contentpath, StringComparison.OrdinalIgnoreCase ) )
                        {
                            match = entry;
                            break;
                        }
                        if ( match == null && string.Equals( entry.Name, contentname, StringComparison.OrdinalIgnoreCase ) )
                        {
                            match = entry;
                        }
                    }

                    if ( match == null )
                    {
                        return null;
                    }

                    // Read straight from the archive; no temporary file is needed
                    using ( StreamReader reader = new StreamReader( match.Open( ) ) )
                    {
                        return reader.ReadToEnd( ).Trim( trimchars );
                    }
                }
            }
            catch ( InvalidDataException )
            {
                // not a valid ZIP archive
                return null;
            }
            catch ( IOException )
            {
                return null;
            }
            catch ( UnauthorizedAccessException )
            {
                return null;
            }
        }


        /// <summary>
        /// Converts the document XML to plain text: paragraph, header and line break
        /// tags become blank lines, all other tags are removed, and XML character
        /// entities are decoded.
        /// </summary>
        /// <param name="content">The XML text of the document body.</param>
        /// <returns>The trimmed plain text.</returns>
        static string CleanupText( string content )
        {
            // insert newlines after headers, list items and paragraphs
            string plaintext = Regex.Replace( content, "</(text|w):(h|p)>", "\n\n" );
            plaintext = Regex.Replace( plaintext, "<w:br/>", "\n\n" );

            // remove all XML tags
            plaintext = Regex.Replace( plaintext, "<[^>]+>", "" );

            // decode entities (&amp; &lt; &#8220; ...) AFTER the tags are gone, so that
            // escaped angle brackets in the text are not mistaken for markup
            plaintext = System.Net.WebUtility.HtmlDecode( plaintext );

            // convert stray carriage returns to carriage return/linefeed pairs
            return ConvertStrayCarriageReturns( plaintext ).Trim( trimchars );
        }


        /// <summary>
        /// Replaces every carriage return (\r) that is NOT followed by a linefeed (\n)
        /// with Environment.NewLine.
        /// </summary>
        /// <param name="text">The text to convert.</param>
        /// <returns>The converted text.</returns>
        static string ConvertStrayCarriageReturns( string text )
        {
            return Regex.Replace( text, "\r(?!\n)", Environment.NewLine );
        }


        /// <summary>
        /// Looks up an encoding among those returned by Encoding.GetEncodings( ):
        /// first by name, then by name ignoring dashes, then by codepage number.
        /// Name comparisons ignore case.
        /// </summary>
        /// <param name="myencoding">Encoding name (e.g. "UTF-8") or codepage number (e.g. "437").</param>
        /// <returns>The matching encoding, or null if none matches or the argument is null or empty.</returns>
        static Encoding GetEncoding( string myencoding )
        {
            if ( string.IsNullOrEmpty( myencoding ) )
            {
                return null;
            }

            // Get a list of available encodings
            EncodingInfo[] encodings = Encoding.GetEncodings( );

            // Try correctly spelled encodings first
            foreach ( EncodingInfo encoding in encodings )
            {
                if ( string.Equals( encoding.Name, myencoding, StringComparison.OrdinalIgnoreCase ) )
                {
                    return Encoding.GetEncoding( encoding.CodePage );
                }
            }

            // No direct match found, try again, ignoring dashes
            string nodashes = myencoding.Replace( "-", "" );
            foreach ( EncodingInfo encoding in encodings )
            {
                if ( string.Equals( encoding.Name.Replace( "-", "" ), nodashes, StringComparison.OrdinalIgnoreCase ) )
                {
                    return Encoding.GetEncoding( encoding.CodePage );
                }
            }

            // Still no match, try codepages
            foreach ( EncodingInfo encoding in encodings )
            {
                if ( encoding.CodePage.ToString( ) == myencoding )
                {
                    return Encoding.GetEncoding( encoding.CodePage );
                }
            }

            // Still no match, giving up
            return null;
        }


        /// <summary>
        /// Writes a table of all available encoding names and their codepages,
        /// sorted by name, to standard output.
        /// </summary>
        /// <returns>Always 0.</returns>
        static int ListEncodings( )
        {
            try
            {
                Console.Clear( );
            }
            catch ( IOException )
            {
                // Console.Clear( ) throws an IO exception if the output is redirected
            }

            // Sort a copy of the encoding list by name; each EncodingInfo already
            // carries its codepage, so no second lookup by name is required
            List<EncodingInfo> allencodings = new List<EncodingInfo>( Encoding.GetEncodings( ) );
            allencodings.Sort( ( a, b ) => string.Compare( a.Name, b.Name, StringComparison.CurrentCulture ) );

            // The name column is at least as wide as its header
            int columnwidth = 8;
            foreach ( EncodingInfo enc in allencodings )
            {
                columnwidth = Math.Max( columnwidth, enc.Name.Length );
            }

            Console.WriteLine( "{0,-" + columnwidth + "} {1}", "Encoding", "CodePage" );
            Console.WriteLine( "{0,-" + columnwidth + "} {1}", "========", "========" );
            foreach ( EncodingInfo enc in allencodings )
            {
                Console.WriteLine( "{0,-" + columnwidth + "} {1}", enc.Name, enc.CodePage );
            }

            return 0;
        }


        /// <summary>
        /// Writes text to standard error in white, then resets the console colors.
        /// </summary>
        /// <param name="text">The text to write.</param>
        /// <param name="newline">True to terminate the line after the text.</param>
        static void WriteHighlighted( string text, bool newline )
        {
            Console.ForegroundColor = ConsoleColor.White;
            if ( newline )
            {
                Console.Error.WriteLine( text );
            }
            else
            {
                Console.Error.Write( text );
            }
            Console.ResetColor( );
        }


        /// <summary>
        /// Writes an optional error message followed by the help text to standard error.
        /// </summary>
        /// <param name="errmsg">
        /// Optional: a composite format string, followed by the values for its placeholders.
        /// </param>
        /// <returns>Always 1, so callers can use it directly as the program's return code.</returns>
        static int ShowHelp( params string[] errmsg )
        {
            #region Error Message

            if ( errmsg.Length > 0 )
            {
                // everything after the first element is a format argument
                object[] errargs = new object[errmsg.Length - 1];
                Array.Copy( errmsg, 1, errargs, 0, errargs.Length );

                Console.Error.WriteLine( );
                Console.ForegroundColor = ConsoleColor.Red;
                Console.Error.Write( "ERROR:\t" );
                Console.ForegroundColor = ConsoleColor.White;
                Console.Error.WriteLine( errmsg[0], errargs );
                Console.ResetColor( );
            }

            #endregion Error Message


            #region Help Text

            /*
            Docx2Txt.exe, Version 1.01
            Return the plain text content of a Word .DOCX or OpenOffice .ODT file
            without requiring Word or OpenOffice

            Usage: Docx2Txt.exe docfile [ encoding | /D ]

            or: Docx2Txt.exe /E

            Where: docfile is the path of the file to be read (no wildcards,
             only .docx and .odt extension allowed)
             encoding is the output encoding, e.g. UTF-8 to preserve
             Unicode characters, or IBM437 to convert Unicode
             doublequotes to ASCII
             /D use the encoding specified in the document file
             /E list all available encodings

            Notes: If the specified encoding does not match any available encoding
             name, the program will try again, ignoring dashes; if that does
             not provide a match, the program will try matching the specified
             encoding with the available encodings' codepages.
             This program requires .NET 4.5.
             Return code ("errorlevel") 1 in case of errors, 0 on success.

            Written by Rob van der Woude
            https://www.robvanderwoude.com
            */

            #endregion Help Text


            #region Display Help Text

            // NOTE(review): the column alignment inside these literals may have been
            // collapsed when the source was copied - confirm against the original file

            Console.Error.WriteLine( );

            Console.Error.WriteLine( "Docx2Txt.exe, Version {0}", progver );

            Console.Error.WriteLine( "Return the plain text content of a Word .DOCX or OpenOffice .ODT file" );

            Console.Error.WriteLine( "without requiring Word or OpenOffice" );

            Console.Error.WriteLine( );

            Console.Error.Write( "Usage: " );
            WriteHighlighted( "Docx2Txt.exe docfile [ encoding | /D ]", true );

            Console.Error.WriteLine( );

            Console.Error.Write( "or: " );
            WriteHighlighted( "Docx2Txt.exe /E", true );

            Console.Error.WriteLine( );

            Console.Error.Write( "Where: " );
            WriteHighlighted( "docfile", false );
            Console.Error.WriteLine( " is the path of the file to be read (no wildcards," );

            Console.Error.WriteLine( " only .docx and .odt extension allowed)" );

            WriteHighlighted( " encoding", false );
            Console.Error.Write( " is the output encoding, e.g. " );
            WriteHighlighted( "UTF-8", false );
            Console.Error.WriteLine( " to preserve" );

            Console.Error.Write( " Unicode characters, or " );
            WriteHighlighted( "IBM437", false );
            Console.Error.WriteLine( " to convert Unicode" );

            Console.Error.WriteLine( " doublequotes to ASCII" );

            WriteHighlighted( " /D", false );
            Console.Error.WriteLine( " use the encoding specified in the document file" );

            WriteHighlighted( " /E", false );
            Console.Error.WriteLine( " list all available encodings" );

            Console.Error.WriteLine( );

            Console.Error.WriteLine( "Notes: If the specified encoding does not match any available encoding" );

            Console.Error.WriteLine( " name, the program will try again, ignoring dashes; if that does" );

            Console.Error.WriteLine( " not provide a match, the program will try matching the specified" );

            Console.Error.WriteLine( " encoding with the available encodings' codepages." );

            Console.Error.WriteLine( " This program requires .NET 4.5." );

            Console.Error.WriteLine( " Return code (\"errorlevel\") 1 in case of errors, 0 on success." );

            Console.Error.WriteLine( );

            Console.Error.WriteLine( "Written by Rob van der Woude" );

            Console.Error.WriteLine( "https://www.robvanderwoude.com" );

            #endregion Display Help Text


            return 1;
        }
    }
}