edit a translation file in blocks
I have a script that lets you filter certain blocks of a file (where a block is delimited by two consecutive newlines) and put those blocks into a separate file. You can then work on the newly created file and merge it back into the original.
This can be very useful if you have a PO file that is thousands of lines long, but you only need to edit 20 blocks of it. Once you have extracted the blocks, you can easily pass them to an AI to do the job for you, instead of waiting for the whole content of the original file to be written back to you.
Here is how it works:
# filter by msgstr ""
python tool.py --action split --input messages.po --output filtered.po --filter "msgstr \"\""
# merge the file back into a new file merged.po
python tool.py --action merge --input filtered.po --output merged.po --original messages.po
# override the original
cp merged.po messages.po
And here is the script:
#!/usr/bin/env python3
import os
import json
import argparse
class FileBlockProcessor:
    """Split a text file into delimiter-separated blocks, filter them, and merge edits back.

    Typical use: extract the blocks of a PO file whose ``msgstr`` is empty,
    edit them in a small side file, then merge the edited blocks back into
    the original at their recorded positions.  Block positions are carried
    between the two steps in a ``<output>.metadata`` JSON file, so split and
    merge can run in separate processes.
    """

    def __init__(self, delimiter="\n\n", filter_criterion="msgstr \"\""):
        """
        Args:
            delimiter: Block separator, written with LF newlines; it is
                translated to the file's actual line ending at processing time.
            filter_criterion: Substring a block must contain to be extracted.
        """
        self.delimiter = delimiter
        self.filter_criterion = filter_criterion
        self.block_map = {}          # index -> original block text (populated by split_file)
        self.filtered_indices = []   # original indices of the blocks that matched the filter
        self.line_ending = None      # '\r\n' or '\n', detected from the input file

    def detect_line_ending(self, content):
        """Return '\\r\\n' if *content* contains any CRLF sequence, else '\\n'."""
        return '\r\n' if '\r\n' in content else '\n'

    def split_file(self, input_file, output_file):
        """Extract the blocks of *input_file* matching the filter into *output_file*.

        Also writes ``<output_file>.metadata`` (JSON) recording the delimiter,
        the original indices of the extracted blocks, the total block count,
        and the detected line ending -- everything merge_file() needs later.

        Returns:
            True on success, False on any error (the error is printed).
        """
        try:
            # Resolve symbolic links so we read the real file.
            real_input_file = os.path.realpath(input_file)

            # Read as bytes so the original line endings survive untouched.
            with open(real_input_file, 'rb') as f:
                content = f.read().decode('utf-8')

            self.line_ending = self.detect_line_ending(content)
            # The configured delimiter uses LF; adapt it to the file's ending.
            delimiter = self.delimiter.replace('\n', self.line_ending)

            blocks = content.split(delimiter)

            # Record every block and collect the matching ones in one pass.
            filtered_blocks = []
            for i, block in enumerate(blocks):
                self.block_map[i] = block
                if self.filter_criterion in block:
                    filtered_blocks.append(block)
                    self.filtered_indices.append(i)

            real_output_file = os.path.realpath(output_file)
            # dirname is '' for a bare filename; makedirs('') would raise.
            out_dir = os.path.dirname(real_output_file)
            if out_dir:
                os.makedirs(out_dir, exist_ok=True)

            # Binary write preserves the detected line endings exactly.
            with open(real_output_file, 'wb') as f:
                f.write(delimiter.join(filtered_blocks).encode('utf-8'))

            # Metadata lets a later (possibly separate-process) merge restore
            # the blocks to their original positions.
            metadata_file = f"{real_output_file}.metadata"
            with open(metadata_file, 'w', encoding='utf-8') as f:
                json.dump({
                    "delimiter": self.delimiter,
                    "filtered_indices": self.filtered_indices,
                    "total_blocks": len(blocks),
                    "line_ending": "CRLF" if self.line_ending == '\r\n' else "LF",
                    "original_file": input_file,        # path as given (may be a symlink)
                    "real_input_file": real_input_file  # resolved path
                }, f)

            print(f"Split file successfully. {len(filtered_blocks)} blocks extracted to {output_file}")
            print(f"Metadata saved to {metadata_file}")
            return True
        except Exception as e:
            print(f"Error splitting file: {e}")
            return False

    def merge_file(self, original_file, edited_file, output_file):
        """Merge the edited blocks back into the original file's content.

        Reads ``<edited_file>.metadata`` (written by split_file) to learn the
        delimiter, line ending, and the original index of each edited block,
        then writes the reconstructed content to *output_file*.

        Returns:
            True on success, False on any error (the error is printed).
        """
        try:
            real_original_file = os.path.realpath(original_file)
            real_edited_file = os.path.realpath(edited_file)
            real_output_file = os.path.realpath(output_file)

            metadata_file = f"{real_edited_file}.metadata"
            if not os.path.exists(metadata_file):
                raise FileNotFoundError(f"Metadata file {metadata_file} not found. Cannot merge without metadata.")
            with open(metadata_file, 'r', encoding='utf-8') as f:
                metadata = json.load(f)

            # Restore the line ending recorded at split time.
            self.line_ending = '\r\n' if metadata.get("line_ending") == "CRLF" else '\n'
            delimiter = metadata["delimiter"].replace('\n', self.line_ending)

            # Read edited content in binary mode to preserve line endings.
            with open(real_edited_file, 'rb') as f:
                edited_content = f.read().decode('utf-8')
            edited_blocks = edited_content.split(delimiter)
            if len(edited_blocks) != len(metadata["filtered_indices"]):
                raise ValueError(f"Number of edited blocks ({len(edited_blocks)}) doesn't match original filtered blocks ({len(metadata['filtered_indices'])})")

            # Always read the original from disk.  The read completes before
            # the output is written, so output == original is safe; relying on
            # self.block_map here would break whenever merge runs in a
            # different process than split (block_map would be empty).
            with open(real_original_file, 'rb') as f:
                original_content = f.read().decode('utf-8')
            blocks = original_content.split(delimiter)

            # Drop each edited block back into its recorded slot.
            for idx, original_idx in enumerate(metadata["filtered_indices"]):
                if original_idx < len(blocks):
                    blocks[original_idx] = edited_blocks[idx]

            # dirname is '' for a bare filename; makedirs('') would raise.
            out_dir = os.path.dirname(real_output_file)
            if out_dir:
                os.makedirs(out_dir, exist_ok=True)
            with open(real_output_file, 'wb') as f:
                f.write(delimiter.join(blocks).encode('utf-8'))

            print(f"Merged file successfully created at {output_file}")
            return True
        except Exception as e:
            print(f"Error merging file: {e}")
            return False
def main():
    """CLI entry point: dispatch to split or merge based on --action.

    Exits with status 1 when the requested operation fails (or when
    --original is missing for merge), so the script can be chained safely
    in shell pipelines -- e.g. `python tool.py --action merge ... && cp ...`.
    Previously the script always exited 0, even on failure.
    """
    parser = argparse.ArgumentParser(description='Process files by splitting into blocks, filtering, and merging')
    parser.add_argument('--action', required=True, choices=['split', 'merge'], help='Action to perform')
    parser.add_argument('--input', required=True, help='Input file path')
    parser.add_argument('--output', required=True, help='Output file path')
    parser.add_argument('--delimiter', default='\n\n', help='Block delimiter (default: two newlines)')
    parser.add_argument('--filter', default='msgstr ""', help='Filter criterion (default: msgstr "")')
    parser.add_argument('--original', help='Original file path (required for merge action)')
    # NOTE(review): store_true with default=True makes this flag a no-op, and
    # the processor always resolves symlinks regardless.  Kept only so that
    # existing invocations passing --follow-symlinks keep working.
    parser.add_argument('--follow-symlinks', action='store_true', default=True,
                        help='Follow symbolic links (default: True)')
    args = parser.parse_args()

    processor = FileBlockProcessor(delimiter=args.delimiter, filter_criterion=args.filter)
    if args.action == 'split':
        ok = processor.split_file(args.input, args.output)
    else:  # 'merge' -- argparse restricts --action to the two choices
        if not args.original:
            print("Error: --original is required for merge action")
            raise SystemExit(1)
        ok = processor.merge_file(args.original, args.input, args.output)
    if not ok:
        raise SystemExit(1)


if __name__ == "__main__":
    main()