From 0c4b3e15cf19cb6e4292eb5f653011154c7ddeb3 Mon Sep 17 00:00:00 2001
From: James Krieger <jamesmkrieger@gmail.com>
Date: Tue, 10 Jun 2025 20:18:08 +0100
Subject: [PATCH 1/7] fix and improve writeSTAR

---
 prody/proteins/starfile.py | 370 +++++++++++++++++++++++++++++++++----
 1 file changed, 334 insertions(+), 36 deletions(-)

diff --git a/prody/proteins/starfile.py b/prody/proteins/starfile.py
index 3320bc9b9..0fbf677db 100644
--- a/prody/proteins/starfile.py
+++ b/prody/proteins/starfile.py
@@ -18,7 +18,8 @@
 
 from .emdfile import parseEMD
 
-__all__ = ['parseSTAR', 'writeSTAR', 'parseImagesFromSTAR',
+__all__ = ['parseSTAR', 'writeSTAR', 'writeSTARStream',
+           'parseImagesFromSTAR',
            'StarDict', 'StarDataBlock', 'StarLoop', 
            'parseSTARSection']
 
@@ -457,7 +458,8 @@ def printData(self):
 
 
 def parseSTAR(filename, **kwargs):
-    """Returns a dictionary containing data parsed from a STAR file.
+    """Returns a dictionary of type :class:`.StarDict`
+    containing data parsed from a STAR file.
 
     :arg filename: a filename
         The .star extension can be omitted.
@@ -500,6 +502,32 @@ def parseSTAR(filename, **kwargs):
 
 
 def parseSTARLines(lines, **kwargs):
+    """Returns a dictionary of type :class:`~collections.OrderedDict`
+    containing data parsed from a STAR file along with a program string.
+
+    :arg lines: lines read from a file.
+    :type lines: list
+
+    :arg start: line number for starting
+        Default is **None**, meaning start at the beginning
+    :type start: int, None
+
+    :arg stop: line number for stopping
+        Default is **None**, meaning don't stop.
+    :type stop: int, None
+
+    :arg shlex: whether to use shlex for splitting lines so as to preserve quoted substrings
+        Default is **False**
+    :type shlex: bool
+
+    :arg start_field: field to look for to trigger starting
+        Default is **None**, meaning start at the beginning.
+    :type start_field: str, None
+
+    :arg stop_field: field to look for to trigger stopping
+        Default is **None**, meaning don't stop.
+    :type stop_field: str, None
+    """
     start = kwargs.get('start', None)
     if start is None:
         start = 0
@@ -508,9 +536,216 @@ def parseSTARLines(lines, **kwargs):
     if stop is None:
         stop = len(lines)
 
+    start_field = kwargs.get('start_field', None)
+    stop_field = kwargs.get('stop_field', None)
+
     prog = kwargs.get('prog', None)
     shlex = kwargs.get('shlex', False)
 
+    if start_field is not None:
+        finalDictionary = OrderedDict()
+        currentLoop = -1
+        block_fieldCounter = 0
+        loop_fieldCounter = 0
+        active_fieldCounter = 0
+        dataItemsCounter = 0
+        lineNumber = 0
+        inLoop = False
+        inShortBlock = False
+        for line in lines[start:stop]:
+            if line.startswith('data_'):
+                currentDataBlock = line[5:].strip()
+                finalDictionary[currentDataBlock] = OrderedDict()
+                currentLoop = -1
+                inLoop = False
+                inShortBlock = False
+                startingBlock = True
+                block_fieldCounter = 0
+
+            elif line.startswith('loop_'):
+                currentLoop += 1
+                inLoop = True
+                inShortBlock = False
+                finalDictionary[currentDataBlock][currentLoop] = OrderedDict()
+                finalDictionary[currentDataBlock][currentLoop]['fields'] = OrderedDict()
+                finalDictionary[currentDataBlock][currentLoop]['data'] = OrderedDict()
+                loop_fieldCounter = 0
+
+            elif line.startswith('_') or line.startswith(' _'):
+                # This marks a field identifier
+                currentField = split(line.strip(), shlex=shlex)[0]
+                if currentField == start_field:
+                    break
+
+                if inLoop:
+                    # We expect to only have the field identifier and no data until after
+
+                    if (len(split(line.strip(), shlex=shlex)) == 1
+                        or len(split(line.strip(), shlex=shlex)) == 2):
+                        # This is what we expect for a data loop
+                        finalDictionary[currentDataBlock][
+                            currentLoop]['fields'][loop_fieldCounter] = currentField
+                        dataItemsCounter = 0
+                        loop_fieldCounter += 1
+
+                    else:
+                        # This is contrary to that so we leave the loop
+                        inLoop = False
+
+                        # We populate fields and data together, continuing the regular data block
+                        finalDictionary[currentDataBlock][
+                            'fields'][block_fieldCounter] = currentField
+                        finalDictionary[currentDataBlock][
+                            'data'][currentField] = split(line.strip(),
+                                                                                        shlex=shlex)[1]
+                        block_fieldCounter += 1
+
+                else:
+                    # Outside a loop, populate fields and data together in a regular data block
+                    if startingBlock:
+                        # Initialise the data block first
+                        finalDictionary[currentDataBlock]['fields'] = OrderedDict()
+                        finalDictionary[currentDataBlock]['data'] = OrderedDict()
+                        startingBlock = False
+                        block_fieldCounter = 0
+
+                    finalDictionary[currentDataBlock][
+                        'fields'][block_fieldCounter] = currentField
+
+                    if len(split(line.strip(), shlex=shlex)) > 1:
+                        # This is the usual behaviour so we can fill in the data from the rest of the line
+                        finalDictionary[currentDataBlock][
+                            'data'][currentField] = split(line.strip(),
+                                                                                        shlex=shlex)[1]
+                    else:
+                        # In this case, we will look for the data in a short block over the following line(s).
+                        # If a single field takes multiple lines, these lines start and end with a semi-colon.
+                        # We'll handle that in the data section.
+                        finalDictionary[currentDataBlock]['data'][currentField] = ''
+                        inShortBlock = True
+                        startingShortBlock = True
+
+                    block_fieldCounter += 1
+
+            elif line.strip() == '#':
+                inLoop = False
+                inShortBlock = False
+
+            elif line.strip() == '':
+                pass
+
+            elif inLoop:
+                # Here we handle the data part of the loop.
+                # Data outside a loop is handled in line with the fields above or in shortDataBlocks below.
+                if not inShortBlock and len(split(line, shlex=shlex)) == loop_fieldCounter:
+                    # This is the usual case where each entry in the line corresponds to a field
+                    finalDictionary[currentDataBlock][currentLoop][
+                        'data'][dataItemsCounter] = OrderedDict()
+                    active_fieldCounter = 0
+                    for fieldEntry in split(line.strip(), shlex=shlex):
+                        currentField = finalDictionary[currentDataBlock][
+                            currentLoop]['fields'][active_fieldCounter]
+                        if currentField == start_field:
+                            break
+                        finalDictionary[currentDataBlock][currentLoop][
+                            'data'][dataItemsCounter][currentField] = fieldEntry
+                        active_fieldCounter += 1
+                    dataItemsCounter += 1
+                else:
+                    # The data is now being broken across lines.
+                    if not inShortBlock:
+                        inShortBlock = True
+                        finalDictionary[currentDataBlock][currentLoop][
+                            'data'][dataItemsCounter] = OrderedDict()
+                        active_fieldCounter = 0
+                        if not line.startswith(';'):
+                            # Then we haven't got a split field and can treat fields as normal
+                            inSplitField = False
+                            for fieldEntry in split(line.strip(), shlex=shlex):
+                                currentField = finalDictionary[currentDataBlock][
+                                    currentLoop]['fields'][active_fieldCounter]
+                                if currentField == start_field:
+                                    break
+                                finalDictionary[currentDataBlock][currentLoop]['data'][dataItemsCounter][currentField] = fieldEntry
+                                active_fieldCounter += 1
+                        else:
+                            # We have a single field split over many lines
+                            inSplitField = True
+                            currentField = finalDictionary[currentDataBlock][
+                                currentLoop]['fields'][active_fieldCounter]
+                            if currentField == start_field:
+                                break
+                            finalDictionary[currentDataBlock][currentLoop][
+                                'data'][dataItemsCounter][currentField] = line.strip() + ' '
+                    else:
+                        if not inSplitField:
+                            # check if we are entering one
+                            if line.startswith(';'):
+                                inSplitField = True
+                                currentField = finalDictionary[currentDataBlock][
+                                    currentLoop]['fields'][active_fieldCounter]
+                                if currentField == start_field:
+                                    break
+                                finalDictionary[currentDataBlock][currentLoop][
+                                    'data'][dataItemsCounter][currentField] = line.strip() + ' '
+                            else:
+                                # continue as normal
+                                for fieldEntry in split(line.strip(), shlex=shlex):
+                                    currentField = finalDictionary[currentDataBlock][
+                                        currentLoop]['fields'][active_fieldCounter]
+                                    if currentField == start_field:
+                                        break
+                                    finalDictionary[currentDataBlock][currentLoop][
+                                        'data'][dataItemsCounter][currentField] = fieldEntry
+                                    active_fieldCounter += 1
+                        else:
+                            finalDictionary[currentDataBlock][currentLoop][
+                                'data'][dataItemsCounter][currentField] += line.strip()
+                            if line.strip() == ';':
+                                # This marks the end of the split field
+                                inSplitField = False
+                                active_fieldCounter += 1
+                            else:
+                                # Prepare for the next line
+                                finalDictionary[currentDataBlock][currentLoop][
+                                    'data'][dataItemsCounter][currentField] += ' '
+
+                        if active_fieldCounter == loop_fieldCounter:
+                            inShortBlock = False
+                            dataItemsCounter += 1
+
+            elif inShortBlock:
+                # We can now append the data in the lines here.
+                finalDictionary[currentDataBlock]['data'][currentField] += line.strip()
+                if startingShortBlock:
+                    startingShortBlock = False
+                    if not line.startswith(';'):
+                        # We only expect one line if there's no semi-colon
+                        inShortBlock = False
+                    else:
+                        # Prepare for the next line
+                        finalDictionary[currentDataBlock]['data'][currentField] += ' '
+                else:
+                    if line.strip() == ';':
+                        # This marks the end of the field so we've filled it
+                        inShortBlock = False
+                    else:
+                        # Prepare for the next line
+                        finalDictionary[currentDataBlock]['data'][currentField] += ' '
+
+            elif line.startswith('#'):
+                if line.startswith('# XMIPP'):
+                    prog = 'XMIPP'
+
+            else:
+                raise TypeError('This file does not conform to the STAR file format. '
+                                'There is a problem with line {0}:\n {1}'.format(lineNumber,
+                                                                                line))
+
+            lineNumber += 1
+
+        start = lineNumber - 2
+
     finalDictionary = OrderedDict()
     currentLoop = -1
     block_fieldCounter = 0
@@ -520,6 +755,17 @@ def parseSTARLines(lines, **kwargs):
     lineNumber = 0
     inLoop = False
     inShortBlock = False
+
+    for line in lines[:4]:
+        if line.startswith('data_'):
+            currentDataBlock = line[5:].strip()
+            finalDictionary[currentDataBlock] = OrderedDict()
+            currentLoop = -1
+            inLoop = False
+            inShortBlock = False
+            startingBlock = True
+            block_fieldCounter = 0
+
     for line in lines[start:stop]:
         if line.startswith('data_'):
             currentDataBlock = line[5:].strip()
@@ -542,13 +788,17 @@ def parseSTARLines(lines, **kwargs):
         elif line.startswith('_') or line.startswith(' _'):
             # This marks a field identifier
             currentField = split(line.strip(), shlex=shlex)[0]
+            if currentField == stop_field:
+                break
 
             if inLoop:
                 # We expect to only have the field identifier and no data until after
 
-                if len(split(line.strip(), shlex=shlex)) == 1 or len(split(line.strip(), shlex=shlex)) == 2:
+                if (len(split(line.strip(), shlex=shlex)) == 1
+                    or len(split(line.strip(), shlex=shlex)) == 2):
                     # This is what we expect for a data loop
-                    finalDictionary[currentDataBlock][currentLoop]['fields'][loop_fieldCounter] = currentField
+                    finalDictionary[currentDataBlock][
+                        currentLoop]['fields'][loop_fieldCounter] = currentField
                     dataItemsCounter = 0
                     loop_fieldCounter += 1
 
@@ -557,8 +807,10 @@ def parseSTARLines(lines, **kwargs):
                     inLoop = False
 
                     # We populate fields and data together, continuing the regular data block
-                    finalDictionary[currentDataBlock]['fields'][block_fieldCounter] = currentField
-                    finalDictionary[currentDataBlock]['data'][currentField] = split(line.strip(),
+                    finalDictionary[currentDataBlock][
+                        'fields'][block_fieldCounter] = currentField
+                    finalDictionary[currentDataBlock][
+                        'data'][currentField] = split(line.strip(),
                                                                                     shlex=shlex)[1]
                     block_fieldCounter += 1
 
@@ -571,11 +823,13 @@ def parseSTARLines(lines, **kwargs):
                     startingBlock = False
                     block_fieldCounter = 0
 
-                finalDictionary[currentDataBlock]['fields'][block_fieldCounter] = currentField
+                finalDictionary[currentDataBlock][
+                    'fields'][block_fieldCounter] = currentField
 
                 if len(split(line.strip(), shlex=shlex)) > 1:
                     # This is the usual behaviour so we can fill in the data from the rest of the line
-                    finalDictionary[currentDataBlock]['data'][currentField] = split(line.strip(),
+                    finalDictionary[currentDataBlock][
+                        'data'][currentField] = split(line.strip(),
                                                                                     shlex=shlex)[1]
                 else:
                     # In this case, we will look for the data in a short block over the following line(s).
@@ -599,53 +853,77 @@ def parseSTARLines(lines, **kwargs):
             # Data outside a loop is handled in line with the fields above or in shortDataBlocks below.
             if not inShortBlock and len(split(line, shlex=shlex)) == loop_fieldCounter:
                 # This is the usual case where each entry in the line corresponds to a field
-                finalDictionary[currentDataBlock][currentLoop]['data'][dataItemsCounter] = OrderedDict()
+                finalDictionary[currentDataBlock][currentLoop][
+                    'data'][dataItemsCounter] = OrderedDict()
                 active_fieldCounter = 0
                 for fieldEntry in split(line.strip(), shlex=shlex):
-                    currentField = finalDictionary[currentDataBlock][currentLoop]['fields'][active_fieldCounter]
-                    finalDictionary[currentDataBlock][currentLoop]['data'][dataItemsCounter][currentField] = fieldEntry
+                    currentField = finalDictionary[currentDataBlock][
+                        currentLoop]['fields'][active_fieldCounter]
+                    if currentField == stop_field:
+                        break
+                    finalDictionary[currentDataBlock][currentLoop][
+                        'data'][dataItemsCounter][currentField] = fieldEntry
                     active_fieldCounter += 1
                 dataItemsCounter += 1
             else:
                 # The data is now being broken across lines.
                 if not inShortBlock:
                     inShortBlock = True
-                    finalDictionary[currentDataBlock][currentLoop]['data'][dataItemsCounter] = OrderedDict()
+                    finalDictionary[currentDataBlock][currentLoop][
+                        'data'][dataItemsCounter] = OrderedDict()
                     active_fieldCounter = 0
                     if not line.startswith(';'):
                         # Then we haven't got a split field and can treat fields as normal
                         inSplitField = False
                         for fieldEntry in split(line.strip(), shlex=shlex):
-                            currentField = finalDictionary[currentDataBlock][currentLoop]['fields'][active_fieldCounter]
-                            finalDictionary[currentDataBlock][currentLoop]['data'][dataItemsCounter][currentField] = fieldEntry
+                            currentField = finalDictionary[currentDataBlock][
+                                currentLoop]['fields'][active_fieldCounter]
+                            if currentField == stop_field:
+                                break
+                            finalDictionary[currentDataBlock][currentLoop][
+                                'data'][dataItemsCounter][currentField] = fieldEntry
                             active_fieldCounter += 1
                     else:
                         # We have a single field split over many lines
                         inSplitField = True
-                        currentField = finalDictionary[currentDataBlock][currentLoop]['fields'][active_fieldCounter]
-                        finalDictionary[currentDataBlock][currentLoop]['data'][dataItemsCounter][currentField] = line.strip() + ' '
+                        currentField = finalDictionary[currentDataBlock][
+                            currentLoop]['fields'][active_fieldCounter]
+                        if currentField == stop_field:
+                            break
+                        finalDictionary[currentDataBlock][currentLoop][
+                            'data'][dataItemsCounter][currentField] = line.strip() + ' '
                 else:
                     if not inSplitField:
                         # check if we are entering one
                         if line.startswith(';'):
                             inSplitField = True
-                            currentField = finalDictionary[currentDataBlock][currentLoop]['fields'][active_fieldCounter]
-                            finalDictionary[currentDataBlock][currentLoop]['data'][dataItemsCounter][currentField] = line.strip() + ' '
+                            currentField = finalDictionary[currentDataBlock][
+                                currentLoop]['fields'][active_fieldCounter]
+                            if currentField == stop_field:
+                                break
+                            finalDictionary[currentDataBlock][currentLoop][
+                                'data'][dataItemsCounter][currentField] = line.strip() + ' '
                         else:
                             # continue as normal
                             for fieldEntry in split(line.strip(), shlex=shlex):
-                                currentField = finalDictionary[currentDataBlock][currentLoop]['fields'][active_fieldCounter]
-                                finalDictionary[currentDataBlock][currentLoop]['data'][dataItemsCounter][currentField] = fieldEntry
+                                currentField = finalDictionary[currentDataBlock][
+                                    currentLoop]['fields'][active_fieldCounter]
+                                if currentField == stop_field:
+                                    break
+                                finalDictionary[currentDataBlock][currentLoop][
+                                    'data'][dataItemsCounter][currentField] = fieldEntry
                                 active_fieldCounter += 1
                     else:
-                        finalDictionary[currentDataBlock][currentLoop]['data'][dataItemsCounter][currentField] += line.strip()
+                        finalDictionary[currentDataBlock][currentLoop][
+                            'data'][dataItemsCounter][currentField] += line.strip()
                         if line.strip() == ';':
                             # This marks the end of the split field
                             inSplitField = False
                             active_fieldCounter += 1
                         else:
                             # Prepare for the next line
-                            finalDictionary[currentDataBlock][currentLoop]['data'][dataItemsCounter][currentField] += ' '
+                            finalDictionary[currentDataBlock][currentLoop][
+                                'data'][dataItemsCounter][currentField] += ' '
 
                     if active_fieldCounter == loop_fieldCounter:
                         inShortBlock = False
@@ -676,7 +954,8 @@ def parseSTARLines(lines, **kwargs):
 
         else:
             raise TypeError('This file does not conform to the STAR file format. '
-                            'There is a problem with line {0}:\n {1}'.format(lineNumber, line))
+                            'There is a problem with line {0}:\n {1}'.format(lineNumber,
+                                                                             line))
 
         lineNumber += 1
 
@@ -696,30 +975,49 @@ def writeSTAR(filename, starDict, **kwargs):
         field names and finally data.
     :type starDict: dict
 
+    kwargs can be given including the program style to follow (*prog*)
+    """
+    stream = open(filename, 'w')
+    writeSTARStream(stream, starDict, **kwargs)
+    stream.close()
+
+def writeSTARStream(stream, starDict, **kwargs):
+    """Writes a STAR file from a dictionary containing data
+    such as that parsed from a Relion STAR file.
+
+    :arg stream: an open, writable text stream
+        It is written to but not closed by this function.
+    :type stream: file-like object
+
+    :arg starDict: a dictionary in STAR format
+        This should have nested entries starting with data blocks then loops/tables then
+        field names and finally data.
+    :type starDict: dict
+
     kwargs can be given including the program style to follow (*prog*)
     """
     prog=kwargs.get('prog', 'XMIPP')
+    writeDataBlockTitle = kwargs.get('writeDataBlockTitle', True)
 
-    star = open(filename, 'w')
+    for dataBlock in starDict:
+        dataBlockKey = dataBlock.getTitle()
+        if writeDataBlockTitle:
+            stream.write('\ndata_' + dataBlockKey + '\n')
 
-    for dataBlockKey in starDict:
-        star.write('\ndata_' + dataBlockKey + '\n')
-        for loopNumber in starDict[dataBlockKey]:
-            star.write('\nloop_\n')
+        for loop in starDict[dataBlockKey].loops:
+            loopNumber = int(loop.getTitle().split()[-1])
+            stream.write('\nloop_\n')
             for fieldNumber in starDict[dataBlockKey][loopNumber]['fields']:
                 if prog == 'XMIPP':
-                    star.write(' ')
-                star.write(starDict[dataBlockKey][loopNumber]['fields'][fieldNumber] + '\n')
+                    stream.write(' ')
+                stream.write(starDict[dataBlockKey][loopNumber]['fields'][fieldNumber] + '\n')
             for dataItemNumber in starDict[dataBlockKey][loopNumber]['data']:
                 if prog == 'XMIPP':
-                    star.write('\t')
+                    stream.write('\t')
                 for fieldNumber in starDict[dataBlockKey][loopNumber]['fields']:
                     currentField = starDict[dataBlockKey][loopNumber]['fields'][fieldNumber]
-                    star.write(starDict[dataBlockKey][loopNumber]['data'][dataItemNumber][currentField] + '\t')
-                star.write('\n')
-
-    star.close()
-    return
+                    stream.write(starDict[dataBlockKey][loopNumber]['data'][dataItemNumber][currentField] + '\t')
+                stream.write('\n')
 
 
 def parseImagesFromSTAR(particlesSTAR, **kwargs):

From e50bbe998f12b14a1d30522dc6f08869a5956f9d Mon Sep 17 00:00:00 2001
From: James Krieger <jamesmkrieger@gmail.com>
Date: Tue, 10 Jun 2025 20:18:46 +0100
Subject: [PATCH 2/7] read and write full cif header star

---
 prody/proteins/ciffile.py   | 11 +++++++++--
 prody/proteins/cifheader.py | 17 ++++++++++++++++-
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/prody/proteins/ciffile.py b/prody/proteins/ciffile.py
index 8d7b34343..3d3225c2a 100644
--- a/prody/proteins/ciffile.py
+++ b/prody/proteins/ciffile.py
@@ -15,7 +15,7 @@
 from prody import LOGGER, SETTINGS
 
 from .localpdb import fetchPDB
-from .starfile import parseSTARSection
+from .starfile import parseSTARSection, writeSTARStream
 from .cifheader import getCIFHeaderDict
 from .header import buildBiomolecules, assignSecstr
 
@@ -601,7 +601,14 @@ def writeMMCIF(filename, atoms, csets=None, autoext=True, **kwargs):
         filename += '.cif'
 
     structure = atoms.toBioPythonStructure(header=header, csets=csets)
+
+    filehandle = open(filename, 'w')
+    writeSTARStream(filehandle, header['starDict1'])
     io=MMCIFIO()
     io.set_structure(structure)
-    io.save(filename)
+    io.save(filehandle)
+    writeSTARStream(filehandle, header['starDict2'],
+                    writeDataBlockTitle=False)
+    filehandle.close()
+
     return filename
diff --git a/prody/proteins/cifheader.py b/prody/proteins/cifheader.py
index 8472097e9..eb9cef90d 100644
--- a/prody/proteins/cifheader.py
+++ b/prody/proteins/cifheader.py
@@ -13,7 +13,7 @@
 from .header import (Chemical, Polymer, DBRef, _PDB_DBREF,
                      cleanString)
 
-from .starfile import parseSTARSection
+from .starfile import parseSTARSection, parseSTARLines, StarDict
 
 __all__ = ['parseCIFHeader', 'getCIFHeaderDict']
 
@@ -167,6 +167,21 @@ def getCIFHeaderDict(stream, *keys):
         for poly in header.get('polymers', []):
             poly.pdbentry = pdbid
             header[poly.chid] = poly
+
+        filename = stream.name if hasattr(stream, 'name') else ''
+
+        parsingDict1, prog = parseSTARLines(lines, shlex=True,
+            stop_field='_atom_site.group_PDB')
+        header['starDict1'] = StarDict(
+            parsingDict1, prog, filename)
+
+        parsingDict2, prog = parseSTARLines(lines, shlex=True,
+            start_field='_atom_site.group_PDB')
+        starDict2 = StarDict(
+            parsingDict2, prog, filename)
+        starDict2[0].pop(0)
+        header['starDict2'] = starDict2
+
         return header
 
 

From b9429d94990849835ba315f25046fc9dd64da661 Mon Sep 17 00:00:00 2001
From: James Krieger <jamesmkrieger@gmail.com>
Date: Tue, 10 Jun 2025 20:35:05 +0100
Subject: [PATCH 3/7] fix biopython part

---
 prody/proteins/ciffile.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/prody/proteins/ciffile.py b/prody/proteins/ciffile.py
index 3d3225c2a..6a3d3e95b 100644
--- a/prody/proteins/ciffile.py
+++ b/prody/proteins/ciffile.py
@@ -600,13 +600,13 @@ def writeMMCIF(filename, atoms, csets=None, autoext=True, **kwargs):
     if autoext and not filename.lower().endswith('.cif'):
         filename += '.cif'
 
-    structure = atoms.toBioPythonStructure(header=header, csets=csets)
+    structure = atoms.toBioPythonStructure(csets=csets)
 
     filehandle = open(filename, 'w')
     writeSTARStream(filehandle, header['starDict1'])
     io=MMCIFIO()
     io.set_structure(structure)
-    io.save(filehandle)
+    io.save(filehandle, preserve_atom_numbering=True)
     writeSTARStream(filehandle, header['starDict2'],
                     writeDataBlockTitle=False)
     filehandle.close()

From 75291f1aea49cb9a70b244ce7bd7676858d5de4a Mon Sep 17 00:00:00 2001
From: James Krieger <jamesmkrieger@gmail.com>
Date: Tue, 10 Jun 2025 21:05:37 +0100
Subject: [PATCH 4/7] more mmcif formatting

---
 prody/proteins/starfile.py | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/prody/proteins/starfile.py b/prody/proteins/starfile.py
index 0fbf677db..f3d475101 100644
--- a/prody/proteins/starfile.py
+++ b/prody/proteins/starfile.py
@@ -995,17 +995,48 @@ def writeSTARStream(stream, starDict, **kwargs):
     :type starDict: dict
 
     kwargs can be given including the program style to follow (*prog*)
+
+    :arg writeDataBlockTitle: whether to write the ``data_`` title line of each data block
+        Default is **True**
+    :type writeDataBlockTitle: bool
+
+    :arg writeHashes: whether to write ``#`` separator lines between categories and before loops
+        Default is **False**; forced to **True** when *prog* is **'mmcif'**
+    :type writeHashes: bool
     """
     prog=kwargs.get('prog', 'XMIPP')
     writeDataBlockTitle = kwargs.get('writeDataBlockTitle', True)
+    writeHashes = kwargs.get('writeHashes', False)
+    if prog.lower() == 'mmcif':
+        writeHashes = True
 
     for dataBlock in starDict:
         dataBlockKey = dataBlock.getTitle()
         if writeDataBlockTitle:
             stream.write('\ndata_' + dataBlockKey + '\n')
 
+        if writeHashes:
+            stream.write('#')
+
+        old_prefix = ''
+        for key, value in starDict[dataBlockKey]['data'].items():
+            prefix = key.split('.')[0]
+            if prefix != old_prefix:
+                old_prefix = prefix
+                stream.write('\n')
+                if writeHashes:
+                    stream.write('#')
+            if value.startswith(';'):
+                if 'one_letter_code' in key:
+                    value = '\n'.join(value.split())
+                stream.write('\n%-45s\n%s\n;' % (key, value[:-2]))
+            else:
+                stream.write('\n%-45s%s' % (key, value))
+
         for loop in starDict[dataBlockKey].loops:
             loopNumber = int(loop.getTitle().split()[-1])
+            if writeHashes:
+                stream.write('#')
             stream.write('\nloop_\n')
             for fieldNumber in starDict[dataBlockKey][loopNumber]['fields']:
                 if prog == 'XMIPP':

From 3357099d5e39251a26557f2da789e291a7671b31 Mon Sep 17 00:00:00 2001
From: James Krieger <jamesmkrieger@gmail.com>
Date: Tue, 10 Jun 2025 21:51:30 +0100
Subject: [PATCH 5/7] more mmcif formatting

---
 prody/proteins/starfile.py | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/prody/proteins/starfile.py b/prody/proteins/starfile.py
index f3d475101..5ca3873ff 100644
--- a/prody/proteins/starfile.py
+++ b/prody/proteins/starfile.py
@@ -1047,7 +1047,36 @@ def writeSTARStream(stream, starDict, **kwargs):
                     stream.write('\t')
                 for fieldNumber in starDict[dataBlockKey][loopNumber]['fields']:
                     currentField = starDict[dataBlockKey][loopNumber]['fields'][fieldNumber]
-                    stream.write(starDict[dataBlockKey][loopNumber]['data'][dataItemNumber][currentField] + '\t')
+                    data = starDict[dataBlockKey][loopNumber]['data'][dataItemNumber][currentField]
+                    if len(data.split()) > 1:
+                        data = "'" + data + "'"
+                    if prog == 'XMIPP':
+                        sep = '\t'
+                    else:
+                        sep = ' '
+                    if currentField in ['_entity_poly_seq.num',
+                                        '_pdbx_poly_seq_scheme.seq_id',
+                                        '_pdbx_poly_seq_scheme.ndb_seq_num',
+                                        '_pdbx_poly_seq_scheme.pdb_seq_num']:
+                        stream.write('%-4s' % data)
+                    elif currentField in ['_pdbx_poly_seq_scheme.auth_seq_num',
+                                          '_pdbx_poly_seq_scheme.pdb_mon_id',
+                                          '_pdbx_poly_seq_scheme.auth_mon_id',
+                                          '_pdbx_poly_seq_scheme.mon_id']:
+                        stream.write('%-3s' % data)
+                    elif currentField == '_chem_comp.name':
+                        stream.write('%-15s' % data)
+                    elif currentField == '_chem_comp.formula':
+                        stream.write('%-16s' % data)
+                    elif currentField == '_chem_comp.type':
+                        stream.write('%-19s' % data)
+                    elif currentField == '_entity_src_gen.host_org_common_name':
+                        stream.write('\n')
+                        stream.write(data)
+                    else:
+                        stream.write(data)
+                    stream.write(sep)
+
                 stream.write('\n')
 
 

From f98c4b1461c3423c79d4c41743f1d617692cee65 Mon Sep 17 00:00:00 2001
From: James Krieger <jamesmkrieger@gmail.com>
Date: Tue, 10 Jun 2025 22:39:54 +0100
Subject: [PATCH 6/7] fix for empty starDict2

---
 prody/proteins/ciffile.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/prody/proteins/ciffile.py b/prody/proteins/ciffile.py
index 6a3d3e95b..b3a4c58bd 100644
--- a/prody/proteins/ciffile.py
+++ b/prody/proteins/ciffile.py
@@ -603,12 +603,13 @@ def writeMMCIF(filename, atoms, csets=None, autoext=True, **kwargs):
     structure = atoms.toBioPythonStructure(csets=csets)
 
     filehandle = open(filename, 'w')
-    writeSTARStream(filehandle, header['starDict1'])
+    writeSTARStream(filehandle, header['starDict1'], prog='mmcif')
     io=MMCIFIO()
     io.set_structure(structure)
     io.save(filehandle, preserve_atom_numbering=True)
-    writeSTARStream(filehandle, header['starDict2'],
-                    writeDataBlockTitle=False)
+    if header['starDict2'][0].numEntries() > 0:
+        writeSTARStream(filehandle, header['starDict2'],
+                        writeDataBlockTitle=False, prog='mmcif')
     filehandle.close()
 
     return filename

From 796ebdbf3ce14091fee4733389864350f0e23d7d Mon Sep 17 00:00:00 2001
From: James Krieger <jamesmkrieger@gmail.com>
Date: Wed, 11 Jun 2025 15:40:35 +0100
Subject: [PATCH 7/7] more general fixes

---
 prody/proteins/starfile.py | 36 ++++++++++++++++++++++++++----------
 1 file changed, 26 insertions(+), 10 deletions(-)

diff --git a/prody/proteins/starfile.py b/prody/proteins/starfile.py
index 5ca3873ff..e50e86bd7 100644
--- a/prody/proteins/starfile.py
+++ b/prody/proteins/starfile.py
@@ -1019,19 +1019,35 @@ def writeSTARStream(stream, starDict, **kwargs):
             stream.write('#')
 
         old_prefix = ''
-        for key, value in starDict[dataBlockKey]['data'].items():
+        prefix_keys = OrderedDict([])
+        prefixes = []
+        non_loop_data = starDict[dataBlockKey]['data']
+        for key, value in non_loop_data.items():
             prefix = key.split('.')[0]
             if prefix != old_prefix:
+                prefixes.append(prefix)
                 old_prefix = prefix
-                stream.write('\n')
-                if writeHashes:
-                    stream.write('#')
-            if value.startswith(';'):
-                if 'one_letter_code' in key:
-                    value = '\n'.join(value.split())
-                stream.write('\n%-45s\n%s\n;' % (key, value[:-2]))
-            else:
-                stream.write('\n%-45s%s' % (key, value))
+                prefix_keys[prefix] = []
+            prefix_keys[prefix].append(key)
+
+        for prefix, keys in prefix_keys.items():
+            max_len = max([len(key) for key in keys])
+            if max_len < 31:
+                max_len = 31
+
+            stream.write('\n')
+            if writeHashes:
+                stream.write('#')
+
+            for key in keys:
+                value = non_loop_data[key]
+                if value.startswith(';'):
+                    if 'one_letter_code' in key:
+                        value = '\n'.join(value.split())
+                    stream.write('\n%-45s\n%s\n;' % (key, value[:-2]))
+                else:
+                    stream.write('\n%-45s %s'.replace(
+                        '45', str(max_len)) % (key, value))
 
         for loop in starDict[dataBlockKey].loops:
             loopNumber = int(loop.getTitle().split()[-1])