Reflection files format

From CCP4 wiki
Revision as of 21:48, 14 July 2019 by Kay (talk | contribs) (→‎LCF)
Jump to navigation Jump to search

CCP4 reflection files are binary files that are usually read by routines from CCP4 libraries. However, sometimes one wants to read them with a different programming language or without linking to the libraries, and it may be fun to work out alternative ways.

MTZ

A standalone Fortran program readmtz.f90 to read two MTZ files (both with at least one amplitude and one phase column), and to calculate the correlation coefficient of their electron densities, is this:

! gfortran -C -fbacktrace readmtz.f90 -o readmtz

MODULE readmtz_mod
! minimal MTZ file reader - Kay Diederichs 11/2018 . According to
! http://www.ccp4.ac.uk/html/mtzformat.html which is not completely correct
! e.g. the reflection data do not start at byte 21=20+1, but at byte 81=4*20+1

      TYPE data
        REAL :: ampli=-1.
        REAL :: phase
      END TYPE data

CONTAINS

SUBROUTINE readmtz(file,nref,refdat,colnames)
      IMPLICIT NONE
      CHARACTER, INTENT(IN) :: file*80
      INTEGER, INTENT(OUT) :: nref
      REAL, ALLOCATABLE :: refdat(:,:)
      CHARACTER*30, ALLOCATABLE :: colnames(:)
! local variables
      INTEGER :: i,ncol,nbatch
      CHARACTER :: string*80,colnam(200)*30
      
      nref=-1
      OPEN(1,FILE=file,ACCESS='STREAM',ACTION='READ') 
! is this BIG_ENDIAN or LITTLE_ENDIAN? we know where first hkl triple is
      IF (ALLOCATED(refdat)) DEALLOCATE(refdat)
      IF (ALLOCATED(colnames)) DEALLOCATE(colnames)
      ALLOCATE(refdat(3,1))
      READ(1,POS=81) refdat
      IF (NINT(refdat(1,1))/=refdat(1,1).OR.NINT(refdat(2,1))/=refdat(2,1).OR.NINT(refdat(3,1))/=refdat(3,1)) THEN
        CLOSE(1)
        OPEN(1,FILE=string,ACCESS='STREAM',ACTION='READ',CONVERT='BIG_ENDIAN')
        PRINT *,'this file seems to be BIG_ENDIAN - was it written on SGI?'
      END IF
      DEALLOCATE(refdat)
      
! where is the header?
      READ(1,POS=5) i    ! PRINT*,'header location:',i
      READ(1,POS=4*i-3)  ! position before header
! read the header records, and print important ones to the screen
      i=0
      DO
        READ(1)string
        IF (string(1:4)=='END ') EXIT ! skip history information and batch orientations
        IF (string(1:8)=='COLUMN H') WRITE(*,'(a)') &
        '#  column name                        type               min               max  set'
        IF (string(1:4)=='CELL'.OR.string(1:6)=='SYMINF') WRITE(*,'(a80)') string
        IF (string(1:6)=='COLUMN') THEN
          i=i+1
          WRITE(*,'(i2,1x,a80)') i,string
          IF (i>200) STOP 'more than 200 columns? www.ccp4.ac.uk/html/mtzformat.html says 200!'
          colnam(i)=string(8:37)
        END IF
        IF (string(1:4)=='NCOL') THEN
          READ(string(5:),*) ncol,nref,nbatch
          PRINT*,'ncol, nref, nbatch=',ncol,nref,nbatch
        END IF
      END DO
      IF (nref==-1) STOP 'could not find NCOL record in header'
      IF (nbatch/=0) STOP 'nbatch must be 0'
      IF (i/=ncol) STOP 'column names and NCOL do not match'
      ALLOCATE(refdat(ncol,nref),colnames(ncol))
      colnames=colnam(:ncol)
      READ(1,POS=81)   ! position of reflection data
      DO i=1,nref      
        READ(1) refdat(:,i)
      END DO
      CLOSE(1)
END SUBROUTINE readmtz

SUBROUTINE copy_given_cols(file,col_ampli,col_phase,refdat,colnames,data3d)
      CHARACTER, INTENT(IN) :: file*80,col_phase*30,col_ampli*30
      REAL, ALLOCATABLE :: refdat(:,:)
      CHARACTER*30, ALLOCATABLE, INTENT(IN) :: colnames(:)
      TYPE(data), ALLOCATABLE :: data3d(:,:,:)
      INTEGER :: i,j,nref,h,k,l,hmin,kmin,lmin,hmax,kmax,lmax,icol_ampli,icol_phase
      
      icol_ampli=0
      DO i=1,SIZE(colnames)
        IF (colnames(i)==col_ampli) THEN
          IF (icol_ampli==0) THEN
            icol_ampli=i
          ELSE
            STOP 'the amplitude column occured more than once!'
          END IF
        END IF
      END DO
      icol_phase=0
      DO i=1,SIZE(colnames)
        IF (colnames(i)==col_phase) THEN
          IF (icol_phase==0) THEN
            icol_phase=i
          ELSE
            STOP 'the phase column occured more than once!'
          END IF
        END IF
      END DO
      WRITE(*,*)'indices of ampli ',TRIM(col_ampli),' and phase ',TRIM(col_phase),' columns:',icol_ampli,icol_phase
      IF (icol_ampli==0) STOP 'could not find ampli column'
      IF (icol_phase==0) STOP 'could not find phase column'
      nref=SIZE(refdat,DIM=2)
      hmin=MINVAL(NINT(refdat(1,:)))
      hmax=MAXVAL(NINT(refdat(1,:)))
      kmin=MINVAL(NINT(refdat(2,:)))
      kmax=MAXVAL(NINT(refdat(2,:)))
      lmin=MINVAL(NINT(refdat(3,:)))
      lmax=MAXVAL(NINT(refdat(3,:)))
      ALLOCATE(data3d(hmin:hmax,kmin:kmax,lmin:lmax))
      j=0
      DO i=1,nref
        h=NINT(refdat(1,i))
        k=NINT(refdat(2,i))
        l=NINT(refdat(3,i))
        IF (isnan(refdat(icol_phase,i)).OR.isnan(refdat(icol_ampli,i))) CYCLE
        j=j+1
        IF (refdat(icol_ampli,i)<0.) THEN
          PRINT*,i,refdat(icol_ampli,i)
          STOP 'error: amplitude <0 ?'
        END IF
        data3d(h,k,l)%phase=refdat(icol_phase,i)
        data3d(h,k,l)%ampli=refdat(icol_ampli,i)
      END DO
      PRINT*,'number with ampli=NaN or phase =NaN',nref-j
      PRINT*,''
END SUBROUTINE copy_given_cols

END MODULE readmtz_mod
      
PROGRAM main
! read two files, and calculate the correlation coefficient based on given columns
      USE readmtz_mod
      IMPLICIT NONE   
      REAL, PARAMETER :: deg2rad=ATAN(1.)/45.
      REAL, ALLOCATABLE :: refdat(:,:)
      CHARACTER :: file*80,arg*80,col_phase*30,col_ampli*30
      CHARACTER*30, ALLOCATABLE :: colnames(:)
      TYPE(data), ALLOCATABLE :: d1(:,:,:),d2(:,:,:)
      INTEGER :: i,j,nref,h,k,l,hmin,kmin,lmin,hmax,kmax,lmax
      REAL :: cc_zaehler,cc_nenner,delphi
      
      IF (COMMAND_ARGUMENT_COUNT()==0) STOP 'usage: readmtz file1 ampli_nam phase_nam file2 ampli_nam phase_nam'
      
! read first MTZ file
      CALL GET_COMMAND_ARGUMENT(1,file)  
      WRITE(*,*) TRIM(file)
      CALL readmtz(file,nref,refdat,colnames)
      CALL GET_COMMAND_ARGUMENT(2,col_ampli)  ! expects column name of amplitude
      CALL GET_COMMAND_ARGUMENT(3,col_phase)  ! expects column name of phase
      CALL copy_given_cols(file,col_ampli,col_phase,refdat,colnames,d1)
      
! read second MTZ file
      CALL GET_COMMAND_ARGUMENT(4,file)  
      WRITE(*,*) TRIM(file)
      CALL readmtz(file,nref,refdat,colnames)
      CALL GET_COMMAND_ARGUMENT(5,col_ampli)  ! expects column name of amplitude
      CALL GET_COMMAND_ARGUMENT(6,col_phase)  ! expects column name of phase
      CALL copy_given_cols(file,col_ampli,col_phase,refdat,colnames,d2)
      
! common array bounds
      hmin=MAX(LBOUND(d1,DIM=1),LBOUND(d2,DIM=1))
      hmax=MIN(UBOUND(d1,DIM=1),UBOUND(d2,DIM=1))
      kmin=MAX(LBOUND(d1,DIM=2),LBOUND(d2,DIM=2))
      kmax=MIN(UBOUND(d1,DIM=2),UBOUND(d2,DIM=2))
      lmin=MAX(LBOUND(d1,DIM=3),LBOUND(d2,DIM=3))
      lmax=MIN(UBOUND(d1,DIM=3),UBOUND(d2,DIM=3))
           
! do some calculation:
      cc_zaehler=0.
      cc_nenner=0.
      DO l=lmin,lmax
      DO k=kmin,kmax
      DO h=hmin,hmax
        IF (d1(h,k,l)%ampli<0.OR.d2(h,k,l)%ampli<0.) CYCLE
        delphi=ABS( d1(h,k,l)%phase - d2(h,k,l)%phase )
        IF (delphi>180.) delphi=360.-delphi
        IF (ABS(delphi)>180.) STOP 'assert error delphi'
        cc_zaehler = cc_zaehler + d1(h,k,l)%ampli**2 * COS(delphi*deg2rad)
        cc_nenner  = cc_nenner  + d1(h,k,l)%ampli**2 
      END DO
      END DO
      END DO
      IF (cc_nenner>0.) cc_zaehler = cc_zaehler/cc_nenner
      PRINT*,'cc=',cc_zaehler

END PROGRAM main

LCF

The CCP4 LCF (labelled column format) files were phased-out in the early 90's but some interesting data still lingers on half-inch tapes so, if they can be read, the LCF files can be converted to ASCII or other formats for safe-keeping, or further use. Jonathan Cooper has worked out how to read them, mainly using Python, and documents this.

A Fortran program (lcf_dump) to accomplish this is:

! reads cell from LCF file - Kay Diederichs 7/2019 . 
! TODO - determine ncol, offset from data in file 
! reading VAX format is easy with the ifort compiler since it understands VAX REAL format:
! ifort lcf_dump.f90 -o lcf_dump
IMPLICIT NONE

! LCF items
INTEGER(2) :: header(8)
INTEGER(2), ALLOCATABLE :: refdat(:)
REAL :: cell(6)
CHARACTER :: separator*12 ! length may depend on machine that wrote LCF file

! program variables
INTEGER :: i,ncol,offset
CHARACTER :: string*120

i=COMMAND_ARGUMENT_COUNT()
IF (i/=3) STOP 'usage: lcf_dump <name.LCF> ncol offset'
! read command line
CALL GET_COMMAND_ARGUMENT(1,string)  ! expects LCF filename on command line
! OPEN(CONVERT=...) options are documented at
! https://software.intel.com/en-us/fortran-compiler-developer-guide-and-reference-open-convert-specifier
OPEN(1,FILE=string,ACCESS='STREAM',ACTION='READ',CONVERT='VAXG')
WRITE(*,*) 'LCF file: ',TRIM(string)

CALL GET_COMMAND_ARGUMENT(2,string)  ! expects NCOL
READ(string,*) ncol
ALLOCATE(refdat(ncol))

CALL GET_COMMAND_ARGUMENT(3,string)  ! expects NCOL
READ(string,*) offset
WRITE(*,*) 'ncol, offset:',ncol,offset

! read header and cell
READ(1) header,separator   ! reads 8*2+12 bytes. ends after byte 28
WRITE(*,'(a,8(i0,1x))') ' header: ',header
DO i=1,6
  READ(1) cell(i),separator  ! reads 6*(4+12) bytes. ends after byte 124
END DO
WRITE(*,'(a,6f10.4)') ' cell:',cell

! read column labels and comment
string=''
DO i=1,(offset-124)/16
  READ(1)string(1+(i-1)*4:i*4),separator
END DO
WRITE(*,'(a)') TRIM(string)

! read reflections
READ(1) refdat(1:2) ! needed for positioning - what is their meaning?
DO 
  READ(1) refdat
  IF (refdat(1)==32767) EXIT
  WRITE(*,*) refdat
END DO

END