#############################################
# This script fetches materials data from the Materials Project
# using the mp_api client.
#
# Material structures are converted to ASE format and saved as ASE json
# (one file per material, named "<material_id>.json"). The MP id, formula
# and materials properties (not structures) are written out in csv format.
# This csv can later be merged with features to construct datafiles for ML.
#
# The API key is read from the MP_API_KEY environment variable; it must
# never be committed to the source file.
#
# Prasenjit Sen, July 10, 2025.
#
############################################
from mp_api.client import MPRester
from pymatgen.io.ase import AseAtomsAdaptor
from ase.io import write
import pandas as pd
import json
import csv
import os
import sys


def _format_property(value):
    """Return *value* formatted as ' .5f', or '' when the value is missing.

    Some properties requested from MP (e.g. formation_energy_per_atom,
    energy_above_hull) can be None; formatting None with a float format
    spec raises TypeError, so a missing value becomes an empty csv field.
    """
    if value is None:
        return ''
    return f"{value: .5f}"


# NOTE(security): an API key used to be hard-coded on the next line. A key
# that has been committed should be considered leaked and be regenerated on
# the Materials Project dashboard. If MP_API_KEY is unset, None is passed and
# MPRester falls back to its own key lookup.
with MPRester(api_key=os.environ.get("MP_API_KEY")) as mpr:
    #docs = mpr.summary.search(material_ids=["mp-7"])

    list_of_available_fields = mpr.materials.summary.available_fields
    print('Available fields for materials summary', list_of_available_fields, '\n')

    # Print the field names (not the rester objects themselves), consistent
    # with the summary line above.
    absorption_list = mpr.materials.absorption.available_fields
    print('Fields in absorption data', absorption_list, '\n')

    chem_environment = mpr.materials.chemenv.available_fields
    print('Feilds in chemenv', chem_environment, '\n')

    docs = mpr.materials.summary.search(
        band_gap=(0.5, 1.0),
        num_elements=(1, 5),
        num_sites=(1, 20),
        fields=["material_id", "formula_pretty", "nelements", "nsites",
                "structure", "band_gap", "energy_above_hull",
                'formation_energy_per_atom'],
    )

    # Use the same mpr.materials.* access path as the calls above.
    docs_chemenv = mpr.materials.chemenv.search(
        num_sites=(1, 6),
        num_elements=(1, 3),
        elements=["Si", "O"],
        fields=["nsites", "formula_pretty", "chemsys"],
    )

number_of_materials = len(docs)
print('# materials=', number_of_materials)

# Show one sample entry from each query; guard the indexing so that a small
# or empty result set does not abort the script with an IndexError.
if number_of_materials > 3:
    doc = docs[3]
    print('From summary',' ',doc.formula_pretty,' ',doc.nsites,' ',doc.nelements,'\n')

if docs_chemenv:
    doc = docs_chemenv[0]
    print('From chemenv',' ',doc.nsites,' ',doc.formula_pretty,' ',doc.chemsys)
    #print('Chemenv ', doc)
#sys.exit()

# First row is the csv header; one row per material follows.
rows = [['MP id', 'formula', 'band_gap', 'formation_energy_per_atom', 'energy_above_hull']]

for doc in docs:
    # Save the structure of each material as "<material_id>.json" via ASE.
    atoms = AseAtomsAdaptor.get_atoms(doc.structure)
    atoms.write(f"{doc.material_id}.json")
    #print(doc.formula_pretty, doc.band_gap,doc.energy_above_hull,end='\n')
    rows.append([
        doc.material_id,
        doc.formula_pretty,
        _format_property(doc.band_gap),
        _format_property(doc.formation_energy_per_atom),
        _format_property(doc.energy_above_hull),
    ])

with open('temp.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(rows)

# The following part of the script reads in the .csv file as a Pandas
# DataFrame, splits the entries in the column 'MP id', reorders the rows by
# the numeric part of the id, and writes it back as csv.
# This re-ordering is not essential because when merging two dataframes, they
# can be merged by matching column entries if they both have columns with the
# same id. In this example, both the materials data and feature csv files
# have a column with id "MP id". So correct matching would happen between
# features and materials properties even without this reordering.
MP_df = pd.read_csv('temp.csv', sep=',')
print('Dataframe shape=', MP_df.shape)

# Skip the split/sort when there are no data rows: splitting an empty column
# yields no columns to assign from.
if not MP_df.empty:
    # Split the entries in the first column into prefix and number,
    # e.g. "mp-149" -> "mp", "149". n=1 guarantees exactly two columns.
    MP_df[['mp', 'id']] = MP_df['MP id'].str.split('-', n=1, expand=True)

    # The split produces strings; convert to integers so that the rows are
    # ordered by id value (2 before 10) rather than lexicographically.
    MP_df['id'] = MP_df['id'].astype(int)

    # Reorder the rows according to the numeric ids. The existing dataframe
    # is sorted in place.
    MP_df.sort_values(by='id', inplace=True)

    # Drop the two helper columns 'mp' and 'id' again.
    MP_df.drop(['mp', 'id'], axis=1, inplace=True)

# Write the sorted data to csv. The 'MP id' column is kept so that the file
# can be merged with the feature file on it. index=False ensures that the row
# indices are not written; only the dataframe is written.
MP_df.to_csv('MP_materials_data.csv', index=False)

# Alternative ways of saving the data, kept for reference.
#
#for j in range(number_of_materials):
#    current_doc = docs[j]
#    mpid = current_doc.material_id
#    # Convert documents to dictionaries
#    # Save to JSON
#    with open(mpid+".json", "w") as f:
#        json.dump(current_doc.dict(), f, indent=2)
#
#import pandas as pd
## Convert to DataFrame
#df = pd.DataFrame([doc.dict() for doc in docs])
## Save to CSV
#df.to_csv("mp_1449.csv", index=False)