Initial commit
This commit is contained in:
commit
866fc4bb28
22 changed files with 3839 additions and 0 deletions
63
scripts/gen_meta.py
Normal file
63
scripts/gen_meta.py
Normal file
|
@ -0,0 +1,63 @@
|
|||
"""Generate an HTML gallery of rendered CDF elements with their masks.

Reads newline-delimited CDF JSON records from ``cdfs.json`` and prints a
static HTML page to stdout: one horizontally scrollable flex row per
design, one column per element (render, its mask, and the element JSON),
ending with the bare background.
"""
import json
import cv2
import pprint
import os.path as osp

from cdf_parser import CdfParser

# Page head: flex-row layout, one scrollable row per design.
print("""
<html>
<head>
<style>
.row {
display: flex;
flex-direction: row;
# flex-wrap: wrap;
justify-content: start;
align-items: center;
overflow-x: auto;
margin-bottom: 16px;
align-items: start;
}
.column {
flex: 25%;
padding: 0 1px;
}
</style>
</head>
<body>
""")


# Base URL for rendered assets; "{}" is replaced with the blob file name,
# the query string is the container SAS token.
path = "https://internblob.blob.core.windows.net/v-lixinyang/canva-render-11.30/{}?sp=racwdli&st=2023-09-17T15:37:58Z&se=2023-12-31T23:37:58Z&spr=https&sv=2022-11-02&sr=c&sig=u%2FPbZ4fNttAPeLj0NEEpX0eIgFcjhot%2Bmy3iGd%2BCmxk%3D"
with open("cdfs.json", "r") as f:
    for line in f:
        cdf = json.loads(line)
        # Renamed from `id`, which shadowed the builtin.
        folder_id = cdf['rendered_folder']
        cdf_parser = CdfParser(cdf['content'], folder_id)
        elements = cdf_parser.get_elements()
        print('<div class="row">')
        # Reverse so the full composite render comes first; simplified from
        # the redundant `[e for e in elements[::-1]]` copy.
        elements = elements[::-1]
        for index, element in enumerate(elements):
            # Column 0 shows the full render; column k shows the design with
            # the top k-1 elements peeled off.
            name = 'full' if index == 0 else f"({index - 1})"
            # NOTE(review): replacing a space with two spaces has no visible
            # effect in HTML — presumably this was "&nbsp;" before the source
            # was run through an HTML view; confirm against the original file.
            element_text = json.dumps(element, indent=2).replace("\n", "<br/>").replace(" ", " "*2)
            print(f"""
<div class="column">
<img src="{path.format(folder_id + f"-{name}.png")}" alt="image" style="width: 300px;">
<br/>
<img src="{path.format(folder_id + f"-({index})-mask.png")}" alt="image" style="width: 300px;">
<p style="word-wrap: break-word; max-height: 300px; max-width: 300px; overflow: auto;"> {element_text} </p>
</div>
""")
        # Final column: the bare background (all elements removed).
        print(f"""
<div class="column">
<img src="{path.format(folder_id + f"-({len(elements) - 1}).png")}" alt="image" style="width: 300px;">
<br/>
<p style="word-wrap: break-word; max-height: 300px; overflow: auto;"> Background </p>
</div>
""")
        print('</div>')

print("""
</body>
</html>
""")
|
22
scripts/jobgen.py
Normal file
22
scripts/jobgen.py
Normal file
File diff suppressed because one or more lines are too long
174
scripts/post_processing.py
Normal file
174
scripts/post_processing.py
Normal file
|
@ -0,0 +1,174 @@
|
|||
import logging
|
||||
import asyncio
|
||||
import time
|
||||
from collections import Counter
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
from azure.storage.blob.aio import BlobServiceClient, download_blob_from_url
|
||||
from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorCursor, AsyncIOMotorCollection, AsyncIOMotorDatabase
|
||||
|
||||
AZUREBLOB_SAS_URL = "https://internblob.blob.core.windows.net/v-lixinyang/?sp=racwdli&st=2023-09-17T15:37:58Z&se=2023-12-31T23:37:58Z&spr=https&sv=2022-11-02&sr=c&sig=u%2FPbZ4fNttAPeLj0NEEpX0eIgFcjhot%2Bmy3iGd%2BCmxk%3D"
|
||||
CONTAINER = "canva-render-10.19"
|
||||
MONGODB_URI = "mongodb://localhost:27017/canva"
|
||||
|
||||
class BlobAsync(object):
    """Async helpers for reading and writing blobs in the render container.

    Every method opens a fresh ``BlobServiceClient`` from the module-level
    SAS URL, so instances hold no state and are safe to share.
    """

    async def readall(self, blob):
        """Download *blob* and return its raw bytes, or ``None`` if absent."""
        blob_service_client = BlobServiceClient(AZUREBLOB_SAS_URL)
        async with blob_service_client:
            container_client = blob_service_client.get_container_client(CONTAINER)
            blob_client = container_client.get_blob_client(blob)

            if not await blob_client.exists():
                return None
            stream = await blob_client.download_blob()

            # BUG FIX: StorageStreamDownloader.readall() is a coroutine; the
            # original returned it un-awaited, handing callers a coroutine
            # object instead of bytes.
            return await stream.readall()

    async def open_image(self, blob: str):
        """Download *blob* and decode it as a BGR image; ``None`` if absent."""
        async with BlobServiceClient(AZUREBLOB_SAS_URL) as blob_service_client:
            container_client = blob_service_client.get_container_client(CONTAINER)
            blob_client = container_client.get_blob_client(blob)

            if not await blob_client.exists():
                return None
            stream = await blob_client.download_blob()

            buf = np.frombuffer(await stream.readall(), dtype=np.uint8)
            image = cv2.imdecode(buf, cv2.IMREAD_COLOR)

            await blob_client.close()
            await container_client.close()

            return image

    async def upload_image(self, blob, image):
        """PNG-encode *image* and upload it to *blob*, overwriting any existing blob.

        Raises:
            ValueError: if the image cannot be PNG-encoded.
        """
        async with BlobServiceClient(AZUREBLOB_SAS_URL) as blob_service_client:
            # Instantiate a new ContainerClient
            container_client = blob_service_client.get_container_client(CONTAINER)
            blob_client = container_client.get_blob_client(blob)
            is_success, buffer = cv2.imencode('.png', image)
            if not is_success:
                # BUG FIX: the encode result was previously ignored, so a
                # failed encode could upload a bogus buffer.
                raise ValueError(f"failed to PNG-encode image for blob {blob!r}")

            await blob_client.upload_blob(data=buffer.tobytes(), overwrite=True)
            await blob_client.close()
            await container_client.close()
|
||||
|
||||
async def get_mask(img1, img2):
    """Return ``(threshold, binary_mask)`` of pixels that differ between the renders.

    Assume img1 and img2 are exactly the same, except text areas.

    Raises:
        ValueError: if the two images do not have matching shapes/types
            (``cv2.absdiff`` rejects the pair).
    """
    try:
        diff = cv2.absdiff(img1, img2)
    except cv2.error as e:
        # Narrowed from a bare `except:` so unrelated failures (KeyboardInterrupt,
        # typos) are not swallowed; chain the cause for debugging.
        raise ValueError("img1 and img2 are not the same size") from e
    mask = cv2.cvtColor(diff, cv2.COLOR_RGBA2GRAY)
    # Pixels differing by more than 10 intensity levels count as text.
    thresh, binmask = cv2.threshold(mask, 10, 255, cv2.THRESH_BINARY)
    return thresh, binmask
|
||||
|
||||
async def filter_mask_size(mask, thresh=0.4):
    """Return True when *mask* should be rejected for covering too much area.

    Args:
        mask: single-channel mask array of shape (H, W).
        thresh: maximum allowed fraction of non-zero pixels (default 0.4).

    Returns:
        True if more than ``thresh`` of the pixels are non-zero.
    """
    # np.count_nonzero is equivalent to cv2.countNonZero for single-channel
    # arrays and keeps this check pure-numpy.
    non_zero_pixels = int(np.count_nonzero(mask))
    total_pixels = mask.shape[0] * mask.shape[1]
    # Simplified from an if/else returning literal True/False.
    return non_zero_pixels > total_pixels * thresh
|
||||
|
||||
# Global tally of outcomes (success / each rejection reason); printed by main().
mask_filtered_count = Counter()


async def process_cdf(blob: BlobAsync, collection, cdf):
    """Produce and upload the text mask for one CDF document.

    Downloads the ``t=true`` and ``t=false`` renders of the design, diffs
    them into a binary mask, rejects masks that fail the filters, uploads
    ``mask.png`` on success, and records the outcome on the Mongo document
    (``last_mask_render`` is -1 on failure, a timestamp on success).
    """
    folder = cdf["rendered_folder"]
    # Download both renders concurrently; the TaskGroup joins them on exit,
    # so .result() below is safe.
    async with asyncio.TaskGroup() as g:
        task1 = g.create_task(blob.open_image(f"{folder}/t=true.png"))
        task2 = g.create_task(blob.open_image(f"{folder}/t=false.png"))
    img1, img2 = task1.result(), task2.result()
    # Missing renders: mark the document failed with a reason that records
    # which render(s) were absent.
    if img1 is None and img2 is None:
        mask_filtered_count["not found"] += 1
        await collection.update_one({"_id": cdf["_id"]}, {"$set": {"last_mask_render": -1, "failed_reason": "not found both"}})
        return
    if img1 is None:
        mask_filtered_count["not found"] += 1
        await collection.update_one({"_id": cdf["_id"]}, {"$set": {"last_mask_render": -1, "failed_reason": "not found t=true"}})
        return
    if img2 is None:
        mask_filtered_count["not found"] += 1
        await collection.update_one({"_id": cdf["_id"]}, {"$set": {"last_mask_render": -1, "failed_reason": "not found t=false"}})
        return

    try:
        binary_thresh, mask = await get_mask(img1, img2)
    except ValueError as e:
        # get_mask raises ValueError when the two renders differ in size.
        await collection.update_one({"_id": cdf["_id"]}, {"$set": {"last_mask_render": -1, "failed_reason": "size not match"}})
        mask_filtered_count["size not match"] += 1
        return

    # Each filter coroutine returns True when the mask should be REJECTED;
    # the paired string is the failure reason written to the document.
    mask_filters = [
        (filter_mask_size, "mask too small")
    ]
    tasks = list()

    # Run all filters concurrently, then inspect results in order.
    async with asyncio.TaskGroup() as g:
        for f, reason in mask_filters:
            tasks.append((g.create_task(f(mask)), reason))
    for task, reason in tasks:
        if task.result():
            mask_filtered_count[reason] += 1
            # $unset last_fetched so the document can be re-queued later.
            await collection.update_one({"_id": cdf["_id"]}, {"$set": {"last_mask_render": -1, "failed_reason": reason}, "$unset": {"last_fetched": -1}})
            return
    await blob.upload_image(f"{folder}/mask.png", mask)
    # Success: stamp the render time so the poller's query skips this doc.
    await collection.update_one({"_id": cdf["_id"]}, {"$set": {"last_mask_render": time.time()}})
    mask_filtered_count["success"] += 1
|
||||
|
||||
def _pending_cdf_filter():
    """Build the MongoDB filter matching CDFs that still need a mask render.

    Rebuilt on every call because it embeds the current time; previously this
    20-line query literal was duplicated inside main().
    """
    return {
        '$or': [
            {'$and': [
                {'rendered_folder': {'$exists': True}},
                {'last_fetched': {'$gt': 1697688216}},
                # Only pick documents whose fetch settled >10 minutes ago.
                {'last_fetched': {'$lt': time.time() - 600}},
                {'last_mask_render': {'$exists': False}},
            ]},
            {'$and': [
                {'last_fetched': {'$gt': 1697998932}},
                # Matches missing OR failed (-1) renders.
                {'last_mask_render': {'$not': {'$gt': 0}}},
            ]},
        ]
    }


async def _fetch_pending_cdfs(collection, limit=200):
    """Fetch up to *limit* pending CDF documents and close the cursor."""
    cdf_cursor: AsyncIOMotorCursor = collection.find(_pending_cdf_filter(), batch_size=400)
    cdf_list = await cdf_cursor.to_list(length=limit)
    await cdf_cursor.close()
    return cdf_list


async def main():
    """Poll MongoDB for pending CDFs and process them in batches until drained."""
    client = AsyncIOMotorClient(MONGODB_URI)
    db = client.get_database("canva")
    collection = db["cdf"]

    # The Azure SDK logs every request at INFO; quieten it.
    logger = logging.getLogger('azure.mgmt.resource')
    logger.setLevel(logging.WARNING)
    blob = BlobAsync()

    cdf_list = await _fetch_pending_cdfs(collection)
    # BUG FIX: the original looped `while cdf_list is not []`, an identity
    # comparison against a fresh list literal that is always True, so the
    # loop never terminated. Truthiness correctly stops on an empty batch.
    while cdf_list:
        async with asyncio.TaskGroup() as g:
            for cdf in cdf_list:
                g.create_task(process_cdf(blob, collection, cdf))
        # Brief pause so freshly-updated documents age out of the query.
        await asyncio.sleep(10)
        cdf_list = await _fetch_pending_cdfs(collection)
    print(mask_filtered_count)


if __name__ == "__main__":
    asyncio.run(main())
|
33
scripts/text_mask_from_all_mask.py
Normal file
33
scripts/text_mask_from_all_mask.py
Normal file
|
@ -0,0 +1,33 @@
|
|||
import argparse
|
||||
import json
|
||||
from re import L
|
||||
import cv2
|
||||
import pprint
|
||||
import os.path as osp
|
||||
from cdf_parser import CdfParser
|
||||
from threading import Lock
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
# Shared append-mode output file; all worker threads write extracted records here.
output_f = open("cdfs_with_masks.json", "a+")
# Intended to serialize writes to output_f across worker threads.
output_lock = Lock()
|
||||
|
||||
def get_text_mask(line):
    """Parse one CDF JSON line, extract its text layer, and append the result.

    Replaces the heavyweight ``content`` field with the extracted
    ``text_layer`` list before writing the record to the shared output file.
    Runs on worker threads, so the shared-file write is lock-protected.
    """
    data = json.loads(line)
    cdf_parser = CdfParser(data['content'], data['rendered_folder'])
    elements = cdf_parser.get_texts()
    data["text_layer"] = elements
    # Drop the raw content to keep the output file small.
    del data["content"]
    # BUG FIX: output_lock was declared at module level but never acquired;
    # concurrent executor threads could interleave writes to the shared file.
    with output_lock:
        output_f.write(json.dumps(data) + "\n")
|
||||
|
||||
|
||||
def main(cdf_file):
    """Extract text layers for every CDF line in *cdf_file* on a thread pool.

    BUG FIX: the original created a new ThreadPoolExecutor per line inside
    the loop; each `with` block joins its single task before the next line is
    read, so all work ran serially. One pool wrapping the loop restores the
    intended 24-way parallelism.
    """
    with ThreadPoolExecutor(max_workers=24) as executor:
        with open(cdf_file, 'r') as f:
            for line in f:
                executor.submit(get_text_mask, line)
|
||||
|
||||
if __name__ == "__main__":
    # CLI entry point: --cdf points at the newline-delimited CDF JSON file.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--cdf", type=str, required=True)
    parsed = arg_parser.parse_args()
    main(parsed.cdf)
|
65
scripts/vis.py
Normal file
65
scripts/vis.py
Normal file
|
@ -0,0 +1,65 @@
|
|||
"""Render a comparison HTML page for sampled CDFs.

Reads newline-delimited JSON records from ``sample_cdfs.json`` and prints a
static HTML page to stdout: per record, a row of candidate images followed
by a row of metadata (category, caption, tags, texts).
"""
import json
from collections import Counter


# SAS token appended to every blob URL.
SAS = "sp=racwdli&st=2023-09-17T15:37:58Z&se=2023-12-31T23:37:58Z&spr=https&sv=2022-11-02&sr=c&sig=u%2FPbZ4fNttAPeLj0NEEpX0eIgFcjhot%2Bmy3iGd%2BCmxk%3D" # e.g sp=r&st=...

with open("sample_cdfs.json", "r") as f:
    data = []
    for line in f:
        # BUG FIX: the original called json.load(f) inside the per-line loop,
        # which consumes the file handle being iterated instead of parsing
        # the current line.
        data.append(json.loads(line))

print("""
<html>
<head>
<style>
.row {
display: flex;
flex-direction: row;
justify-content: start;
align-items: center;
margin-bottom: 16px;
}
.column {
flex: 15%;
padding: 0 16px;
}
</style>
</head>
<body>
""")

for index, item in enumerate(data):
    # NOTE: replace xxxxxxxxx with SAS!
    file = f"https://internblob.blob.core.windows.net/v-lixinyang/canva-render-11.30/{item['rendered_folder']}-()?{SAS}"
    # NOTE(review): `index % 1 == 0` is always true (one row per item);
    # presumably the modulus was meant to be a grid width — confirm.
    if index % 1 == 0:
        print('<div class="row">')
    # NOTE(review): gt_file / if_file / sdxl_file are not defined anywhere in
    # this script and raise NameError here; they must be assigned (or their
    # columns removed) before this script can run. TODO confirm intent.
    print(f"""
<div class="column">
<img src="{gt_file}" alt="image" style="max-width: 100%;">
</div>
<div class="column">
<img src="{file}" alt="image" style="max-width: 100%;">
</div>
<div class="column">
<img src="{if_file}" alt="image" style="max-width: 100%;">
</div>
<div class="column">
<img src="{sdxl_file}" alt="image" style="max-width: 100%;">
</div>
""")
    if index % 1 == 0:
        print('</div>')
        # Metadata row for the record just rendered.
        print(f"""
<div class="row">
<p>Index: {index} <br>
Category: {item["category"]} <br>
{item["caption"]} <br>
Tags: {item["tags"]} <br>
Texts: {item["texts"]}</p>
</div>
""")

print("""
</body>
</html>
""")
|
Loading…
Add table
Add a link
Reference in a new issue